From bb11814a13ae9327287b966c1f49d40e5b478d5b Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 19 Sep 2025 12:49:39 -0400 Subject: [PATCH 01/15] Remove `Array` impl for `VariantArray` and `ShreddedVariantFieldArray`` --- .../benches/variant_kernels.rs | 2 +- parquet-variant-compute/src/lib.rs | 2 +- parquet-variant-compute/src/variant_array.rs | 427 +++++++++++++----- .../src/variant_array_builder.rs | 2 +- parquet-variant-compute/src/variant_get.rs | 231 +++++----- .../src/variant_to_arrow.rs | 6 +- parquet/tests/variant_integration.rs | 52 +-- 7 files changed, 438 insertions(+), 284 deletions(-) diff --git a/parquet-variant-compute/benches/variant_kernels.rs b/parquet-variant-compute/benches/variant_kernels.rs index 5e97f948b231..3cdb28229b8a 100644 --- a/parquet-variant-compute/benches/variant_kernels.rs +++ b/parquet-variant-compute/benches/variant_kernels.rs @@ -84,7 +84,7 @@ fn benchmark_batch_json_string_to_variant(c: &mut Criterion) { pub fn variant_get_bench(c: &mut Criterion) { let variant_array = create_primitive_variant_array(8192); - let input: ArrayRef = Arc::new(variant_array); + let input = ArrayRef::from(variant_array); let options = GetOptions { path: vec![].into(), diff --git a/parquet-variant-compute/src/lib.rs b/parquet-variant-compute/src/lib.rs index 70fcbdb66f95..26f7d6ca1636 100644 --- a/parquet-variant-compute/src/lib.rs +++ b/parquet-variant-compute/src/lib.rs @@ -45,7 +45,7 @@ mod variant_array_builder; pub mod variant_get; mod variant_to_arrow; -pub use variant_array::{ShreddingState, VariantArray}; +pub use variant_array::{ShreddingState, VariantArray, VariantType}; pub use variant_array_builder::{VariantArrayBuilder, VariantValueArrayBuilder}; pub use cast_to_variant::{cast_to_variant, cast_to_variant_with_options}; diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 4abffa65c23f..3dea8e48b0e4 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -18,36 +18,194 @@ //! [`VariantArray`] implementation use crate::type_conversion::primitive_conversion_single_value; -use arrow::array::{Array, ArrayData, ArrayRef, AsArray, BinaryViewArray, StructArray}; +use arrow::array::{Array, ArrayRef, AsArray, BinaryViewArray, StructArray}; use arrow::buffer::NullBuffer; +use arrow::compute::cast; use arrow::datatypes::{ Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, UInt32Type, UInt64Type, UInt8Type, }; +use arrow_schema::extension::ExtensionType; use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields}; use parquet_variant::Uuid; use parquet_variant::Variant; -use std::any::Any; use std::sync::Arc; +/// Arrow Variant [`ExtensionType`]. +/// +/// Represents the canonical Arrow Extension Type for storing variants. +/// See [`VariantArray`] for more examples of using this extension type. +pub struct VariantType; + +impl ExtensionType for VariantType { + const NAME: &'static str = "parquet.variant"; + + // Variants have no extension metadata + type Metadata = (); + + fn metadata(&self) -> &Self::Metadata { + &() + } + + fn serialize_metadata(&self) -> Option { + None + } + + fn deserialize_metadata(_metadata: Option<&str>) -> Result { + Ok(()) + } + + fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> { + // Note don't check for metadata/value fields here because they may be + // absent in shredded variants + if matches!(data_type, DataType::Struct(_)) { + Ok(()) + } else { + Err(ArrowError::InvalidArgumentError(format!( + "VariantType only supports StructArray, got {}", + data_type + ))) + } + } + + fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> Result { + let new_self = Self; + new_self.supports_data_type(data_type)?; + Ok(new_self) + } +} + /// An array of Parquet [`Variant`] values /// /// A [`VariantArray`] wraps an Arrow [`StructArray`] that stores the underlying /// `metadata` and `value` fields, and adds convenience methods to access -/// the `Variant`s +/// the [`Variant`]s. +/// +/// See [`VariantArrayBuilder`] for constructing `VariantArray` row by row. /// -/// See [`VariantArrayBuilder`] for constructing a `VariantArray`. +/// See the examples below from converting between `VariantArray` and +/// `StructArray`. /// /// [`VariantArrayBuilder`]: crate::VariantArrayBuilder /// -/// # Specification +/// # Documentation /// -/// 1. This code follows the conventions for storing variants in Arrow `StructArray` -/// defined by [Extension Type for Parquet Variant arrow] and this [document]. -/// At the time of this writing, this is not yet a standardized Arrow extension type. +/// At the time of this writing, Variant has been accepted as an official +/// extension type but not been published to the [official list of extension +/// types] on the Apache Arrow website. See the [Extension Type for Parquet +/// Variant arrow] ticket for more details. /// /// [Extension Type for Parquet Variant arrow]: https://github.com/apache/arrow/issues/46908 -/// [document]: https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?usp=sharing +/// [official list of extension types]: https://arrow.apache.org/docs/format/CanonicalExtensions.html +/// +/// # Example: Check if a [`StructArray`] has the [`VariantType`] extension +/// +/// Arrow Arrays only provide [`DataType`], but the extension type information +/// is stored on a [`Field`]. Thus, you must have access to the [`Schema`] or +/// [`Field`] to check for the extension type. +/// +/// [`Schema`]: arrow_schema::Schema +/// ``` +/// # use arrow::array::StructArray; +/// # use arrow_schema::{Schema, Field, DataType}; +/// # use parquet_variant::Variant; +/// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray, VariantType}; +/// # fn get_variant_array() -> VariantArray { +/// # let mut builder = VariantArrayBuilder::new(10); +/// # builder.append_variant(Variant::from("such wow")); +/// # builder.build() +/// # } +/// # fn get_schema() -> Schema { +/// # Schema::new(vec![ +/// # Field::new("id", DataType::Int32, false), +/// # get_variant_array().field("var"), +/// # ]) +/// # } +/// let schema = get_schema(); +/// assert_eq!(schema.fields().len(), 2); +/// // first field is not a Variant +/// assert!(schema.field(0).try_extension_type::().is_err()); +/// // second field is a Variant +/// assert!(schema.field(1).try_extension_type::().is_ok()); +/// ``` +/// +/// # Example: Constructing the correct [`Field`] for a [`VariantArray`] +/// +/// You can construct the correct [`Field`] for a [`VariantArray`] using the +/// [`VariantArray::field`] method. +/// +/// ``` +/// # use arrow_schema::{Schema, Field, DataType}; +/// # use parquet_variant::Variant; +/// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray, VariantType}; +/// # fn get_variant_array() -> VariantArray { +/// # let mut builder = VariantArrayBuilder::new(10); +/// # builder.append_variant(Variant::from("such wow")); +/// # builder.build() +/// # } +/// let variant_array = get_variant_array(); +/// // First field is an integer id, second field is a variant +/// let schema = Schema::new(vec![ +/// Field::new("id", DataType::Int32, false), +/// // call VariantArray::field to get the correct Field +/// variant_array.field("var"), +/// ]); +/// ``` +/// +/// You can also construct the [`Field`] using [`VariantType`] directly +/// +/// ``` +/// # use arrow_schema::{Schema, Field, DataType}; +/// # use parquet_variant::Variant; +/// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray, VariantType}; +/// # fn get_variant_array() -> VariantArray { +/// # let mut builder = VariantArrayBuilder::new(10); +/// # builder.append_variant(Variant::from("such wow")); +/// # builder.build() +/// # } +/// # let variant_array = get_variant_array(); +/// // The DataType of a VariantArray varies depending on how it is shredded +/// let data_type = variant_array.data_type().clone(); +/// // First field is an integer id, second field is a variant +/// let schema = Schema::new(vec![ +/// Field::new("id", DataType::Int32, false), +/// Field::new("var", data_type, false) +/// // Add extension metadata to the field using `VariantType` +/// .with_extension_type(VariantType), +/// ]); +/// ``` +/// +/// # Example: Converting a [`VariantArray`] to a [`StructArray`] +/// +/// ``` +/// # use arrow::array::StructArray; +/// # use parquet_variant::Variant; +/// # use parquet_variant_compute::VariantArrayBuilder; +/// // Create Variant Array +/// let mut builder = VariantArrayBuilder::new(10); +/// builder.append_variant(Variant::from("such wow")); +/// let variant_array = builder.build(); +/// // convert to StructArray +/// let struct_array: StructArray = variant_array.into(); +/// ``` +/// +/// # Example: Converting a [`StructArray`] to a [`VariantArray`] +/// +/// ``` +/// # use arrow::array::StructArray; +/// # use parquet_variant::Variant; +/// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray}; +/// # fn get_struct_array() -> StructArray { +/// # let mut builder = VariantArrayBuilder::new(10); +/// # builder.append_variant(Variant::from("such wow")); +/// # builder.build().into() +/// # } +/// let struct_array: StructArray = get_struct_array(); +/// // try and create a VariantArray from it +/// let variant_array = VariantArray::try_new(&struct_array).unwrap(); +/// assert_eq!(variant_array.value(0), Variant::from("such wow")); +/// ``` +/// #[derive(Debug)] pub struct VariantArray { /// Reference to the underlying StructArray @@ -88,7 +246,11 @@ impl VariantArray { /// int8. /// /// Currently, only [`BinaryViewArray`] are supported. - pub fn try_new(inner: ArrayRef) -> Result { + pub fn try_new(inner: &dyn Array) -> Result { + // Workaround lack of support for Binary + // https://github.com/apache/arrow-rs/issues/8387 + let inner = cast_to_binary_view_arrays(inner)?; + let Some(inner) = inner.as_struct_opt() else { return Err(ArrowError::InvalidArgumentError( "Invalid VariantArray: requires StructArray as input".to_string(), @@ -246,6 +408,67 @@ impl VariantArray { pub fn typed_value_field(&self) -> Option<&ArrayRef> { self.shredding_state.typed_value_field() } + + /// Return a field to represent this VariantArray in a `Schema` with + /// a particular name + pub fn field(&self, name: impl Into) -> Field { + Field::new( + name.into(), + self.data_type().clone(), + self.inner.is_nullable(), + ) + .with_extension_type(VariantType) + } + + /// Returns a new DataType representing this VariantArray's inner type + pub fn data_type(&self) -> &DataType { + self.inner.data_type() + } + + pub fn slice(&self, offset: usize, length: usize) -> Self { + let inner = self.inner.slice(offset, length); + let metadata = self.metadata.slice(offset, length); + let shredding_state = self.shredding_state.slice(offset, length); + Self { + inner, + metadata, + shredding_state, + } + } + + pub fn len(&self) -> usize { + self.inner.len() + } + + pub fn is_empty(&self) -> bool { + self.inner.is_empty() + } + + pub fn nulls(&self) -> Option<&NullBuffer> { + self.inner.nulls() + } + + /// Is the element at index null? + pub fn is_null(&self, index: usize) -> bool { + self.nulls().map(|n| n.is_null(index)).unwrap_or_default() + } + + /// Is the element at index valid (not null)? + pub fn is_valid(&self, index: usize) -> bool { + !self.is_null(index) + } +} + +impl From for StructArray { + fn from(variant_array: VariantArray) -> Self { + variant_array.into_inner() + } +} + +impl From for ArrayRef { + fn from(variant_array: VariantArray) -> Self { + Arc::new(variant_array.into_inner()) + } } /// One shredded field of a partially or prefectly shredded variant. For example, suppose the @@ -318,18 +541,25 @@ impl ShreddedVariantFieldArray { )); }; + let shredding_state = Self::shredding_state_from_struct_array(inner_struct)?; + Ok(Self { + inner: inner_struct.clone(), + shredding_state, + }) + } + + /// Creates a new `ShreddingState` from a [`StructArray`] representing a potentially + /// shredded variant. + pub(crate) fn shredding_state_from_struct_array( + inner_struct: &StructArray, + ) -> Result { // Extract value and typed_value fields (metadata is not expected in ShreddedVariantFieldArray) let value = inner_struct .column_by_name("value") .and_then(|col| col.as_binary_view_opt().cloned()); let typed_value = inner_struct.column_by_name("typed_value").cloned(); - // Note this clone is cheap, it just bumps the ref count - let inner = inner_struct.clone(); - Ok(Self { - inner: inner.clone(), - shredding_state: ShreddingState::try_new(value, typed_value)?, - }) + ShreddingState::try_new(value, typed_value) } /// Return the shredding state of this `VariantArray` @@ -351,59 +581,45 @@ impl ShreddedVariantFieldArray { pub fn inner(&self) -> &StructArray { &self.inner } -} - -impl Array for ShreddedVariantFieldArray { - fn as_any(&self) -> &dyn Any { - self - } - - fn to_data(&self) -> ArrayData { - self.inner.to_data() - } - fn into_data(self) -> ArrayData { - self.inner.into_data() + /// Returns the inner [`StructArray`], consuming self + pub fn into_inner(self) -> StructArray { + self.inner } - fn data_type(&self) -> &DataType { + pub fn data_type(&self) -> &DataType { self.inner.data_type() } - fn slice(&self, offset: usize, length: usize) -> ArrayRef { - let inner = self.inner.slice(offset, length); - let shredding_state = self.shredding_state.slice(offset, length); - Arc::new(Self { - inner, - shredding_state, - }) - } - - fn len(&self) -> usize { + pub fn len(&self) -> usize { self.inner.len() } - fn is_empty(&self) -> bool { + pub fn is_empty(&self) -> bool { self.inner.is_empty() } - fn offset(&self) -> usize { + pub fn offset(&self) -> usize { self.inner.offset() } - fn nulls(&self) -> Option<&NullBuffer> { + pub fn nulls(&self) -> Option<&NullBuffer> { // According to the shredding spec, ShreddedVariantFieldArray should be // physically non-nullable - SQL NULL is inferred by both value and // typed_value being physically NULL None } +} - fn get_buffer_memory_size(&self) -> usize { - self.inner.get_buffer_memory_size() +impl From for ArrayRef { + fn from(array: ShreddedVariantFieldArray) -> Self { + Arc::new(array.into_inner()) } +} - fn get_array_memory_size(&self) -> usize { - self.inner.get_array_memory_size() +impl From for StructArray { + fn from(array: ShreddedVariantFieldArray) -> Self { + array.into_inner() } } @@ -425,7 +641,7 @@ impl Array for ShreddedVariantFieldArray { /// | non-null | non-null | The value is present and is a partially shredded object | /// /// [Parquet Variant Shredding Spec]: https://github.com/apache/parquet-format/blob/master/VariantShredding.md#value-shredding -#[derive(Debug)] +#[derive(Debug, Clone)] pub enum ShreddingState { /// This variant has no typed_value field Unshredded { value: BinaryViewArray }, @@ -627,70 +843,57 @@ fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant<'_, ' } } -impl Array for VariantArray { - fn as_any(&self) -> &dyn Any { - self - } - - fn to_data(&self) -> ArrayData { - self.inner.to_data() - } - - fn into_data(self) -> ArrayData { - self.inner.into_data() - } - - fn data_type(&self) -> &DataType { - self.inner.data_type() - } - - fn slice(&self, offset: usize, length: usize) -> ArrayRef { - let inner = self.inner.slice(offset, length); - let metadata = self.metadata.slice(offset, length); - let shredding_state = self.shredding_state.slice(offset, length); - Arc::new(Self { - inner, - metadata, - shredding_state, - }) - } - - fn len(&self) -> usize { - self.inner.len() - } - - fn is_empty(&self) -> bool { - self.inner.is_empty() - } - - fn offset(&self) -> usize { - self.inner.offset() - } - - fn nulls(&self) -> Option<&NullBuffer> { - self.inner.nulls() - } - - fn get_buffer_memory_size(&self) -> usize { - self.inner.get_buffer_memory_size() - } +/// Workaround for lack of direct support for BinaryArray +/// +/// +/// The values are read as +/// * `StructArray` +/// +/// but VariantArray needs them as +/// * `StructArray` +/// +/// So cast them to get the right type. +fn cast_to_binary_view_arrays(array: &dyn Array) -> Result { + let new_type = map_type(array.data_type()); + cast(array, &new_type) +} - fn get_array_memory_size(&self) -> usize { - self.inner.get_array_memory_size() +/// replaces all instances of Binary with BinaryView in a DataType +fn map_type(data_type: &DataType) -> DataType { + match data_type { + DataType::Binary => DataType::BinaryView, + DataType::List(field) => { + let new_field = field + .as_ref() + .clone() + .with_data_type(map_type(field.data_type())); + DataType::List(Arc::new(new_field)) + } + DataType::Struct(fields) => { + let new_fields: Fields = fields + .iter() + .map(|f| { + let new_field = f.as_ref().clone().with_data_type(map_type(f.data_type())); + Arc::new(new_field) + }) + .collect(); + DataType::Struct(new_fields) + } + _ => data_type.clone(), } } #[cfg(test)] mod test { use super::*; - use arrow::array::{BinaryArray, BinaryViewArray}; + use arrow::array::{BinaryViewArray, Int32Array}; use arrow_schema::{Field, Fields}; #[test] fn invalid_not_a_struct_array() { let array = make_binary_view_array(); // Should fail because the input is not a StructArray - let err = VariantArray::try_new(array); + let err = VariantArray::try_new(&array); assert_eq!( err.unwrap_err().to_string(), "Invalid argument error: Invalid VariantArray: requires StructArray as input" @@ -702,7 +905,7 @@ mod test { let fields = Fields::from(vec![Field::new("value", DataType::BinaryView, true)]); let array = StructArray::new(fields, vec![make_binary_view_array()], None); // Should fail because the StructArray does not contain a 'metadata' field - let err = VariantArray::try_new(Arc::new(array)); + let err = VariantArray::try_new(&array); assert_eq!( err.unwrap_err().to_string(), "Invalid argument error: Invalid VariantArray: StructArray must contain a 'metadata' field" @@ -717,7 +920,7 @@ mod test { // NOTE: By strict spec interpretation, this case (top-level variant with null/null) // should be invalid, but we currently allow it and treat it as Variant::Null. // This is a pragmatic decision to handle missing data gracefully. - let variant_array = VariantArray::try_new(Arc::new(array)).unwrap(); + let variant_array = VariantArray::try_new(&array).unwrap(); // Verify the shredding state is AllNull assert!(matches!( @@ -736,18 +939,18 @@ mod test { #[test] fn invalid_metadata_field_type() { let fields = Fields::from(vec![ - Field::new("metadata", DataType::Binary, true), // Not yet supported + Field::new("metadata", DataType::Int32, true), // not supported Field::new("value", DataType::BinaryView, true), ]); let array = StructArray::new( fields, - vec![make_binary_array(), make_binary_view_array()], + vec![make_int32_array(), make_binary_view_array()], None, ); - let err = VariantArray::try_new(Arc::new(array)); + let err = VariantArray::try_new(&array); assert_eq!( err.unwrap_err().to_string(), - "Not yet implemented: VariantArray 'metadata' field must be BinaryView, got Binary" + "Not yet implemented: VariantArray 'metadata' field must be BinaryView, got Int32" ); } @@ -755,17 +958,17 @@ mod test { fn invalid_value_field_type() { let fields = Fields::from(vec![ Field::new("metadata", DataType::BinaryView, true), - Field::new("value", DataType::Binary, true), // Not yet supported + Field::new("value", DataType::Int32, true), // Not yet supported ]); let array = StructArray::new( fields, - vec![make_binary_view_array(), make_binary_array()], + vec![make_binary_view_array(), make_int32_array()], None, ); - let err = VariantArray::try_new(Arc::new(array)); + let err = VariantArray::try_new(&array); assert_eq!( err.unwrap_err().to_string(), - "Not yet implemented: VariantArray 'value' field must be BinaryView, got Binary" + "Not yet implemented: VariantArray 'value' field must be BinaryView, got Int32" ); } @@ -773,8 +976,8 @@ mod test { Arc::new(BinaryViewArray::from(vec![b"test" as &[u8]])) } - fn make_binary_array() -> ArrayRef { - Arc::new(BinaryArray::from(vec![b"test" as &[u8]])) + fn make_int32_array() -> ArrayRef { + Arc::new(Int32Array::from(vec![1])) } #[test] @@ -793,7 +996,7 @@ mod test { let fields = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]); let struct_array = StructArray::new(fields, vec![Arc::new(metadata)], Some(nulls)); - let variant_array = VariantArray::try_new(Arc::new(struct_array)).unwrap(); + let variant_array = VariantArray::try_new(&struct_array).unwrap(); // Verify the shredding state is AllNull assert!(matches!( @@ -843,7 +1046,7 @@ mod test { None, // struct itself is not null, just the value field is all null ); - let variant_array = VariantArray::try_new(Arc::new(struct_array)).unwrap(); + let variant_array = VariantArray::try_new(&struct_array).unwrap(); // This should be Unshredded, not AllNull, because value field exists in schema assert!(matches!( diff --git a/parquet-variant-compute/src/variant_array_builder.rs b/parquet-variant-compute/src/variant_array_builder.rs index 6451e3565802..68c1fd6b5492 100644 --- a/parquet-variant-compute/src/variant_array_builder.rs +++ b/parquet-variant-compute/src/variant_array_builder.rs @@ -133,7 +133,7 @@ impl VariantArrayBuilder { ); // TODO add arrow extension type metadata - VariantArray::try_new(Arc::new(inner)).expect("valid VariantArray by construction") + VariantArray::try_new(&inner).expect("valid VariantArray by construction") } /// Appends a null row to the builder. diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 0e111685169b..f9a38dfef728 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -27,11 +27,12 @@ use crate::variant_array::{ShreddedVariantFieldArray, ShreddingState}; use crate::variant_to_arrow::make_variant_to_arrow_row_builder; use crate::VariantArray; +use arrow::array::AsArray; use std::sync::Arc; -pub(crate) enum ShreddedPathStep<'a> { +pub(crate) enum ShreddedPathStep { /// Path step succeeded, return the new shredding state - Success(&'a ShreddingState), + Success(ShreddingState), /// The path element is not present in the `typed_value` column and there is no `value` column, /// so we we know it does not exist. It, and all paths under it, are all-NULL. Missing, @@ -46,11 +47,11 @@ pub(crate) enum ShreddedPathStep<'a> { /// level, or if `typed_value` is not a struct, or if the requested field name does not exist. /// /// TODO: Support `VariantPathElement::Index`? It wouldn't be easy, and maybe not even possible. -pub(crate) fn follow_shredded_path_element<'a>( - shredding_state: &'a ShreddingState, +pub(crate) fn follow_shredded_path_element( + shredding_state: &ShreddingState, path_element: &VariantPathElement<'_>, cast_options: &CastOptions, -) -> Result> { +) -> Result { // If the requested path element is not present in `typed_value`, and `value` is missing, then // we know it does not exist; it, and all paths under it, are all-NULL. let missing_path_step = || { @@ -87,20 +88,20 @@ pub(crate) fn follow_shredded_path_element<'a>( return Ok(missing_path_step()); }; - let field = field - .as_any() - .downcast_ref::() - .ok_or_else(|| { - // TODO: Should we blow up? Or just end the traversal and let the normal - // variant pathing code sort out the mess that it must anyway be - // prepared to handle? - ArrowError::InvalidArgumentError(format!( - "Expected a ShreddedVariantFieldArray, got {:?} instead", - field.data_type(), - )) - })?; - - Ok(ShreddedPathStep::Success(field.shredding_state())) + let struct_array = field.as_struct_opt().ok_or_else(|| { + // TODO: Should we blow up? Or just end the traversal and let the normal + // variant pathing code sort out the mess that it must anyway be + // prepared to handle? + ArrowError::InvalidArgumentError(format!( + "Expected Struct array while following path, got {}", + field.data_type(), + )) + })?; + + let shredding_state = + ShreddedVariantFieldArray::shredding_state_from_struct_array(struct_array)?; + + Ok(ShreddedPathStep::Success(shredding_state)) } VariantPathElement::Index { .. } => { // TODO: Support array indexing. Among other things, it will require slicing not @@ -154,11 +155,11 @@ fn shredded_get_path( // Peel away the prefix of path elements that traverses the shredded parts of this variant // column. Shredding will traverse the rest of the path on a per-row basis. - let mut shredding_state = input.shredding_state(); + let mut shredding_state = input.shredding_state().clone(); let mut accumulated_nulls = input.inner().nulls().cloned(); let mut path_index = 0; for path_element in path { - match follow_shredded_path_element(shredding_state, path_element, cast_options)? { + match follow_shredded_path_element(&shredding_state, path_element, cast_options)? { ShreddedPathStep::Success(state) => { // Union nulls from the typed_value we just accessed if let Some(typed_value) = shredding_state.typed_value_field() { @@ -199,7 +200,7 @@ fn shredded_get_path( // If our caller did not request any specific type, we can just return whatever we landed on. let Some(as_field) = as_field else { - return Ok(Arc::new(target)); + return Ok(ArrayRef::from(target)); }; // Structs are special. Recurse into each field separately, hoping to follow the shredding even @@ -242,11 +243,7 @@ fn shredded_get_path( /// quickly become annoying (and inefficient) to call `variant_get` for each leaf value in a struct or /// list and then try to assemble the results. pub fn variant_get(input: &ArrayRef, options: GetOptions) -> Result { - let variant_array: &VariantArray = input.as_any().downcast_ref().ok_or_else(|| { - ArrowError::InvalidArgumentError( - "expected a VariantArray as the input for variant_get".to_owned(), - ) - })?; + let variant_array = VariantArray::try_new(&input)?; let GetOptions { as_type, @@ -254,7 +251,7 @@ pub fn variant_get(input: &ArrayRef, options: GetOptions) -> Result { cast_options, } = options; - shredded_get_path(variant_array, &path, as_type.as_deref(), &cast_options) + shredded_get_path(&variant_array, &path, as_type.as_deref(), &cast_options) } /// Controls the action of the variant_get kernel. @@ -303,9 +300,9 @@ mod test { use std::sync::Arc; use arrow::array::{ - Array, ArrayRef, BinaryViewArray, Float16Array, Float32Array, Float64Array, Int16Array, - Int32Array, Int64Array, Int8Array, StringArray, StructArray, UInt16Array, UInt32Array, - UInt64Array, UInt8Array, + Array, ArrayRef, AsArray, BinaryViewArray, BooleanArray, FixedSizeBinaryArray, + Float16Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, + StringArray, StructArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; use arrow::buffer::NullBuffer; use arrow::compute::CastOptions; @@ -322,8 +319,7 @@ mod test { fn single_variant_get_test(input_json: &str, path: VariantPath, expected_json: &str) { // Create input array from JSON string let input_array_ref: ArrayRef = Arc::new(StringArray::from(vec![Some(input_json)])); - let input_variant_array_ref: ArrayRef = - Arc::new(json_to_variant(&input_array_ref).unwrap()); + let input_variant_array_ref = ArrayRef::from(json_to_variant(&input_array_ref).unwrap()); let result = variant_get(&input_variant_array_ref, GetOptions::new_with_path(path)).unwrap(); @@ -332,7 +328,7 @@ mod test { let expected_array_ref: ArrayRef = Arc::new(StringArray::from(vec![Some(expected_json)])); let expected_variant_array = json_to_variant(&expected_array_ref).unwrap(); - let result_array: &VariantArray = result.as_any().downcast_ref().unwrap(); + let result_array = VariantArray::try_new(&result).unwrap(); assert_eq!( result_array.len(), 1, @@ -408,7 +404,7 @@ mod test { let result = variant_get(&array, options).unwrap(); // expect the result is a VariantArray - let result: &VariantArray = result.as_any().downcast_ref().unwrap(); + let result = VariantArray::try_new(&result).unwrap(); assert_eq!(result.len(), 4); // Expect the values are the same as the original values @@ -487,7 +483,7 @@ mod test { let result = variant_get(&array, options).unwrap(); // expect the result is a VariantArray - let result: &VariantArray = result.as_any().downcast_ref().unwrap(); + let result = VariantArray::try_new(&result).unwrap(); assert_eq!(result.len(), 4); // Expect the values are the same as the original values @@ -504,7 +500,7 @@ mod test { let result = variant_get(&array, options).unwrap(); // expect the result is a VariantArray - let result: &VariantArray = result.as_any().downcast_ref().unwrap(); + let result = VariantArray::try_new(&result).unwrap(); assert_eq!(result.len(), 4); // Expect the values are the same as the original values @@ -521,7 +517,7 @@ mod test { let result = variant_get(&array, options).unwrap(); // expect the result is a VariantArray - let result: &VariantArray = result.as_any().downcast_ref().unwrap(); + let result = VariantArray::try_new(&result).unwrap(); assert_eq!(result.len(), 4); // Expect the values are the same as the original values @@ -538,7 +534,7 @@ mod test { let result = variant_get(&array, options).unwrap(); // expect the result is a VariantArray - let result: &VariantArray = result.as_any().downcast_ref().unwrap(); + let result = VariantArray::try_new(&result).unwrap(); assert_eq!(result.len(), 4); // Expect the values are the same as the original values @@ -593,7 +589,7 @@ mod test { let result = variant_get(&array, options).unwrap(); // expect the result is a VariantArray - let result: &VariantArray = result.as_any().downcast_ref().unwrap(); + let result = VariantArray::try_new(&result).unwrap(); assert_eq!(result.len(), 3); // Expect the values are the same as the original values @@ -675,7 +671,7 @@ mod test { let result = variant_get(&array, options).unwrap(); // expect the result is a VariantArray - let result: &VariantArray = result.as_any().downcast_ref().unwrap(); + let result = VariantArray::try_new(&result).unwrap(); assert_eq!(result.len(), 3); // All values should be null @@ -795,10 +791,9 @@ mod test { .with_field("typed_value", Arc::new(typed_value), true) .build(); - Arc::new( - VariantArray::try_new(Arc::new(struct_array)) - .expect("should create variant array"), - ) + VariantArray::try_new(&struct_array) + .expect("should create variant array") + .into() } }; } @@ -926,10 +921,7 @@ mod test { .with_nulls(nulls) .build(); - Arc::new( - VariantArray::try_new(Arc::new(struct_array)) - .expect("should create variant array"), - ) + Arc::new(struct_array) } }; } @@ -1017,7 +1009,7 @@ mod test { None, // row 3 is shredded, so no value ]); - let typed_value = arrow::array::BooleanArray::from(vec![ + let typed_value = BooleanArray::from(vec![ Some(true), // row 0 is shredded, so it has a value None, // row 1 is null, so no value None, // row 2 is a string, so no typed value @@ -1031,9 +1023,7 @@ mod test { .with_nulls(nulls) .build(); - Arc::new( - VariantArray::try_new(Arc::new(struct_array)).expect("should create variant array"), - ) + Arc::new(struct_array) } /// Return a VariantArray that represents a partially "shredded" variant for fixed size binary @@ -1077,7 +1067,7 @@ mod test { false, // row 2 is string true, // row 3 has value ]); - let typed_value = arrow::array::FixedSizeBinaryArray::try_new( + let typed_value = FixedSizeBinaryArray::try_new( 3, // byte width arrow::buffer::Buffer::from(data), Some(typed_value_nulls), @@ -1091,9 +1081,7 @@ mod test { .with_nulls(nulls) .build(); - Arc::new( - VariantArray::try_new(Arc::new(struct_array)).expect("should create variant array"), - ) + Arc::new(struct_array) } /// Return a VariantArray that represents a partially "shredded" variant for UTF8 @@ -1138,9 +1126,7 @@ mod test { .with_nulls(nulls) .build(); - Arc::new( - VariantArray::try_new(Arc::new(struct_array)).expect("should create variant array"), - ) + Arc::new(struct_array) } /// Return a VariantArray that represents a partially "shredded" variant for BinaryView @@ -1185,9 +1171,7 @@ mod test { .with_nulls(nulls) .build(); - Arc::new( - VariantArray::try_new(Arc::new(struct_array)).expect("should create variant array"), - ) + Arc::new(struct_array) } /// Return a VariantArray that represents an "all null" variant @@ -1222,9 +1206,7 @@ mod test { .with_nulls(nulls) .build(); - Arc::new( - VariantArray::try_new(Arc::new(struct_array)).expect("should create variant array"), - ) + Arc::new(struct_array) } /// This test manually constructs a shredded variant array representing objects /// like {"x": 1, "y": "foo"} and {"x": 42} and tests extracting the "x" field @@ -1237,7 +1219,7 @@ mod test { let options = GetOptions::new_with_path(VariantPath::from("x")); let result = variant_get(&array, options).unwrap(); - let result_variant: &VariantArray = result.as_any().downcast_ref().unwrap(); + let result_variant = VariantArray::try_new(&result).unwrap(); assert_eq!(result_variant.len(), 2); // Row 0: expect x=1 @@ -1325,7 +1307,7 @@ mod test { )]); let typed_value_struct = StructArray::try_new( typed_value_fields, - vec![Arc::new(x_field_shredded)], + vec![ArrayRef::from(x_field_shredded)], None, // No nulls - both rows have the object structure ) .unwrap(); @@ -1337,7 +1319,7 @@ mod test { .with_field("typed_value", Arc::new(typed_value_struct), true) .build(); - Arc::new(VariantArray::try_new(Arc::new(main_struct)).expect("should create variant array")) + Arc::new(main_struct) } /// Simple test to check if nested paths are supported by current implementation @@ -1580,7 +1562,7 @@ mod test { } } - Arc::new(builder.build()) + builder.build().into() } /// Create test data for depth 1 (single nested field) @@ -1610,7 +1592,7 @@ mod test { } } - Arc::new(builder.build()) + builder.build().into() } /// Create test data for depth 2 (double nested field) @@ -1651,7 +1633,7 @@ mod test { } } - Arc::new(builder.build()) + builder.build().into() } /// Create simple shredded test data for depth 0 using a simplified working pattern @@ -1702,9 +1684,12 @@ mod test { x_field_shredded.data_type().clone(), true, )]); - let typed_value_struct = - StructArray::try_new(typed_value_fields, vec![Arc::new(x_field_shredded)], None) - .unwrap(); + let typed_value_struct = StructArray::try_new( + typed_value_fields, + vec![ArrayRef::from(x_field_shredded)], + None, + ) + .unwrap(); // Build final VariantArray let struct_array = StructArrayBuilder::new() @@ -1713,7 +1698,7 @@ mod test { .with_field("typed_value", Arc::new(typed_value_struct), true) .build(); - Arc::new(VariantArray::try_new(Arc::new(struct_array)).expect("should create VariantArray")) + Arc::new(struct_array) } /// Create working depth 1 shredded test data based on the existing working pattern @@ -1799,8 +1784,12 @@ mod test { .with_field( "typed_value", Arc::new( - StructArray::try_new(a_inner_fields, vec![Arc::new(x_field_shredded)], None) - .unwrap(), + StructArray::try_new( + a_inner_fields, + vec![ArrayRef::from(x_field_shredded)], + None, + ) + .unwrap(), ), true, ) @@ -1815,9 +1804,12 @@ mod test { a_field_shredded.data_type().clone(), true, )]); - let typed_value_struct = - StructArray::try_new(typed_value_fields, vec![Arc::new(a_field_shredded)], None) - .unwrap(); + let typed_value_struct = StructArray::try_new( + typed_value_fields, + vec![ArrayRef::from(a_field_shredded)], + None, + ) + .unwrap(); // Build final VariantArray let struct_array = StructArrayBuilder::new() @@ -1826,7 +1818,7 @@ mod test { .with_field("typed_value", Arc::new(typed_value_struct), true) .build(); - Arc::new(VariantArray::try_new(Arc::new(struct_array)).expect("should create VariantArray")) + Arc::new(struct_array) } /// Create working depth 2 shredded test data for "a.b.x" paths @@ -1903,8 +1895,12 @@ mod test { .with_field( "typed_value", Arc::new( - StructArray::try_new(b_inner_fields, vec![Arc::new(x_field_shredded)], None) - .unwrap(), + StructArray::try_new( + b_inner_fields, + vec![ArrayRef::from(x_field_shredded)], + None, + ) + .unwrap(), ), true, ) @@ -1936,8 +1932,12 @@ mod test { .with_field( "typed_value", Arc::new( - StructArray::try_new(a_inner_fields, vec![Arc::new(b_field_shredded)], None) - .unwrap(), + StructArray::try_new( + a_inner_fields, + vec![ArrayRef::from(b_field_shredded)], + None, + ) + .unwrap(), ), true, ) @@ -1952,9 +1952,12 @@ mod test { a_field_shredded.data_type().clone(), true, )]); - let typed_value_struct = - StructArray::try_new(typed_value_fields, vec![Arc::new(a_field_shredded)], None) - .unwrap(); + let typed_value_struct = StructArray::try_new( + typed_value_fields, + vec![ArrayRef::from(a_field_shredded)], + None, + ) + .unwrap(); // Build final VariantArray let struct_array = StructArrayBuilder::new() @@ -1963,7 +1966,7 @@ mod test { .with_field("typed_value", Arc::new(typed_value_struct), true) .build(); - Arc::new(VariantArray::try_new(Arc::new(struct_array)).expect("should create VariantArray")) + Arc::new(struct_array) } #[test] @@ -1984,7 +1987,7 @@ mod test { cast_options: CastOptions::default(), // safe = true }; - let variant_array_ref: Arc = variant_array.clone(); + let variant_array_ref: Arc = variant_array.clone(); let result = variant_get(&variant_array_ref, safe_options); // Should succeed and return NULLs (safe behavior) assert!(result.is_ok()); @@ -2041,7 +2044,7 @@ mod test { cast_options: CastOptions::default(), }; - let variant_array_ref: Arc = variant_array.clone(); + let variant_array_ref: Arc = variant_array.clone(); let result = variant_get(&variant_array_ref, options).unwrap(); // Verify the result length matches input @@ -2057,10 +2060,7 @@ mod test { ); // Verify the actual values - let int32_result = result - .as_any() - .downcast_ref::() - .unwrap(); + let int32_result = result.as_any().downcast_ref::().unwrap(); assert_eq!(int32_result.value(0), 55); // The valid Int32 value } @@ -2100,26 +2100,23 @@ mod test { cast_options: CastOptions::default(), }; - let variant_array_ref: Arc = Arc::new(variant_array); + let variant_array_ref = ArrayRef::from(variant_array); let result = variant_get(&variant_array_ref, options).unwrap(); // Verify the result is a StructArray - let struct_result = result - .as_any() - .downcast_ref::() - .unwrap(); + let struct_result = result.as_struct(); assert_eq!(struct_result.len(), 3); // Get the individual field arrays let field_a = struct_result .column(0) .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap(); let field_b = struct_result .column(1) .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap(); // Verify field values and nulls @@ -2181,13 +2178,13 @@ mod test { cast_options: CastOptions::default(), }; - let variant_array_ref: Arc = Arc::new(variant_array); + let variant_array_ref = ArrayRef::from(variant_array); let result_nullable = variant_get(&variant_array_ref, options_nullable).unwrap(); // Verify we get an Int32Array with nulls for cast failures let int32_result = result_nullable .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap(); assert_eq!(int32_result.len(), 9); @@ -2236,11 +2233,11 @@ mod test { // Create variant array again since we moved it let variant_array_2 = json_to_variant(&string_array).unwrap(); - let variant_array_ref_2: Arc = Arc::new(variant_array_2); + let variant_array_ref_2 = ArrayRef::from(variant_array_2); let result_non_nullable = variant_get(&variant_array_ref_2, options_non_nullable).unwrap(); let int32_result_2 = result_non_nullable .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap(); // Even with a non-nullable field, safe casting should still produce nulls for failures @@ -2553,7 +2550,7 @@ mod test { cast_options: CastOptions::default(), }; - let variant_array_ref: Arc = Arc::new(variant_array); + let variant_array_ref = ArrayRef::from(variant_array); let result = variant_get(&variant_array_ref, options); // Should fail with NotYetImplemented when the row builder tries to handle struct type @@ -2617,9 +2614,9 @@ mod test { let typed_value_struct = StructArray::try_new( typed_value_fields, vec![ - Arc::new(a_field_shredded), - Arc::new(b_field_shredded), - Arc::new(c_field_shredded), + ArrayRef::from(a_field_shredded), + ArrayRef::from(b_field_shredded), + ArrayRef::from(c_field_shredded), ], None, ) @@ -2632,7 +2629,7 @@ mod test { .with_nulls(nulls) .build(); - Arc::new(VariantArray::try_new(Arc::new(struct_array)).expect("should create VariantArray")) + Arc::new(struct_array) } /// Create comprehensive nested shredded variant with diverse null patterns @@ -2655,7 +2652,7 @@ mod test { false, // row 3: top-level NULL ]); let outer_typed_value = StructArrayBuilder::new() - .with_field("inner", Arc::new(inner), false) + .with_field("inner", ArrayRef::from(inner), false) .with_nulls(outer_typed_value_nulls) .build(); @@ -2671,7 +2668,7 @@ mod test { false, // row 3: top-level NULL ]); let typed_value = StructArrayBuilder::new() - .with_field("outer", Arc::new(outer), false) + .with_field("outer", ArrayRef::from(outer), false) .with_nulls(typed_value_nulls) .build(); @@ -2690,7 +2687,7 @@ mod test { .with_nulls(nulls) .build(); - Arc::new(VariantArray::try_new(Arc::new(struct_array)).expect("should create VariantArray")) + Arc::new(struct_array) } /// Create variant with mixed shredding (spec-compliant) including null scenarios @@ -2748,7 +2745,7 @@ mod test { // Create main typed_value struct (only contains shredded fields) let typed_value_struct = StructArrayBuilder::new() - .with_field("x", Arc::new(x_field_shredded), false) + .with_field("x", ArrayRef::from(x_field_shredded), false) .build(); // Build VariantArray with both value and typed_value (PartiallyShredded) @@ -2761,6 +2758,6 @@ mod test { .with_nulls(variant_nulls) .build(); - Arc::new(VariantArray::try_new(Arc::new(struct_array)).expect("should create VariantArray")) + Arc::new(struct_array) } } diff --git a/parquet-variant-compute/src/variant_to_arrow.rs b/parquet-variant-compute/src/variant_to_arrow.rs index df9677edfb44..c32fc9b981ea 100644 --- a/parquet-variant-compute/src/variant_to_arrow.rs +++ b/parquet-variant-compute/src/variant_to_arrow.rs @@ -310,11 +310,13 @@ impl VariantToBinaryVariantArrowRowBuilder { } fn finish(mut self) -> Result { - Ok(Arc::new(VariantArray::from_parts( + let variant_array = VariantArray::from_parts( self.metadata, Some(self.builder.build()?), None, // no typed_value column self.nulls.finish(), - ))) + ); + + Ok(ArrayRef::from(variant_array)) } } diff --git a/parquet/tests/variant_integration.rs b/parquet/tests/variant_integration.rs index 97fb6b880108..13f5afa2645d 100644 --- a/parquet/tests/variant_integration.rs +++ b/parquet/tests/variant_integration.rs @@ -24,15 +24,12 @@ //! Inspired by the arrow-go implementation: use arrow::util::test_util::parquet_test_data; -use arrow_array::{Array, ArrayRef}; -use arrow_cast::cast; -use arrow_schema::{DataType, Fields}; use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use parquet_variant::{Variant, VariantMetadata}; use parquet_variant_compute::VariantArray; use serde::Deserialize; use std::path::Path; -use std::sync::{Arc, LazyLock}; +use std::sync::LazyLock; use std::{fs, path::PathBuf}; type Result = std::result::Result; @@ -399,57 +396,12 @@ impl VariantTestCase { .column_by_name("var") .unwrap_or_else(|| panic!("No 'var' column found in parquet file {path:?}")); - // the values are read as - // * StructArray - // but VariantArray needs them as - // * StructArray - // - // So cast them to get the right type. Hack Alert: the parquet reader - // should read them directly as BinaryView - let var = cast_to_binary_view_arrays(var); - - VariantArray::try_new(var).unwrap_or_else(|e| { + VariantArray::try_new(&var).unwrap_or_else(|e| { panic!("Error converting StructArray to VariantArray for {path:?}: {e}") }) } } -fn cast_to_binary_view_arrays(array: &ArrayRef) -> ArrayRef { - let new_type = map_type(array.data_type()); - cast(array, &new_type).unwrap_or_else(|e| { - panic!( - "Error casting array from {:?} to {:?}: {e}", - array.data_type(), - new_type - ) - }) -} - -/// replaces all instances of Binary with BinaryView in a DataType -fn map_type(data_type: &DataType) -> DataType { - match data_type { - DataType::Binary => DataType::BinaryView, - DataType::List(field) => { - let new_field = field - .as_ref() - .clone() - .with_data_type(map_type(field.data_type())); - DataType::List(Arc::new(new_field)) - } - DataType::Struct(fields) => { - let new_fields: Fields = fields - .iter() - .map(|f| { - let new_field = f.as_ref().clone().with_data_type(map_type(f.data_type())); - Arc::new(new_field) - }) - .collect(); - DataType::Struct(new_fields) - } - _ => data_type.clone(), - } -} - /// Variant value loaded from .variant.bin file #[derive(Debug, Clone)] struct ExpectedVariant { From 5cc767e8c0805eb373d6f22c1e7c4f1f66138c75 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 19 Sep 2025 13:12:07 -0400 Subject: [PATCH 02/15] ArrayRef::from consistently --- parquet-variant-compute/src/variant_get.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index f9a38dfef728..792f4a9a17a0 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -1562,7 +1562,7 @@ mod test { } } - builder.build().into() + ArrayRef::from(builder.build()) } /// Create test data for depth 1 (single nested field) @@ -1592,7 +1592,7 @@ mod test { } } - builder.build().into() + ArrayRef::from(builder.build()) } /// Create test data for depth 2 (double nested field) @@ -1633,7 +1633,7 @@ mod test { } } - builder.build().into() + ArrayRef::from(builder.build()) } /// Create simple shredded test data for depth 0 using a simplified working pattern From 3e6b0cb8e570683c82c662b56958287ac57e72f1 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 19 Sep 2025 13:17:48 -0400 Subject: [PATCH 03/15] Avoid unecessary & --- parquet-variant-compute/src/variant_get.rs | 2 +- parquet/tests/variant_integration.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 792f4a9a17a0..5c3d4c155dcc 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -243,7 +243,7 @@ fn shredded_get_path( /// quickly become annoying (and inefficient) to call `variant_get` for each leaf value in a struct or /// list and then try to assemble the results. pub fn variant_get(input: &ArrayRef, options: GetOptions) -> Result { - let variant_array = VariantArray::try_new(&input)?; + let variant_array = VariantArray::try_new(input)?; let GetOptions { as_type, diff --git a/parquet/tests/variant_integration.rs b/parquet/tests/variant_integration.rs index 13f5afa2645d..a2ca20cea7af 100644 --- a/parquet/tests/variant_integration.rs +++ b/parquet/tests/variant_integration.rs @@ -396,7 +396,7 @@ impl VariantTestCase { .column_by_name("var") .unwrap_or_else(|| panic!("No 'var' column found in parquet file {path:?}")); - VariantArray::try_new(&var).unwrap_or_else(|e| { + VariantArray::try_new(var).unwrap_or_else(|e| { panic!("Error converting StructArray to VariantArray for {path:?}: {e}") }) } From 9b0e89fd30b1d40e9aebd9d481f18e6a8d78ea42 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 19 Sep 2025 13:37:12 -0400 Subject: [PATCH 04/15] Introduce `ShreddingState::try_from` --- parquet-variant-compute/src/variant_array.rs | 48 ++++++++++++-------- parquet-variant-compute/src/variant_get.rs | 5 +- 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 3dea8e48b0e4..4998ce7ca8f4 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -541,27 +541,12 @@ impl ShreddedVariantFieldArray { )); }; - let shredding_state = Self::shredding_state_from_struct_array(inner_struct)?; Ok(Self { inner: inner_struct.clone(), - shredding_state, + shredding_state: ShreddingState::try_from(inner_struct)?, }) } - /// Creates a new `ShreddingState` from a [`StructArray`] representing a potentially - /// shredded variant. - pub(crate) fn shredding_state_from_struct_array( - inner_struct: &StructArray, - ) -> Result { - // Extract value and typed_value fields (metadata is not expected in ShreddedVariantFieldArray) - let value = inner_struct - .column_by_name("value") - .and_then(|col| col.as_binary_view_opt().cloned()); - let typed_value = inner_struct.column_by_name("typed_value").cloned(); - - ShreddingState::try_new(value, typed_value) - } - /// Return the shredding state of this `VariantArray` pub fn shredding_state(&self) -> &ShreddingState { &self.shredding_state @@ -633,7 +618,7 @@ impl From for StructArray { /// single value. Values in the two fields must be interpreted according to the /// following table (see [Parquet Variant Shredding Spec] for more details): /// -/// | value | typed_value | Meaning | +/// | value | typed_value | Meaning | /// |----------|--------------|---------| /// | null | null | The value is missing; only valid for shredded object fields | /// | non-null | null | The value is present and may be any type, including `null` | @@ -672,7 +657,20 @@ pub enum ShreddingState { } impl ShreddingState { - /// try to create a new `ShreddingState` from the given fields + /// try to create a new `ShreddingState` from the given `value` and `typed_value` fields + /// + /// Note you can create a `ShreddingState` from a &[`StructArray`] using + /// `ShreddingState::try_from(&struct_array)`, for example: + /// + /// ```no_run + /// # use arrow::array::StructArray; + /// # use parquet_variant_compute::ShreddingState; + /// # fn get_struct_array() -> StructArray { + /// # unimplemented!() + /// # } + /// let struct_array: StructArray = get_struct_array(); + /// let shredding_state = ShreddingState::try_from(&struct_array).unwrap(); + /// ``` pub fn try_new( value: Option, typed_value: Option, @@ -725,6 +723,20 @@ impl ShreddingState { } } +impl TryFrom<&StructArray> for ShreddingState { + type Error = ArrowError; + + fn try_from(inner_struct: &StructArray) -> Result { + // Extract value and typed_value fields (metadata is not expected in ShreddedVariantFieldArray) + let value = inner_struct + .column_by_name("value") + .and_then(|col| col.as_binary_view_opt().cloned()); + let typed_value = inner_struct.column_by_name("typed_value").cloned(); + + ShreddingState::try_new(value, typed_value) + } +} + /// Builds struct arrays from component fields /// /// TODO: move to arrow crate diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 5c3d4c155dcc..d4a5ed5669af 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -23,7 +23,7 @@ use arrow::{ use arrow_schema::{ArrowError, DataType, FieldRef}; use parquet_variant::{VariantPath, VariantPathElement}; -use crate::variant_array::{ShreddedVariantFieldArray, ShreddingState}; +use crate::variant_array::ShreddingState; use crate::variant_to_arrow::make_variant_to_arrow_row_builder; use crate::VariantArray; @@ -98,8 +98,7 @@ pub(crate) fn follow_shredded_path_element( )) })?; - let shredding_state = - ShreddedVariantFieldArray::shredding_state_from_struct_array(struct_array)?; + let shredding_state = ShreddingState::try_from(struct_array)?; Ok(ShreddedPathStep::Success(shredding_state)) } From 4b1daab75fbce48046c5730689ed599c7bba7ffe Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 19 Sep 2025 13:41:07 -0400 Subject: [PATCH 05/15] ShreddingState is infallable --- parquet-variant-compute/src/variant_array.rs | 34 ++++++++------------ parquet-variant-compute/src/variant_get.rs | 2 +- 2 files changed, 14 insertions(+), 22 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 4998ce7ca8f4..ad369dd8c57f 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -291,7 +291,7 @@ impl VariantArray { Ok(Self { inner: inner.clone(), metadata: metadata.clone(), - shredding_state: ShreddingState::try_new(value, typed_value)?, + shredding_state: ShreddingState::new(value, typed_value), }) } @@ -315,12 +315,10 @@ impl VariantArray { // This would be a lot simpler if ShreddingState were just a pair of Option... we already // have everything we need. - let inner = builder.build(); - let shredding_state = ShreddingState::try_new(value, typed_value).unwrap(); // valid by construction Self { - inner, + inner: builder.build(), metadata, - shredding_state, + shredding_state: ShreddingState::new(value, typed_value), } } @@ -543,7 +541,7 @@ impl ShreddedVariantFieldArray { Ok(Self { inner: inner_struct.clone(), - shredding_state: ShreddingState::try_from(inner_struct)?, + shredding_state: ShreddingState::from(inner_struct), }) } @@ -671,15 +669,12 @@ impl ShreddingState { /// let struct_array: StructArray = get_struct_array(); /// let shredding_state = ShreddingState::try_from(&struct_array).unwrap(); /// ``` - pub fn try_new( - value: Option, - typed_value: Option, - ) -> Result { + pub fn new(value: Option, typed_value: Option) -> Self { match (value, typed_value) { - (Some(value), Some(typed_value)) => Ok(Self::PartiallyShredded { value, typed_value }), - (Some(value), None) => Ok(Self::Unshredded { value }), - (None, Some(typed_value)) => Ok(Self::Typed { typed_value }), - (None, None) => Ok(Self::AllNull), + (Some(value), Some(typed_value)) => Self::PartiallyShredded { value, typed_value }, + (Some(value), None) => Self::Unshredded { value }, + (None, Some(typed_value)) => Self::Typed { typed_value }, + (None, None) => Self::AllNull, } } @@ -723,17 +718,14 @@ impl ShreddingState { } } -impl TryFrom<&StructArray> for ShreddingState { - type Error = ArrowError; - - fn try_from(inner_struct: &StructArray) -> Result { - // Extract value and typed_value fields (metadata is not expected in ShreddedVariantFieldArray) +impl From<&StructArray> for ShreddingState { + fn from(inner_struct: &StructArray) -> Self { let value = inner_struct .column_by_name("value") .and_then(|col| col.as_binary_view_opt().cloned()); let typed_value = inner_struct.column_by_name("typed_value").cloned(); - ShreddingState::try_new(value, typed_value) + ShreddingState::new(value, typed_value) } } @@ -994,7 +986,7 @@ mod test { #[test] fn all_null_shredding_state() { - let shredding_state = ShreddingState::try_new(None, None).unwrap(); + let shredding_state = ShreddingState::new(None, None); // Verify the shredding state is AllNull assert!(matches!(shredding_state, ShreddingState::AllNull)); diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index d4a5ed5669af..5cabac9ed31b 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -98,7 +98,7 @@ pub(crate) fn follow_shredded_path_element( )) })?; - let shredding_state = ShreddingState::try_from(struct_array)?; + let shredding_state = ShreddingState::from(struct_array); Ok(ShreddedPathStep::Success(shredding_state)) } From 61812ba067809ee152606179bb66c8dd21ec9055 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 19 Sep 2025 17:00:03 -0400 Subject: [PATCH 06/15] fix test --- parquet-variant-compute/src/variant_get.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 7236b144431c..2eb827f193cf 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -533,7 +533,7 @@ mod test { let result = variant_get(&array, options).unwrap(); // expect the result is a VariantArray - let result: &VariantArray = result.as_any().downcast_ref().unwrap(); + let result = VariantArray::try_new(&result).unwrap(); assert_eq!(result.len(), 4); // Expect the values are the same as the original values From b0d9059c36a58847cdcab1ebb6188983c24749f1 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 20 Sep 2025 05:45:43 -0400 Subject: [PATCH 07/15] remove confusing comments --- parquet-variant-compute/src/variant_array.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index de7f4bf22db5..31c1d7924854 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -56,8 +56,6 @@ impl ExtensionType for VariantType { } fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> { - // Note don't check for metadata/value fields here because they may be - // absent in shredded variants if matches!(data_type, DataType::Struct(_)) { Ok(()) } else { From 179e5f3d95c03c84fd370d3bef7341e35a0c3283 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 20 Sep 2025 05:46:38 -0400 Subject: [PATCH 08/15] Simplify VariantType::try_new --- parquet-variant-compute/src/variant_array.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 31c1d7924854..f31a1170fb13 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -67,9 +67,8 @@ impl ExtensionType for VariantType { } fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> Result { - let new_self = Self; - new_self.supports_data_type(data_type)?; - Ok(new_self) + Self.supports_data_type(data_type)?; + Ok(Self) } } From 32fbb3587ccb5f29eeba8ba022e5effc4b3c41c6 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 20 Sep 2025 05:48:33 -0400 Subject: [PATCH 09/15] Simplify is_null --- parquet-variant-compute/src/variant_array.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index f31a1170fb13..dd693854ecdd 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -443,7 +443,7 @@ impl VariantArray { /// Is the element at index null? pub fn is_null(&self, index: usize) -> bool { - self.nulls().map(|n| n.is_null(index)).unwrap_or_default() + self.nulls().is_some_and(|n| n.is_null(index)) } /// Is the element at index valid (not null)? @@ -612,7 +612,7 @@ impl ShreddedVariantFieldArray { } /// Is the element at index null? pub fn is_null(&self, index: usize) -> bool { - self.nulls().map(|n| n.is_null(index)).unwrap_or_default() + self.nulls().is_some_and(|n| n.is_null(index)) } /// Is the element at index valid (not null)? From ba820961f342a65f73d68ea51723ab91474f365f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 20 Sep 2025 05:50:00 -0400 Subject: [PATCH 10/15] map_type --> rewrite_to_view --- parquet-variant-compute/src/variant_array.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index dd693854ecdd..6f5e08c4c400 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -891,26 +891,29 @@ fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant<'_, ' /// /// So cast them to get the right type. fn cast_to_binary_view_arrays(array: &dyn Array) -> Result { - let new_type = map_type(array.data_type()); + let new_type = rewrite_to_view_types(array.data_type()); cast(array, &new_type) } /// replaces all instances of Binary with BinaryView in a DataType -fn map_type(data_type: &DataType) -> DataType { +fn rewrite_to_view_types(data_type: &DataType) -> DataType { match data_type { DataType::Binary => DataType::BinaryView, DataType::List(field) => { let new_field = field .as_ref() .clone() - .with_data_type(map_type(field.data_type())); + .with_data_type(rewrite_to_view_types(field.data_type())); DataType::List(Arc::new(new_field)) } DataType::Struct(fields) => { let new_fields: Fields = fields .iter() .map(|f| { - let new_field = f.as_ref().clone().with_data_type(map_type(f.data_type())); + let new_field = f + .as_ref() + .clone() + .with_data_type(rewrite_to_view_types(f.data_type())); Arc::new(new_field) }) .collect(); From ddca83626a134ce6eae13dda167ff476eeb572b0 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 20 Sep 2025 05:53:40 -0400 Subject: [PATCH 11/15] Improve formatting of rewrite_to_view_types --- parquet-variant-compute/src/variant_array.rs | 27 ++++++++------------ 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 6f5e08c4c400..c27f1c672bbd 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -899,30 +899,23 @@ fn cast_to_binary_view_arrays(array: &dyn Array) -> Result fn rewrite_to_view_types(data_type: &DataType) -> DataType { match data_type { DataType::Binary => DataType::BinaryView, - DataType::List(field) => { - let new_field = field - .as_ref() - .clone() - .with_data_type(rewrite_to_view_types(field.data_type())); - DataType::List(Arc::new(new_field)) - } + DataType::List(field) => DataType::List(rewrite_field_type(field)), DataType::Struct(fields) => { - let new_fields: Fields = fields - .iter() - .map(|f| { - let new_field = f - .as_ref() - .clone() - .with_data_type(rewrite_to_view_types(f.data_type())); - Arc::new(new_field) - }) - .collect(); + let new_fields: Fields = fields.iter().map(rewrite_field_type).collect(); DataType::Struct(new_fields) } _ => data_type.clone(), } } +fn rewrite_field_type(field: impl AsRef) -> Arc { + let field = field.as_ref(); + let new_field = field + .clone() + .with_data_type(rewrite_to_view_types(field.data_type())); + Arc::new(new_field) +} + #[cfg(test)] mod test { use super::*; From b871df450d85f4d7e68e990cc092a3601ef3b3a5 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 20 Sep 2025 06:00:40 -0400 Subject: [PATCH 12/15] Update parquet-variant-compute/src/variant_get.rs Co-authored-by: Ryan Johnson --- parquet-variant-compute/src/variant_get.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 2eb827f193cf..ef602e84f1bf 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -98,9 +98,7 @@ pub(crate) fn follow_shredded_path_element( )) })?; - let shredding_state = ShreddingState::from(struct_array); - - Ok(ShreddedPathStep::Success(shredding_state)) + Ok(ShreddedPathStep::Success(struct_array.into())) } VariantPathElement::Index { .. } => { // TODO: Support array indexing. Among other things, it will require slicing not From 8aaf81de72f4ab9c1125d9074d087dd7b6eb5d65 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 22 Sep 2025 13:26:57 -0400 Subject: [PATCH 13/15] Apply suggestions from code review Co-authored-by: Matthijs Brobbel --- parquet-variant-compute/src/variant_array.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index c27f1c672bbd..4cad113d3a9a 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -38,7 +38,7 @@ use std::sync::Arc; pub struct VariantType; impl ExtensionType for VariantType { - const NAME: &'static str = "parquet.variant"; + const NAME: &'static str = "arrow.parquet.variant"; // Variants have no extension metadata type Metadata = (); @@ -60,8 +60,7 @@ impl ExtensionType for VariantType { Ok(()) } else { Err(ArrowError::InvalidArgumentError(format!( - "VariantType only supports StructArray, got {}", - data_type + "VariantType only supports StructArray, got {data_type}" ))) } } From 417ba6ba03a9761d2f7cde651f1ff7724e038210 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 22 Sep 2025 13:30:51 -0400 Subject: [PATCH 14/15] Use empty string for metadata --- parquet-variant-compute/src/variant_array.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index c27f1c672bbd..30f4a4371cf4 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -40,19 +40,20 @@ pub struct VariantType; impl ExtensionType for VariantType { const NAME: &'static str = "parquet.variant"; - // Variants have no extension metadata - type Metadata = (); + // Variants extension metadata is an empty string + // + type Metadata = &'static str; fn metadata(&self) -> &Self::Metadata { - &() + &"" } fn serialize_metadata(&self) -> Option { - None + Some(String::new()) } fn deserialize_metadata(_metadata: Option<&str>) -> Result { - Ok(()) + Ok("") } fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> { From 223d2b40d8bf6a8127be5a01001c14b6001e9865 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 22 Sep 2025 13:49:45 -0400 Subject: [PATCH 15/15] Update parquet-variant-compute/src/variant_array.rs Co-authored-by: Ryan Johnson --- parquet-variant-compute/src/variant_array.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index c7a23d2f6c7d..ed4b6fe37e47 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -901,8 +901,7 @@ fn rewrite_to_view_types(data_type: &DataType) -> DataType { DataType::Binary => DataType::BinaryView, DataType::List(field) => DataType::List(rewrite_field_type(field)), DataType::Struct(fields) => { - let new_fields: Fields = fields.iter().map(rewrite_field_type).collect(); - DataType::Struct(new_fields) + DataType::Struct(fields.iter().map(rewrite_field_type).collect()) } _ => data_type.clone(), }