From 9c25cc4275cc916655b4aa2c649389073497383d Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Mon, 15 Sep 2025 13:49:09 -0400 Subject: [PATCH 01/36] Add test shredded variant list array --- parquet-variant-compute/src/variant_get.rs | 53 ++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 7774d136701f..1a5343164a35 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -985,7 +985,60 @@ mod test { let expected: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), Some(42)])); assert_eq!(&result, &expected); } + /// Helper function to create a shredded variant array representing lists + /// + /// This creates an array that represents: + /// Row 0: ["comedy", "drama"] ([0] is shredded, [1] is shredded - perfectly shredded) + /// Row 1: ["horror", null] ([0] is shredded, [1] is binary null - partially shredded) + /// Row 2: ["comedy", "drama", "romance"] (perfectly shredded) + /// + /// The physical layout follows the shredding spec where: + /// - metadata: contains list metadata + /// - typed_value: StructArray with 0 index value + /// - value: contains fallback for + fn shredded_list_variant_array() -> ArrayRef { + // Create the base metadata for lists + + // Could add this as an api for VariantList, like VariantList::from() + fn build_list_metadata(vector: Vec) -> (Vec, Vec) { + let mut builder = parquet_variant::VariantBuilder::new(); + let mut list = builder.new_list(); + for value in vector { + list.append_value(value); + } + list.finish(); + builder.finish() + } + let (metadata1, _) = + build_list_metadata(vec![Variant::String("comedy"), Variant::String("drama")]); + + let (metadata2, _) = build_list_metadata(vec![Variant::String("horror"), Variant::Null]); + + let (metadata3, _) = build_list_metadata(vec![ + Variant::String("comedy"), + Variant::String("drama"), + Variant::String("romance"), + ]); + + // Create metadata array + let metadata_array = + BinaryViewArray::from_iter_values(vec![metadata1, metadata2, metadata3]); + + // Create the untyped value array + let value_array = BinaryViewArray::from(vec![Variant::Null.as_u8_slice()]); + // Maybe I should try with an actual primitive array + let typed_value_array = StringArray::from(vec![ + "comedy", "drama", "horror", "comedy", "drama", "romance", + ]); + // Build the main VariantArray + let main_struct = crate::variant_array::StructArrayBuilder::new() + .with_field("metadata", Arc::new(metadata_array)) + .with_field("value", Arc::new(value_array)) + .with_field("typed_value", Arc::new(typed_value_array)) + .build(); + Arc::new(VariantArray::try_new(Arc::new(main_struct)).expect("should create variant array")) + } /// Helper function to create a shredded variant array representing objects /// /// This creates an array that represents: From ed961a46234a83724b534fa18a587e441b94b4b6 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Mon, 15 Sep 2025 23:12:11 -0400 Subject: [PATCH 02/36] Add basic tests --- parquet-variant-compute/src/variant_get.rs | 43 +++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 1a5343164a35..5e1e62d64693 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -985,6 +985,42 @@ mod test { let expected: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), Some(42)])); assert_eq!(&result, &expected); } + /// This test manually constructs a shredded variant array representing lists + /// like ["comedy", "drama"], ["horror", null] and ["comedy", "drama", "romance"] + /// as VariantArray using variant_get. + #[test] + fn test_shredded_list_field_access() { + let array = shredded_list_variant_array(); + + // Test: Extract the 0 index field as VariantArray first + let options = GetOptions::new_with_path(VariantPath::from(0)); + let result = variant_get(&array, options).unwrap(); + + let result_variant: &VariantArray = result.as_any().downcast_ref().unwrap(); + assert_eq!(result_variant.len(), 3); + + // Row 0: expect 0 index = "comedy" + assert_eq!(result_variant.value(0), Variant::String("comedy")); + // Row 1: expect 0 index = "horror" + assert_eq!(result_variant.value(1), Variant::String("horror")); + // Row 2: expect 0 index = "comedy" + assert_eq!(result_variant.value(2), Variant::String("comedy")); + } + /// Test extracting shredded list field with type conversion + #[test] + fn test_shredded_list_as_string() { + let array = shredded_list_variant_array(); + + // Test: Extract the 0 index values as StringArray (type conversion) + let field = Field::new("typed_value", DataType::Utf8, false); + let options = GetOptions::new_with_path(VariantPath::from(0)) + .with_as_type(Some(FieldRef::from(field))); + let result = variant_get(&array, options).unwrap(); + + // Should get StringArray + let expected: ArrayRef = Arc::new(StringArray::from(vec![Some("comedy"), Some("drama")])); + assert_eq!(&result, &expected); + } /// Helper function to create a shredded variant array representing lists /// /// This creates an array that represents: @@ -1028,7 +1064,12 @@ mod test { let value_array = BinaryViewArray::from(vec![Variant::Null.as_u8_slice()]); // Maybe I should try with an actual primitive array let typed_value_array = StringArray::from(vec![ - "comedy", "drama", "horror", "comedy", "drama", "romance", + Some("comedy"), + Some("drama"), + Some("horror"), + Some("comedy"), + Some("drama"), + Some("romance"), ]); // Build the main VariantArray let main_struct = crate::variant_array::StructArrayBuilder::new() From d53c8312e36b5124073b81988e02650bcf9ee994 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Wed, 17 Sep 2025 16:36:43 -0400 Subject: [PATCH 03/36] Redo test shredded array --- parquet-variant-compute/src/variant_get.rs | 142 ++++++++++++++------- 1 file changed, 93 insertions(+), 49 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 031db6b4e522..ad97e195b8b7 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -298,14 +298,15 @@ mod test { use std::sync::Arc; use arrow::array::{ - Array, ArrayRef, BinaryViewArray, Float16Array, Float32Array, Float64Array, Int16Array, - Int32Array, Int64Array, Int8Array, StringArray, StructArray, UInt16Array, UInt32Array, + make_builder, Array, ArrayRef, BinaryBuilder, BinaryViewArray, Float16Array, Float32Array, + Float64Array, GenericListBuilder, Int16Array, Int32Array, Int64Array, Int8Array, + StringArray, StringBuilder, StructArray, StructBuilder, UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; use arrow::buffer::NullBuffer; use arrow::compute::CastOptions; use arrow_schema::{DataType, Field, FieldRef, Fields}; - use parquet_variant::{Variant, VariantPath, EMPTY_VARIANT_METADATA_BYTES}; + use parquet_variant::{Variant, VariantBuilder, VariantPath, EMPTY_VARIANT_METADATA_BYTES}; use crate::json_to_variant; use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder}; @@ -1090,7 +1091,7 @@ mod test { assert_eq!(&result, &expected); } /// This test manually constructs a shredded variant array representing lists - /// like ["comedy", "drama"], ["horror", null] and ["comedy", "drama", "romance"] + /// like ["comedy", "drama"] and ["horror", 123] /// as VariantArray using variant_get. #[test] fn test_shredded_list_field_access() { @@ -1102,13 +1103,11 @@ mod test { let result_variant: &VariantArray = result.as_any().downcast_ref().unwrap(); assert_eq!(result_variant.len(), 3); - + // Row 0: expect 0 index = "comedy" - assert_eq!(result_variant.value(0), Variant::String("comedy")); + assert_eq!(result_variant.value(0), Variant::from("comedy")); // Row 1: expect 0 index = "horror" - assert_eq!(result_variant.value(1), Variant::String("horror")); - // Row 2: expect 0 index = "comedy" - assert_eq!(result_variant.value(2), Variant::String("comedy")); + assert_eq!(result_variant.value(1), Variant::from("horror")); } /// Test extracting shredded list field with type conversion #[test] @@ -1122,64 +1121,109 @@ mod test { let result = variant_get(&array, options).unwrap(); // Should get StringArray - let expected: ArrayRef = Arc::new(StringArray::from(vec![Some("comedy"), Some("drama")])); + let expected: ArrayRef = + Arc::new(StringArray::from(vec![Some("comedy"), None, Some("drama")])); assert_eq!(&result, &expected); } /// Helper function to create a shredded variant array representing lists /// /// This creates an array that represents: /// Row 0: ["comedy", "drama"] ([0] is shredded, [1] is shredded - perfectly shredded) - /// Row 1: ["horror", null] ([0] is shredded, [1] is binary null - partially shredded) - /// Row 2: ["comedy", "drama", "romance"] (perfectly shredded) + /// Row 1: ["horror", 123] ([0] is shredded, [1] is int - partially shredded) /// /// The physical layout follows the shredding spec where: /// - metadata: contains list metadata /// - typed_value: StructArray with 0 index value /// - value: contains fallback for fn shredded_list_variant_array() -> ArrayRef { - // Create the base metadata for lists - - // Could add this as an api for VariantList, like VariantList::from() - fn build_list_metadata(vector: Vec) -> (Vec, Vec) { - let mut builder = parquet_variant::VariantBuilder::new(); - let mut list = builder.new_list(); - for value in vector { - list.append_value(value); - } - list.finish(); - builder.finish() - } - let (metadata1, _) = - build_list_metadata(vec![Variant::String("comedy"), Variant::String("drama")]); + // Create metadata array + let metadata_array = + BinaryViewArray::from_iter_values(std::iter::repeat_n(EMPTY_VARIANT_METADATA_BYTES, 2)); - let (metadata2, _) = build_list_metadata(vec![Variant::String("horror"), Variant::Null]); + // Building the typed_value ListArray - let (metadata3, _) = build_list_metadata(vec![ - Variant::String("comedy"), - Variant::String("drama"), - Variant::String("romance"), + // Need a StructBuilder to create a ListBuilder + let fields = Fields::from(vec![ + Field::new("value", DataType::Binary, true), + Field::new("typed_value", DataType::Utf8, true), ]); + let field_builders = vec![ + make_builder(&DataType::Binary, 4), + make_builder(&DataType::Utf8, 4), + ]; + let struct_builder = StructBuilder::new(fields, field_builders); + + let mut builder = GenericListBuilder::::new(struct_builder); + + // Row 0 index 0 + builder + .values() + .field_builder::(0) + .unwrap() + .append_null(); + builder + .values() + .field_builder::(1) + .unwrap() + .append_value("comedy"); + builder.values().append(true); + + // Row 0 index 1 + builder + .values() + .field_builder::(0) + .unwrap() + .append_null(); + builder + .values() + .field_builder::(1) + .unwrap() + .append_value("drama"); + builder.values().append(true); + + // Next row + builder.append(true); + + // Row 1 index 0 + builder + .values() + .field_builder::(0) + .unwrap() + .append_null(); + builder + .values() + .field_builder::(1) + .unwrap() + .append_value("horror"); + builder.values().append(true); + + // Row 1 index 1 + let mut variant_builder = VariantBuilder::new(); + variant_builder.append_value(123i32); // <------ couldn't find the right way to do it, used this as placeholder for binary + let (_, value) = variant_builder.finish(); + + builder + .values() + .field_builder::(0) + .unwrap() + .append_value(value); + builder + .values() + .field_builder::(1) + .unwrap() + .append_null(); + builder.values().append(true); + + // Next row + builder.append(true); + + let typed_value_array = builder.finish(); - // Create metadata array - let metadata_array = - BinaryViewArray::from_iter_values(vec![metadata1, metadata2, metadata3]); - - // Create the untyped value array - let value_array = BinaryViewArray::from(vec![Variant::Null.as_u8_slice()]); - // Maybe I should try with an actual primitive array - let typed_value_array = StringArray::from(vec![ - Some("comedy"), - Some("drama"), - Some("horror"), - Some("comedy"), - Some("drama"), - Some("romance"), - ]); // Build the main VariantArray let main_struct = crate::variant_array::StructArrayBuilder::new() - .with_field("metadata", Arc::new(metadata_array)) - .with_field("value", Arc::new(value_array)) - .with_field("typed_value", Arc::new(typed_value_array)) + .with_field("metadata", Arc::new(metadata_array), false) + // .with_field("value", Arc::new(value_array), true) + .with_field("typed_value", Arc::new(typed_value_array), true) .build(); Arc::new(VariantArray::try_new(Arc::new(main_struct)).expect("should create variant array")) From 69de7d742302a39294be5123fa2f49c6d26c36e1 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 19 Sep 2025 14:54:54 -0400 Subject: [PATCH 04/36] Rebuild the shredded list array --- parquet-variant-compute/src/variant_get.rs | 123 +++++++-------------- 1 file changed, 40 insertions(+), 83 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index fccf01b39fb9..d129b030c219 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -302,19 +302,19 @@ impl<'a> GetOptions<'a> { mod test { use std::sync::Arc; + use crate::{json_to_variant, VariantValueArrayBuilder}; use arrow::array::{ - make_builder, Array, ArrayRef, BinaryBuilder, BinaryViewArray, Float16Array, Float32Array, - Float64Array, GenericListBuilder, Int16Array, Int32Array, Int64Array, Int8Array, - StringArray, StringBuilder, StructArray, StructBuilder, UInt16Array, UInt32Array, - UInt64Array, UInt8Array, + Array, ArrayRef, BinaryViewArray, Float16Array, Float32Array, + Float64Array, GenericListArray, Int16Array, Int32Array, Int64Array, + Int8Array, StringArray, StructArray, UInt16Array, + UInt32Array, UInt64Array, UInt8Array, }; - use arrow::buffer::NullBuffer; + use arrow::buffer::{NullBuffer, OffsetBuffer}; use arrow::compute::CastOptions; use arrow::datatypes::DataType::{Int16, Int32, Int64, UInt16, UInt32, UInt64, UInt8}; use arrow_schema::{DataType, Field, FieldRef, Fields}; - use parquet_variant::{Variant, VariantBuilder, VariantPath, EMPTY_VARIANT_METADATA_BYTES}; + use parquet_variant::{Variant, VariantPath, EMPTY_VARIANT_METADATA_BYTES}; - use crate::json_to_variant; use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder}; use crate::VariantArray; @@ -1314,82 +1314,39 @@ mod test { // Building the typed_value ListArray - // Need a StructBuilder to create a ListBuilder - let fields = Fields::from(vec![ - Field::new("value", DataType::Binary, true), - Field::new("typed_value", DataType::Utf8, true), - ]); - let field_builders = vec![ - make_builder(&DataType::Binary, 4), - make_builder(&DataType::Utf8, 4), - ]; - let struct_builder = StructBuilder::new(fields, field_builders); - - let mut builder = GenericListBuilder::::new(struct_builder); - - // Row 0 index 0 - builder - .values() - .field_builder::(0) - .unwrap() - .append_null(); - builder - .values() - .field_builder::(1) - .unwrap() - .append_value("comedy"); - builder.values().append(true); - - // Row 0 index 1 - builder - .values() - .field_builder::(0) - .unwrap() - .append_null(); - builder - .values() - .field_builder::(1) - .unwrap() - .append_value("drama"); - builder.values().append(true); - - // Next row - builder.append(true); - - // Row 1 index 0 - builder - .values() - .field_builder::(0) - .unwrap() - .append_null(); - builder - .values() - .field_builder::(1) - .unwrap() - .append_value("horror"); - builder.values().append(true); - - // Row 1 index 1 - let mut variant_builder = VariantBuilder::new(); - variant_builder.append_value(123i32); // <------ couldn't find the right way to do it, used this as placeholder for binary - let (_, value) = variant_builder.finish(); - - builder - .values() - .field_builder::(0) - .unwrap() - .append_value(value); - builder - .values() - .field_builder::(1) - .unwrap() - .append_null(); - builder.values().append(true); - - // Next row - builder.append(true); - - let typed_value_array = builder.finish(); + let mut variant_value_builder = VariantValueArrayBuilder::new(8); + variant_value_builder.append_null(); + variant_value_builder.append_null(); + variant_value_builder.append_null(); + variant_value_builder.append_value(Variant::from(123i32)); + + let struct_array = StructArrayBuilder::new() + .with_field( + "value", + Arc::new(variant_value_builder.build().unwrap()), + true, + ) + .with_field( + "typed_value", + Arc::new(StringArray::from(vec![ + Some("comedy"), + Some("drama"), + Some("horror"), + None, + ])), + true, + ) + .build(); + + let typed_value_array = GenericListArray::::new( + Arc::new(Field::new_list_field( + struct_array.data_type().clone(), + true, + )), + OffsetBuffer::from_lengths([2,2]), + Arc::new(struct_array), + None, + ); // Build the main VariantArray let main_struct = crate::variant_array::StructArrayBuilder::new() From cc6d7877fcf7009ce70f77d073c9ba328ed59614 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Tue, 23 Sep 2025 17:54:48 -0400 Subject: [PATCH 05/36] Use select::take to build the output array --- parquet-variant-compute/src/variant_get.rs | 70 ++++++++++++++++++---- 1 file changed, 58 insertions(+), 12 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index d129b030c219..7e5894d80a61 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -15,8 +15,11 @@ // specific language governing permissions and limitations // under the License. use arrow::{ - array::{self, Array, ArrayRef, BinaryViewArray, StructArray}, - compute::CastOptions, + array::{ + self, Array, ArrayRef, BinaryViewArray, GenericListArray, StructArray, + UInt32Array, + }, + compute::{take, CastOptions}, datatypes::Field, error::Result, }; @@ -102,12 +105,56 @@ pub(crate) fn follow_shredded_path_element<'a>( Ok(ShreddedPathStep::Success(field.shredding_state())) } - VariantPathElement::Index { .. } => { + VariantPathElement::Index { index } => { // TODO: Support array indexing. Among other things, it will require slicing not // only the array we have here, but also the corresponding metadata and null masks. - Err(ArrowError::NotYetImplemented( - "Pathing into shredded variant array index".into(), - )) + let Some(list_array) = typed_value.as_any().downcast_ref::>()// <- shouldn't be just i64 + else { + // Downcast failure - if strict cast options are enabled, this should be an error + if !cast_options.safe { + return Err(ArrowError::CastError(format!( + "Cannot access index '{}' on non-list type: {}", + index, + typed_value.data_type() + ))); + } + // With safe cast options, return NULL (missing_path_step) + return Ok(missing_path_step()); + }; + + let offsets = list_array.offsets(); + let list_len = list_array.len(); // number of lists + let values = list_array.values(); // This is a StructArray + + let Some(struct_array) = values.as_any().downcast_ref::() else { + return Ok(missing_path_step()); + }; + + let Some(field_array) = struct_array.column_by_name("typed_value") else { + return Ok(missing_path_step()); + }; + + // Build the list of indices to take + let mut take_indices = Vec::with_capacity(list_len); + for i in 0..list_len { + let start = offsets[i] as usize; + let end = offsets[i + 1] as usize; + let len = end - start; + + if *index < len { + take_indices.push(Some((start + index) as u32)); + } else { + take_indices.push(None); + } + } + + let index_array = UInt32Array::from(take_indices); + + // Use Arrow compute kernel to gather elements + let taken = take(field_array, &index_array, None)?; + + let state = ShreddingState::try_new(None, Some(Arc::new(taken)))?; + Ok(ShreddedPathStep::Success(&state)) } } } @@ -304,10 +351,9 @@ mod test { use crate::{json_to_variant, VariantValueArrayBuilder}; use arrow::array::{ - Array, ArrayRef, BinaryViewArray, Float16Array, Float32Array, - Float64Array, GenericListArray, Int16Array, Int32Array, Int64Array, - Int8Array, StringArray, StructArray, UInt16Array, - UInt32Array, UInt64Array, UInt8Array, + Array, ArrayRef, BinaryViewArray, Float16Array, Float32Array, Float64Array, + GenericListArray, Int16Array, Int32Array, Int64Array, Int8Array, StringArray, StructArray, + UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; use arrow::buffer::{NullBuffer, OffsetBuffer}; use arrow::compute::CastOptions; @@ -1343,7 +1389,7 @@ mod test { struct_array.data_type().clone(), true, )), - OffsetBuffer::from_lengths([2,2]), + OffsetBuffer::from_lengths([2, 2]), Arc::new(struct_array), None, ); @@ -1411,7 +1457,7 @@ mod test { // Wrap the x field struct in a ShreddedVariantFieldArray let x_field_shredded = ShreddedVariantFieldArray::try_new(Arc::new(x_field_struct)) .expect("should create ShreddedVariantFieldArray"); - + // Create the main typed_value as a struct containing the "x" field let typed_value_fields = Fields::from(vec![Field::new( "x", From c0d20653716ee67b2e57f056328079d9524ea772 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Thu, 25 Sep 2025 17:09:30 -0400 Subject: [PATCH 06/36] Pass one test --- parquet-variant-compute/src/variant_get.rs | 68 ++++++++++++---------- 1 file changed, 37 insertions(+), 31 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 003ea69b9066..2fd5dc09fb25 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -15,18 +15,15 @@ // specific language governing permissions and limitations // under the License. use arrow::{ - array::{ - self, Array, ArrayRef, BinaryViewArray, GenericListArray, StructArray, - UInt32Array, - }, + array::{self, Array, ArrayRef, BinaryViewArray, GenericListArray, StructArray, UInt32Array}, compute::{take, CastOptions}, datatypes::Field, error::Result, }; use arrow_schema::{ArrowError, DataType, FieldRef}; -use parquet_variant::{VariantPath, VariantPathElement}; +use parquet_variant::{VariantPath, VariantPathElement, EMPTY_VARIANT_METADATA_BYTES}; -use crate::variant_array::ShreddingState; +use crate::variant_array::{ShreddingState, StructArrayBuilder}; use crate::variant_to_arrow::make_variant_to_arrow_row_builder; use crate::VariantArray; @@ -106,7 +103,7 @@ pub(crate) fn follow_shredded_path_element( VariantPathElement::Index { index } => { // TODO: Support array indexing. Among other things, it will require slicing not // only the array we have here, but also the corresponding metadata and null masks. - let Some(list_array) = typed_value.as_any().downcast_ref::>()// <- shouldn't be just i64 + let Some(list_array) = typed_value.as_any().downcast_ref::>() else { // Downcast failure - if strict cast options are enabled, this should be an error if !cast_options.safe { @@ -121,20 +118,15 @@ pub(crate) fn follow_shredded_path_element( }; let offsets = list_array.offsets(); - let list_len = list_array.len(); // number of lists let values = list_array.values(); // This is a StructArray let Some(struct_array) = values.as_any().downcast_ref::() else { return Ok(missing_path_step()); }; - let Some(field_array) = struct_array.column_by_name("typed_value") else { - return Ok(missing_path_step()); - }; - // Build the list of indices to take - let mut take_indices = Vec::with_capacity(list_len); - for i in 0..list_len { + let mut take_indices = Vec::with_capacity(list_array.len()); + for i in 0..list_array.len() { let start = offsets[i] as usize; let end = offsets[i + 1] as usize; let len = end - start; @@ -149,10 +141,28 @@ pub(crate) fn follow_shredded_path_element( let index_array = UInt32Array::from(take_indices); // Use Arrow compute kernel to gather elements - let taken = take(field_array, &index_array, None)?; - - let state = ShreddingState::try_new(None, Some(Arc::new(taken)))?; - Ok(ShreddedPathStep::Success(&state)) + let taken = take(struct_array, &index_array, None)?; + + let typed_array = taken + .as_any() + .downcast_ref::() + .unwrap() + .column_by_name("typed_value") + .unwrap() + .clone(); + + let metadata_array = BinaryViewArray::from_iter_values(std::iter::repeat_n( + EMPTY_VARIANT_METADATA_BYTES, + taken.len(), + )); + + let struct_array = &StructArrayBuilder::new() + .with_field("metadata", Arc::new(metadata_array), false) + .with_field("typed_value", typed_array, true) + .build(); + + // let state = ShreddingState::new(None, Some(Arc::new(taken))); + Ok(ShreddedPathStep::Success(struct_array.into())) } } } @@ -346,8 +356,9 @@ mod test { use crate::{json_to_variant, VariantValueArrayBuilder}; use arrow::array::{ Array, ArrayRef, AsArray, BinaryViewArray, BooleanArray, Date32Array, FixedSizeBinaryArray, - Float16Array, GenericListArray, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, - StringArray, StructArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, + Float16Array, Float32Array, Float64Array, GenericListArray, Int16Array, Int32Array, + Int64Array, Int8Array, StringArray, StructArray, UInt16Array, UInt32Array, UInt64Array, + UInt8Array, }; use arrow::buffer::{NullBuffer, OffsetBuffer}; use arrow::compute::CastOptions; @@ -1356,15 +1367,13 @@ mod test { /// like ["comedy", "drama"] and ["horror", 123] /// as VariantArray using variant_get. #[test] - fn test_shredded_list_field_access() { + fn test_shredded_list_index_access() { let array = shredded_list_variant_array(); - // Test: Extract the 0 index field as VariantArray first let options = GetOptions::new_with_path(VariantPath::from(0)); let result = variant_get(&array, options).unwrap(); - - let result_variant: &VariantArray = result.as_any().downcast_ref().unwrap(); - assert_eq!(result_variant.len(), 3); + let result_variant = VariantArray::try_new(&result).unwrap(); + assert_eq!(result_variant.len(), 2); // Row 0: expect 0 index = "comedy" assert_eq!(result_variant.value(0), Variant::from("comedy")); @@ -1375,16 +1384,13 @@ mod test { #[test] fn test_shredded_list_as_string() { let array = shredded_list_variant_array(); - // Test: Extract the 0 index values as StringArray (type conversion) let field = Field::new("typed_value", DataType::Utf8, false); let options = GetOptions::new_with_path(VariantPath::from(0)) .with_as_type(Some(FieldRef::from(field))); let result = variant_get(&array, options).unwrap(); - // Should get StringArray - let expected: ArrayRef = - Arc::new(StringArray::from(vec![Some("comedy"), None, Some("drama")])); + let expected: ArrayRef = Arc::new(StringArray::from(vec![Some("comedy"), Some("horror")])); assert_eq!(&result, &expected); } /// Helper function to create a shredded variant array representing lists @@ -1445,7 +1451,7 @@ mod test { .with_field("typed_value", Arc::new(typed_value_array), true) .build(); - Arc::new(VariantArray::try_new(Arc::new(main_struct)).expect("should create variant array")) + Arc::new(main_struct) } /// Helper function to create a shredded variant array representing objects /// @@ -1501,7 +1507,7 @@ mod test { // Wrap the x field struct in a ShreddedVariantFieldArray let x_field_shredded = ShreddedVariantFieldArray::try_new(&x_field_struct) .expect("should create ShreddedVariantFieldArray"); - + // Create the main typed_value as a struct containing the "x" field let typed_value_fields = Fields::from(vec![Field::new( "x", From 40b631181fba421d6a3e038abc16d8bb7a7d94ca Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Thu, 25 Sep 2025 19:11:07 -0400 Subject: [PATCH 07/36] Get typed values directly --- parquet-variant-compute/src/variant_get.rs | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 136a2d515054..d664bb4e4707 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -124,6 +124,10 @@ pub(crate) fn follow_shredded_path_element( return Ok(missing_path_step()); }; + let Some(typed_array) = struct_array.column_by_name("typed_value") else { + return Ok(missing_path_step()) + }; + // Build the list of indices to take let mut take_indices = Vec::with_capacity(list_array.len()); for i in 0..list_array.len() { @@ -141,15 +145,7 @@ pub(crate) fn follow_shredded_path_element( let index_array = UInt32Array::from(take_indices); // Use Arrow compute kernel to gather elements - let taken = take(struct_array, &index_array, None)?; - - let typed_array = taken - .as_any() - .downcast_ref::() - .unwrap() - .column_by_name("typed_value") - .unwrap() - .clone(); + let taken = take(typed_array, &index_array, None)?; let metadata_array = BinaryViewArray::from_iter_values(std::iter::repeat_n( EMPTY_VARIANT_METADATA_BYTES, @@ -158,10 +154,9 @@ pub(crate) fn follow_shredded_path_element( let struct_array = &StructArrayBuilder::new() .with_field("metadata", Arc::new(metadata_array), false) - .with_field("typed_value", typed_array, true) + .with_field("typed_value", taken, true) .build(); - // let state = ShreddingState::new(None, Some(Arc::new(taken))); Ok(ShreddedPathStep::Success(struct_array.into())) } } From f6e88ef2d2efbedb0fd4a0a5aa36b02ee955c857 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Mon, 13 Oct 2025 14:57:51 -0400 Subject: [PATCH 08/36] Added support for utf8, largeUtf8, utf8view --- parquet-variant-compute/src/variant_get.rs | 1 + .../src/variant_to_arrow.rs | 105 +++++++++++++++++- 2 files changed, 104 insertions(+), 2 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 8ee489cfe583..061d5ad0eef1 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -138,6 +138,7 @@ fn shredded_get_path( as_type, cast_options, target.len(), + target.inner().get_buffer_memory_size(), )?; for i in 0..target.len() { if target.is_null(i) { diff --git a/parquet-variant-compute/src/variant_to_arrow.rs b/parquet-variant-compute/src/variant_to_arrow.rs index d60a4eea05c0..14f45d3c8d50 100644 --- a/parquet-variant-compute/src/variant_to_arrow.rs +++ b/parquet-variant-compute/src/variant_to_arrow.rs @@ -16,7 +16,8 @@ // under the License. use arrow::array::{ - ArrayRef, BinaryViewArray, NullBufferBuilder, PrimitiveBuilder, builder::BooleanBuilder, + ArrayRef, BinaryViewArray, LargeStringBuilder, NullBufferBuilder, PrimitiveBuilder, + StringBuilder, StringViewBuilder, builder::BooleanBuilder, }; use arrow::compute::CastOptions; use arrow::datatypes::{self, ArrowPrimitiveType, DataType}; @@ -52,6 +53,12 @@ pub(crate) enum PrimitiveVariantToArrowRowBuilder<'a> { TimestampNano(VariantToTimestampArrowRowBuilder<'a, datatypes::TimestampNanosecondType>), TimestampNanoNtz(VariantToTimestampNtzArrowRowBuilder<'a, datatypes::TimestampNanosecondType>), Date(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Date32Type>), + StringView(VariantToUtf8ViewArrowBuilder<'a>), +} + +pub(crate) enum StringVariantToArrowRowBuilder<'a> { + Utf8(VariantToUtf8ArrowRowBuilder<'a>), + LargeUtf8(VariantToLargeUtf8ArrowBuilder<'a>), } /// Builder for converting variant values into strongly typed Arrow arrays. @@ -61,7 +68,7 @@ pub(crate) enum PrimitiveVariantToArrowRowBuilder<'a> { pub(crate) enum VariantToArrowRowBuilder<'a> { Primitive(PrimitiveVariantToArrowRowBuilder<'a>), BinaryVariant(VariantToBinaryVariantArrowRowBuilder), - + String(StringVariantToArrowRowBuilder<'a>), // Path extraction wrapper - contains a boxed enum for any of the above WithPath(VariantPathRowBuilder<'a>), } @@ -87,6 +94,7 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> { TimestampNano(b) => b.append_null(), TimestampNanoNtz(b) => b.append_null(), Date(b) => b.append_null(), + StringView(b) => b.append_null(), } } @@ -110,6 +118,7 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> { TimestampNano(b) => b.append_value(value), TimestampNanoNtz(b) => b.append_value(value), Date(b) => b.append_value(value), + StringView(b) => b.append_value(value), } } @@ -133,6 +142,33 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> { TimestampNano(b) => b.finish(), TimestampNanoNtz(b) => b.finish(), Date(b) => b.finish(), + StringView(b) => b.finish(), + } + } +} + +impl<'a> StringVariantToArrowRowBuilder<'a> { + pub fn append_null(&mut self) -> Result<()> { + use StringVariantToArrowRowBuilder::*; + match self { + Utf8(b) => b.append_null(), + LargeUtf8(b) => b.append_null(), + } + } + + pub fn append_value(&mut self, value: &Variant<'_, '_>) -> Result { + use StringVariantToArrowRowBuilder::*; + match self { + Utf8(b) => b.append_value(value), + LargeUtf8(b) => b.append_value(value), + } + } + + pub fn finish(self) -> Result { + use StringVariantToArrowRowBuilder::*; + match self { + Utf8(b) => b.finish(), + LargeUtf8(b) => b.finish(), } } } @@ -142,6 +178,7 @@ impl<'a> VariantToArrowRowBuilder<'a> { use VariantToArrowRowBuilder::*; match self { Primitive(b) => b.append_null(), + String(b) => b.append_null(), BinaryVariant(b) => b.append_null(), WithPath(path_builder) => path_builder.append_null(), } @@ -151,6 +188,7 @@ impl<'a> VariantToArrowRowBuilder<'a> { use VariantToArrowRowBuilder::*; match self { Primitive(b) => b.append_value(&value), + String(b) => b.append_value(&value), BinaryVariant(b) => b.append_value(value), WithPath(path_builder) => path_builder.append_value(value), } @@ -160,6 +198,7 @@ impl<'a> VariantToArrowRowBuilder<'a> { use VariantToArrowRowBuilder::*; match self { Primitive(b) => b.finish(), + String(b) => b.finish(), BinaryVariant(b) => b.finish(), WithPath(path_builder) => path_builder.finish(), } @@ -236,6 +275,9 @@ pub(crate) fn make_primitive_variant_to_arrow_row_builder<'a>( cast_options, capacity, )), + DataType::Utf8View => { + StringView(VariantToUtf8ViewArrowBuilder::new(cast_options, capacity)) + } _ if data_type.is_primitive() => { return Err(ArrowError::NotYetImplemented(format!( "Primitive data_type {data_type:?} not yet implemented" @@ -250,12 +292,41 @@ pub(crate) fn make_primitive_variant_to_arrow_row_builder<'a>( Ok(builder) } +pub(crate) fn make_string_variant_to_arrow_row_builder<'a>( + data_type: &'a DataType, + cast_options: &'a CastOptions, + item_capacity: usize, + data_capacity: usize, +) -> Result> { + use StringVariantToArrowRowBuilder::{LargeUtf8, Utf8}; + + let builder = match data_type { + DataType::Utf8 => Utf8(VariantToUtf8ArrowRowBuilder::new( + cast_options, + item_capacity, + data_capacity, + )), + DataType::LargeUtf8 => LargeUtf8(VariantToLargeUtf8ArrowBuilder::new( + cast_options, + item_capacity, + data_capacity, + )), + _ => { + return Err(ArrowError::InvalidArgumentError(format!( + "Not a string type: {data_type:?}" + ))); + } + }; + Ok(builder) +} + pub(crate) fn make_variant_to_arrow_row_builder<'a>( metadata: &BinaryViewArray, path: VariantPath<'a>, data_type: Option<&'a DataType>, cast_options: &'a CastOptions, capacity: usize, + data_capacity: usize, ) -> Result> { use VariantToArrowRowBuilder::*; @@ -265,6 +336,15 @@ pub(crate) fn make_variant_to_arrow_row_builder<'a>( metadata.clone(), capacity, )), + Some(dt @ (DataType::Utf8 | DataType::LargeUtf8)) => { + let builder = make_string_variant_to_arrow_row_builder( + dt, + cast_options, + capacity, + data_capacity, + )?; + String(builder) + } Some(DataType::Struct(_)) => { return Err(ArrowError::NotYetImplemented( "Converting unshredded variant objects to arrow structs".to_string(), @@ -401,6 +481,27 @@ macro_rules! define_variant_to_primitive_builder { } } +define_variant_to_primitive_builder!( + struct VariantToUtf8ArrowRowBuilder<'a> + |item_capacity, data_capacity: usize| -> StringBuilder {StringBuilder::with_capacity(item_capacity, data_capacity)}, + |value| value.as_string(), + type_name: "Utf8" +); + +define_variant_to_primitive_builder!( + struct VariantToLargeUtf8ArrowBuilder<'a> + |item_capacity, data_capacity: usize| -> LargeStringBuilder {LargeStringBuilder::with_capacity(item_capacity, data_capacity)}, + |value| value.as_string(), + type_name: "LargeUtf8" +); + +define_variant_to_primitive_builder!( + struct VariantToUtf8ViewArrowBuilder<'a> + |capacity| -> StringViewBuilder {StringViewBuilder::with_capacity(capacity)}, + |value| value.as_string(), + type_name: "Utf8View" +); + define_variant_to_primitive_builder!( struct VariantToBooleanArrowRowBuilder<'a> |capacity| -> BooleanBuilder { BooleanBuilder::with_capacity(capacity) }, From 61ed1783396a9bd4aabbc1a9cfd00538eb7a8ec3 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Mon, 13 Oct 2025 16:47:39 -0400 Subject: [PATCH 09/36] added tests for utf8, largeUtf8, utf8view --- parquet-variant-compute/src/variant_get.rs | 11 +++ .../src/variant_to_arrow.rs | 81 +++++++++++++++++++ 2 files changed, 92 insertions(+) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 061d5ad0eef1..c98f4b75d74a 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -764,6 +764,13 @@ mod test { BooleanArray::from(vec![Some(true), Some(false), Some(true)]) ); + perfectly_shredded_to_arrow_primitive_test!( + get_variant_perfectly_shredded_utf8_as_utf8, + DataType::Utf8, + perfectly_shredded_utf8_variant_array, + StringArray::from(vec![Some("foo"), Some("bar"), Some("baz")]) + ); + macro_rules! perfectly_shredded_variant_array_fn { ($func:ident, $typed_value_gen:expr) => { fn $func() -> ArrayRef { @@ -787,6 +794,10 @@ mod test { }; } + perfectly_shredded_variant_array_fn!(perfectly_shredded_utf8_variant_array, || { + StringArray::from(vec![Some("foo"), Some("baz"), Some("bar")]) + }); + perfectly_shredded_variant_array_fn!(perfectly_shredded_bool_variant_array, || { BooleanArray::from(vec![Some(true), Some(false), Some(true)]) }); diff --git a/parquet-variant-compute/src/variant_to_arrow.rs b/parquet-variant-compute/src/variant_to_arrow.rs index 14f45d3c8d50..1ab194b905f6 100644 --- a/parquet-variant-compute/src/variant_to_arrow.rs +++ b/parquet-variant-compute/src/variant_to_arrow.rs @@ -573,3 +573,84 @@ impl VariantToBinaryVariantArrowRowBuilder { Ok(ArrayRef::from(variant_array)) } } + +#[cfg(test)] +mod test { + use crate::variant_to_arrow::make_variant_to_arrow_row_builder; + use arrow::array::{Array, BinaryViewArray, LargeStringArray, StringViewArray}; + use arrow::compute::CastOptions; + use arrow::datatypes::DataType; + use parquet_variant::EMPTY_VARIANT_METADATA_BYTES; + use parquet_variant::{Variant, VariantPath}; + use std::sync::Arc; + + #[test] + fn test_large_utf8_variant_to_large_utf8_arrow() { + let metadata = + BinaryViewArray::from_iter_values(std::iter::repeat_n(EMPTY_VARIANT_METADATA_BYTES, 3)); + let cast_options = CastOptions::default(); + let path = VariantPath::default(); + let capacity = 3; + let data_capacity = 9usize; + let data_type = Some(&DataType::LargeUtf8); + + let mut builder = make_variant_to_arrow_row_builder( + &metadata, + path, + data_type, + &cast_options, + capacity, + data_capacity, + ) + .unwrap(); + + builder.append_value(Variant::from("foo")).unwrap(); + builder.append_value(Variant::from("bar")).unwrap(); + builder.append_value(Variant::from("baz")).unwrap(); + + let output = builder.finish().unwrap(); + + let expected = Arc::new(LargeStringArray::from(vec![ + Some("foo"), + Some("bar"), + Some("baz"), + ])); + + assert_eq!(&output.to_data(), &expected.to_data()); + } + + #[test] + fn test_utf8_view_variant_to_utf8_view_arrow() { + let metadata = + BinaryViewArray::from_iter_values(std::iter::repeat_n(EMPTY_VARIANT_METADATA_BYTES, 3)); + let cast_options = CastOptions::default(); + let path = VariantPath::default(); + let capacity = 3; + let data_capacity = 9usize; + let data_type = Some(&DataType::Utf8View); + + let mut builder = make_variant_to_arrow_row_builder( + &metadata, + path, + data_type, + &cast_options, + capacity, + data_capacity, + ) + .unwrap(); + + builder.append_value(Variant::from("foo")).unwrap(); + builder.append_value(Variant::from("bar")).unwrap(); + builder.append_value(Variant::from("baz")).unwrap(); + + let output = builder.finish().unwrap(); + + let expected = Arc::new(StringViewArray::from(vec![ + Some("foo"), + Some("bar"), + Some("baz"), + ])); + + assert_eq!(&output.to_data(), &expected.to_data()); + } +} From 1fb612d4107c23947cb514b2f4dc5136b68a7caf Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Mon, 13 Oct 2025 17:05:17 -0400 Subject: [PATCH 10/36] fix tests --- parquet-variant-compute/src/variant_get.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index c98f4b75d74a..66dfa3958729 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -795,7 +795,7 @@ mod test { } perfectly_shredded_variant_array_fn!(perfectly_shredded_utf8_variant_array, || { - StringArray::from(vec![Some("foo"), Some("baz"), Some("bar")]) + StringArray::from(vec![Some("foo"), Some("bar"), Some("baz")]) }); perfectly_shredded_variant_array_fn!(perfectly_shredded_bool_variant_array, || { From 2b6d280fb0439ff2bc6fbf5892e2f6f98d94df16 Mon Sep 17 00:00:00 2001 From: Kosta Tarasov <33369833+sdf-jkl@users.noreply.github.com> Date: Tue, 14 Oct 2025 09:59:43 -0400 Subject: [PATCH 11/36] Update parquet-variant-compute/src/variant_to_arrow.rs Co-authored-by: Congxian Qiu --- parquet-variant-compute/src/variant_to_arrow.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet-variant-compute/src/variant_to_arrow.rs b/parquet-variant-compute/src/variant_to_arrow.rs index 1ab194b905f6..9120335b537a 100644 --- a/parquet-variant-compute/src/variant_to_arrow.rs +++ b/parquet-variant-compute/src/variant_to_arrow.rs @@ -483,7 +483,7 @@ macro_rules! define_variant_to_primitive_builder { define_variant_to_primitive_builder!( struct VariantToUtf8ArrowRowBuilder<'a> - |item_capacity, data_capacity: usize| -> StringBuilder {StringBuilder::with_capacity(item_capacity, data_capacity)}, + |item_capacity, data_capacity: usize| -> StringBuilder { StringBuilder::with_capacity(item_capacity, data_capacity) }, |value| value.as_string(), type_name: "Utf8" ); From defa07bce0f647f109f8dab04041f6bb2676cd59 Mon Sep 17 00:00:00 2001 From: Kosta Tarasov <33369833+sdf-jkl@users.noreply.github.com> Date: Mon, 20 Oct 2025 10:10:17 -0400 Subject: [PATCH 12/36] Update parquet-variant-compute/src/variant_to_arrow.rs Co-authored-by: Ryan Johnson --- .../src/variant_to_arrow.rs | 20 +++---------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/parquet-variant-compute/src/variant_to_arrow.rs b/parquet-variant-compute/src/variant_to_arrow.rs index 9120335b537a..433d8af1984d 100644 --- a/parquet-variant-compute/src/variant_to_arrow.rs +++ b/parquet-variant-compute/src/variant_to_arrow.rs @@ -482,24 +482,10 @@ macro_rules! define_variant_to_primitive_builder { } define_variant_to_primitive_builder!( - struct VariantToUtf8ArrowRowBuilder<'a> - |item_capacity, data_capacity: usize| -> StringBuilder { StringBuilder::with_capacity(item_capacity, data_capacity) }, + struct VariantToStringArrowBuilder<'a, B: StringLikeArrayBuilder> + |capacity| -> B { B::with_capacity(capacity) }, |value| value.as_string(), - type_name: "Utf8" -); - -define_variant_to_primitive_builder!( - struct VariantToLargeUtf8ArrowBuilder<'a> - |item_capacity, data_capacity: usize| -> LargeStringBuilder {LargeStringBuilder::with_capacity(item_capacity, data_capacity)}, - |value| value.as_string(), - type_name: "LargeUtf8" -); - -define_variant_to_primitive_builder!( - struct VariantToUtf8ViewArrowBuilder<'a> - |capacity| -> StringViewBuilder {StringViewBuilder::with_capacity(capacity)}, - |value| value.as_string(), - type_name: "Utf8View" + type_name: B::type_name() ); define_variant_to_primitive_builder!( From 5022acd24a333c76ac2f978a74bbd74f2605702a Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Mon, 20 Oct 2025 12:49:25 -0400 Subject: [PATCH 13/36] Support LargeUtf8, Utf8-View --- .../src/builder/generic_bytes_builder.rs | 34 ++++ .../src/builder/generic_bytes_view_builder.rs | 17 +- parquet-variant-compute/src/variant_array.rs | 14 +- parquet-variant-compute/src/variant_get.rs | 25 ++- .../src/variant_to_arrow.rs | 175 ++---------------- 5 files changed, 102 insertions(+), 163 deletions(-) diff --git a/arrow-array/src/builder/generic_bytes_builder.rs b/arrow-array/src/builder/generic_bytes_builder.rs index 913a440ca747..5a482f4c254d 100644 --- a/arrow-array/src/builder/generic_bytes_builder.rs +++ b/arrow-array/src/builder/generic_bytes_builder.rs @@ -348,6 +348,40 @@ impl std::fmt::Write for GenericStringBuilder { } } +const AVERAGE_STRING_LENGTH: usize = 16; +/// Trait for string-like array builders +/// +/// This trait provides unified interface for builders that append string-like data +/// such as [`GenericStringBuilder`] and [`crate::builder::StringViewBuilder`] +pub trait StringLikeArrayBuilder: ArrayBuilder { + /// Returns a human-readable type name for the builder. + fn type_name() -> &'static str; + + /// Creates a new builder with the given row capacity. + fn with_capacity(capacity: usize) -> Self; + + /// Appends a non-null string value to the builder. + fn append_value(&mut self, value: &str); + + /// Appends a null value to the builder. + fn append_null(&mut self); +} + +impl StringLikeArrayBuilder for GenericStringBuilder { + fn type_name() -> &'static str { + std::any::type_name::>() + } + fn with_capacity(capacity: usize) -> Self { + Self::with_capacity(capacity, capacity * AVERAGE_STRING_LENGTH) + } + fn append_value(&mut self, value: &str) { + Self::append_value(self, value); + } + fn append_null(&mut self) { + Self::append_null(self); + } +} + /// Array builder for [`GenericBinaryArray`][crate::GenericBinaryArray] /// /// Values can be appended using [`GenericByteBuilder::append_value`], and nulls with diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs b/arrow-array/src/builder/generic_bytes_view_builder.rs index 2cdc3bedfb79..289492aafee9 100644 --- a/arrow-array/src/builder/generic_bytes_view_builder.rs +++ b/arrow-array/src/builder/generic_bytes_view_builder.rs @@ -25,7 +25,7 @@ use arrow_schema::ArrowError; use hashbrown::HashTable; use hashbrown::hash_table::Entry; -use crate::builder::ArrayBuilder; +use crate::builder::{ArrayBuilder, StringLikeArrayBuilder}; use crate::types::bytes::ByteArrayNativeType; use crate::types::{BinaryViewType, ByteViewType, StringViewType}; use crate::{Array, ArrayRef, GenericByteViewArray}; @@ -507,6 +507,21 @@ impl> Extend> /// ``` pub type StringViewBuilder = GenericByteViewBuilder; +impl StringLikeArrayBuilder for StringViewBuilder { + fn type_name() -> &'static str { + std::any::type_name::() + } + fn with_capacity(capacity: usize) -> Self { + Self::with_capacity(capacity) + } + fn append_value(&mut self, value: &str) { + Self::append_value(self, value); + } + fn append_null(&mut self) { + Self::append_null(self); + } +} + /// Array builder for [`BinaryViewArray`][crate::BinaryViewArray] /// /// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 5686d102d3fd..f378f3b16bd1 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -844,6 +844,16 @@ fn typed_value_to_variant<'a>( let value = array.value(index); Variant::from(value) } + DataType::LargeUtf8 => { + let array = typed_value.as_string::(); + let value = array.value(index); + Variant::from(value) + } + DataType::Utf8View => { + let array = typed_value.as_string_view(); + let value = array.value(index); + Variant::from(value) + } DataType::Int8 => { primitive_conversion_single_value!(Int8Type, typed_value, index) } @@ -1007,14 +1017,14 @@ fn canonicalize_and_verify_data_type( // Binary and string are allowed. Force Binary to BinaryView because that's what the parquet // reader returns and what the rest of the variant code expects. Binary => Cow::Owned(DataType::BinaryView), - BinaryView | Utf8 => borrow!(), + BinaryView | Utf8 | LargeUtf8 | Utf8View => borrow!(), // UUID maps to 16-byte fixed-size binary; no other width is allowed FixedSizeBinary(16) => borrow!(), FixedSizeBinary(_) | FixedSizeList(..) => fail!(), // We can _possibly_ allow (some of) these some day? - LargeBinary | LargeUtf8 | Utf8View | ListView(_) | LargeList(_) | LargeListView(_) => { + LargeBinary | ListView(_) | LargeList(_) | LargeListView(_) => { fail!() } diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 66dfa3958729..ae1feec0bfcc 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -138,7 +138,6 @@ fn shredded_get_path( as_type, cast_options, target.len(), - target.inner().get_buffer_memory_size(), )?; for i in 0..target.len() { if target.is_null(i) { @@ -300,6 +299,8 @@ mod test { use crate::VariantArray; use crate::json_to_variant; use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder}; + use arrow::array::LargeStringArray; + use arrow::array::StringViewArray; use arrow::array::{ Array, ArrayRef, AsArray, BinaryViewArray, BooleanArray, Date32Array, Float32Array, Float64Array, Int8Array, Int16Array, Int32Array, Int64Array, StringArray, StructArray, @@ -771,6 +772,20 @@ mod test { StringArray::from(vec![Some("foo"), Some("bar"), Some("baz")]) ); + perfectly_shredded_to_arrow_primitive_test!( + get_variant_perfectly_shredded_large_utf8_as_utf8, + DataType::Utf8, + perfectly_shredded_large_utf8_variant_array, + StringArray::from(vec![Some("foo"), Some("bar"), Some("baz")]) + ); + + perfectly_shredded_to_arrow_primitive_test!( + get_variant_perfectly_shredded_utf8_view_as_utf8, + DataType::Utf8, + perfectly_shredded_utf8_view_variant_array, + StringArray::from(vec![Some("foo"), Some("bar"), Some("baz")]) + ); + macro_rules! perfectly_shredded_variant_array_fn { ($func:ident, $typed_value_gen:expr) => { fn $func() -> ArrayRef { @@ -798,6 +813,14 @@ mod test { StringArray::from(vec![Some("foo"), Some("bar"), Some("baz")]) }); + perfectly_shredded_variant_array_fn!(perfectly_shredded_large_utf8_variant_array, || { + LargeStringArray::from(vec![Some("foo"), Some("bar"), Some("baz")]) + }); + + perfectly_shredded_variant_array_fn!(perfectly_shredded_utf8_view_variant_array, || { + StringViewArray::from(vec![Some("foo"), Some("bar"), Some("baz")]) + }); + perfectly_shredded_variant_array_fn!(perfectly_shredded_bool_variant_array, || { BooleanArray::from(vec![Some(true), Some(false), Some(true)]) }); diff --git a/parquet-variant-compute/src/variant_to_arrow.rs b/parquet-variant-compute/src/variant_to_arrow.rs index 433d8af1984d..ae8a7ef87975 100644 --- a/parquet-variant-compute/src/variant_to_arrow.rs +++ b/parquet-variant-compute/src/variant_to_arrow.rs @@ -16,9 +16,10 @@ // under the License. use arrow::array::{ - ArrayRef, BinaryViewArray, LargeStringBuilder, NullBufferBuilder, PrimitiveBuilder, - StringBuilder, StringViewBuilder, builder::BooleanBuilder, + ArrayRef, BinaryViewArray, NullBufferBuilder, PrimitiveBuilder, StringLikeArrayBuilder, + builder::BooleanBuilder, }; +use arrow::array::{LargeStringBuilder, StringBuilder, StringViewBuilder}; use arrow::compute::CastOptions; use arrow::datatypes::{self, ArrowPrimitiveType, DataType}; use arrow::error::{ArrowError, Result}; @@ -53,12 +54,9 @@ pub(crate) enum PrimitiveVariantToArrowRowBuilder<'a> { TimestampNano(VariantToTimestampArrowRowBuilder<'a, datatypes::TimestampNanosecondType>), TimestampNanoNtz(VariantToTimestampNtzArrowRowBuilder<'a, datatypes::TimestampNanosecondType>), Date(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Date32Type>), - StringView(VariantToUtf8ViewArrowBuilder<'a>), -} - -pub(crate) enum StringVariantToArrowRowBuilder<'a> { - Utf8(VariantToUtf8ArrowRowBuilder<'a>), - LargeUtf8(VariantToLargeUtf8ArrowBuilder<'a>), + String(VariantToStringArrowBuilder<'a, StringBuilder>), + LargeString(VariantToStringArrowBuilder<'a, LargeStringBuilder>), + StringView(VariantToStringArrowBuilder<'a, StringViewBuilder>), } /// Builder for converting variant values into strongly typed Arrow arrays. @@ -68,7 +66,6 @@ pub(crate) enum StringVariantToArrowRowBuilder<'a> { pub(crate) enum VariantToArrowRowBuilder<'a> { Primitive(PrimitiveVariantToArrowRowBuilder<'a>), BinaryVariant(VariantToBinaryVariantArrowRowBuilder), - String(StringVariantToArrowRowBuilder<'a>), // Path extraction wrapper - contains a boxed enum for any of the above WithPath(VariantPathRowBuilder<'a>), } @@ -94,6 +91,8 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> { TimestampNano(b) => b.append_null(), TimestampNanoNtz(b) => b.append_null(), Date(b) => b.append_null(), + String(b) => b.append_null(), + LargeString(b) => b.append_null(), StringView(b) => b.append_null(), } } @@ -118,6 +117,8 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> { TimestampNano(b) => b.append_value(value), TimestampNanoNtz(b) => b.append_value(value), Date(b) => b.append_value(value), + String(b) => b.append_value(value), + LargeString(b) => b.append_value(value), StringView(b) => b.append_value(value), } } @@ -142,43 +143,18 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> { TimestampNano(b) => b.finish(), TimestampNanoNtz(b) => b.finish(), Date(b) => b.finish(), + String(b) => b.finish(), + LargeString(b) => b.finish(), StringView(b) => b.finish(), } } } -impl<'a> StringVariantToArrowRowBuilder<'a> { - pub fn append_null(&mut self) -> Result<()> { - use StringVariantToArrowRowBuilder::*; - match self { - Utf8(b) => b.append_null(), - LargeUtf8(b) => b.append_null(), - } - } - - pub fn append_value(&mut self, value: &Variant<'_, '_>) -> Result { - use StringVariantToArrowRowBuilder::*; - match self { - Utf8(b) => b.append_value(value), - LargeUtf8(b) => b.append_value(value), - } - } - - pub fn finish(self) -> Result { - use StringVariantToArrowRowBuilder::*; - match self { - Utf8(b) => b.finish(), - LargeUtf8(b) => b.finish(), - } - } -} - impl<'a> VariantToArrowRowBuilder<'a> { pub fn append_null(&mut self) -> Result<()> { use VariantToArrowRowBuilder::*; match self { Primitive(b) => b.append_null(), - String(b) => b.append_null(), BinaryVariant(b) => b.append_null(), WithPath(path_builder) => path_builder.append_null(), } @@ -188,7 +164,6 @@ impl<'a> VariantToArrowRowBuilder<'a> { use VariantToArrowRowBuilder::*; match self { Primitive(b) => b.append_value(&value), - String(b) => b.append_value(&value), BinaryVariant(b) => b.append_value(value), WithPath(path_builder) => path_builder.append_value(value), } @@ -198,7 +173,6 @@ impl<'a> VariantToArrowRowBuilder<'a> { use VariantToArrowRowBuilder::*; match self { Primitive(b) => b.finish(), - String(b) => b.finish(), BinaryVariant(b) => b.finish(), WithPath(path_builder) => path_builder.finish(), } @@ -275,9 +249,11 @@ pub(crate) fn make_primitive_variant_to_arrow_row_builder<'a>( cast_options, capacity, )), - DataType::Utf8View => { - StringView(VariantToUtf8ViewArrowBuilder::new(cast_options, capacity)) + DataType::Utf8 => String(VariantToStringArrowBuilder::new(cast_options, capacity)), + DataType::LargeUtf8 => { + LargeString(VariantToStringArrowBuilder::new(cast_options, capacity)) } + DataType::Utf8View => StringView(VariantToStringArrowBuilder::new(cast_options, capacity)), _ if data_type.is_primitive() => { return Err(ArrowError::NotYetImplemented(format!( "Primitive data_type {data_type:?} not yet implemented" @@ -292,41 +268,12 @@ pub(crate) fn make_primitive_variant_to_arrow_row_builder<'a>( Ok(builder) } -pub(crate) fn make_string_variant_to_arrow_row_builder<'a>( - data_type: &'a DataType, - cast_options: &'a CastOptions, - item_capacity: usize, - data_capacity: usize, -) -> Result> { - use StringVariantToArrowRowBuilder::{LargeUtf8, Utf8}; - - let builder = match data_type { - DataType::Utf8 => Utf8(VariantToUtf8ArrowRowBuilder::new( - cast_options, - item_capacity, - data_capacity, - )), - DataType::LargeUtf8 => LargeUtf8(VariantToLargeUtf8ArrowBuilder::new( - cast_options, - item_capacity, - data_capacity, - )), - _ => { - return Err(ArrowError::InvalidArgumentError(format!( - "Not a string type: {data_type:?}" - ))); - } - }; - Ok(builder) -} - pub(crate) fn make_variant_to_arrow_row_builder<'a>( metadata: &BinaryViewArray, path: VariantPath<'a>, data_type: Option<&'a DataType>, cast_options: &'a CastOptions, capacity: usize, - data_capacity: usize, ) -> Result> { use VariantToArrowRowBuilder::*; @@ -336,15 +283,6 @@ pub(crate) fn make_variant_to_arrow_row_builder<'a>( metadata.clone(), capacity, )), - Some(dt @ (DataType::Utf8 | DataType::LargeUtf8)) => { - let builder = make_string_variant_to_arrow_row_builder( - dt, - cast_options, - capacity, - data_capacity, - )?; - String(builder) - } Some(DataType::Struct(_)) => { return Err(ArrowError::NotYetImplemented( "Converting unshredded variant objects to arrow structs".to_string(), @@ -559,84 +497,3 @@ impl VariantToBinaryVariantArrowRowBuilder { Ok(ArrayRef::from(variant_array)) } } - -#[cfg(test)] -mod test { - use crate::variant_to_arrow::make_variant_to_arrow_row_builder; - use arrow::array::{Array, BinaryViewArray, LargeStringArray, StringViewArray}; - use arrow::compute::CastOptions; - use arrow::datatypes::DataType; - use parquet_variant::EMPTY_VARIANT_METADATA_BYTES; - use parquet_variant::{Variant, VariantPath}; - use std::sync::Arc; - - #[test] - fn test_large_utf8_variant_to_large_utf8_arrow() { - let metadata = - BinaryViewArray::from_iter_values(std::iter::repeat_n(EMPTY_VARIANT_METADATA_BYTES, 3)); - let cast_options = CastOptions::default(); - let path = VariantPath::default(); - let capacity = 3; - let data_capacity = 9usize; - let data_type = Some(&DataType::LargeUtf8); - - let mut builder = make_variant_to_arrow_row_builder( - &metadata, - path, - data_type, - &cast_options, - capacity, - data_capacity, - ) - .unwrap(); - - builder.append_value(Variant::from("foo")).unwrap(); - builder.append_value(Variant::from("bar")).unwrap(); - builder.append_value(Variant::from("baz")).unwrap(); - - let output = builder.finish().unwrap(); - - let expected = Arc::new(LargeStringArray::from(vec![ - Some("foo"), - Some("bar"), - Some("baz"), - ])); - - assert_eq!(&output.to_data(), &expected.to_data()); - } - - #[test] - fn test_utf8_view_variant_to_utf8_view_arrow() { - let metadata = - BinaryViewArray::from_iter_values(std::iter::repeat_n(EMPTY_VARIANT_METADATA_BYTES, 3)); - let cast_options = CastOptions::default(); - let path = VariantPath::default(); - let capacity = 3; - let data_capacity = 9usize; - let data_type = Some(&DataType::Utf8View); - - let mut builder = make_variant_to_arrow_row_builder( - &metadata, - path, - data_type, - &cast_options, - capacity, - data_capacity, - ) - .unwrap(); - - builder.append_value(Variant::from("foo")).unwrap(); - builder.append_value(Variant::from("bar")).unwrap(); - builder.append_value(Variant::from("baz")).unwrap(); - - let output = builder.finish().unwrap(); - - let expected = Arc::new(StringViewArray::from(vec![ - Some("foo"), - Some("bar"), - Some("baz"), - ])); - - assert_eq!(&output.to_data(), &expected.to_data()); - } -} From 196b5d4d64a9234f2d7334e029eb56ff41c14027 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Mon, 20 Oct 2025 13:02:54 -0400 Subject: [PATCH 14/36] Fix Merge errors --- parquet-variant-compute/src/variant_get.rs | 3 +- .../src/variant_to_arrow.rs | 89 ++----------------- 2 files changed, 11 insertions(+), 81 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 3c790bf6884c..0f08496fd767 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -299,7 +299,8 @@ mod test { use arrow::array::{ Array, ArrayRef, AsArray, BinaryViewArray, BooleanArray, Date32Array, Decimal32Array, Decimal64Array, Decimal128Array, Decimal256Array, Float32Array, Float64Array, Int8Array, - Int16Array, Int32Array, Int64Array, StringArray, LargeStringArray, StringViewArray, StructArray, + Int16Array, Int32Array, Int64Array, LargeStringArray, StringArray, StringViewArray, + StructArray, }; use arrow::buffer::NullBuffer; use arrow::compute::CastOptions; diff --git a/parquet-variant-compute/src/variant_to_arrow.rs b/parquet-variant-compute/src/variant_to_arrow.rs index ea03c1e63d95..8d30a598ddf0 100644 --- a/parquet-variant-compute/src/variant_to_arrow.rs +++ b/parquet-variant-compute/src/variant_to_arrow.rs @@ -16,8 +16,8 @@ // under the License. use arrow::array::{ - ArrayRef, BinaryViewArray, NullBufferBuilder, PrimitiveBuilder, StringLikeArrayBuilder, - builder::BooleanBuilder, LargeStringBuilder, StringBuilder, StringViewBuilder + ArrayRef, BinaryViewArray, LargeStringBuilder, NullBufferBuilder, PrimitiveBuilder, + StringBuilder, StringLikeArrayBuilder, StringViewBuilder, builder::BooleanBuilder, }; use arrow::compute::{CastOptions, DecimalCast}; use arrow::datatypes::{self, DataType, DecimalType}; @@ -204,84 +204,6 @@ pub(crate) fn make_primitive_variant_to_arrow_row_builder<'a>( ) -> Result> { use PrimitiveVariantToArrowRowBuilder::*; - let builder = match data_type { - DataType::Boolean => Boolean(VariantToBooleanArrowRowBuilder::new(cast_options, capacity)), - DataType::Int8 => Int8(VariantToPrimitiveArrowRowBuilder::new( - cast_options, - capacity, - )), - DataType::Int16 => Int16(VariantToPrimitiveArrowRowBuilder::new( - cast_options, - capacity, - )), - DataType::Int32 => Int32(VariantToPrimitiveArrowRowBuilder::new( - cast_options, - capacity, - )), - DataType::Int64 => Int64(VariantToPrimitiveArrowRowBuilder::new( - cast_options, - capacity, - )), - DataType::UInt8 => UInt8(VariantToPrimitiveArrowRowBuilder::new( - cast_options, - capacity, - )), - DataType::UInt16 => UInt16(VariantToPrimitiveArrowRowBuilder::new( - cast_options, - capacity, - )), - DataType::UInt32 => UInt32(VariantToPrimitiveArrowRowBuilder::new( - cast_options, - capacity, - )), - DataType::UInt64 => UInt64(VariantToPrimitiveArrowRowBuilder::new( - cast_options, - capacity, - )), - DataType::Float16 => Float16(VariantToPrimitiveArrowRowBuilder::new( - cast_options, - capacity, - )), - DataType::Float32 => Float32(VariantToPrimitiveArrowRowBuilder::new( - cast_options, - capacity, - )), - DataType::Float64 => Float64(VariantToPrimitiveArrowRowBuilder::new( - cast_options, - capacity, - )), - DataType::Timestamp(TimeUnit::Microsecond, None) => TimestampMicroNtz( - VariantToTimestampNtzArrowRowBuilder::new(cast_options, capacity), - ), - DataType::Timestamp(TimeUnit::Microsecond, tz) => TimestampMicro( - VariantToTimestampArrowRowBuilder::new(cast_options, capacity, tz.clone()), - ), - DataType::Timestamp(TimeUnit::Nanosecond, None) => TimestampNanoNtz( - VariantToTimestampNtzArrowRowBuilder::new(cast_options, capacity), - ), - DataType::Timestamp(TimeUnit::Nanosecond, tz) => TimestampNano( - VariantToTimestampArrowRowBuilder::new(cast_options, capacity, tz.clone()), - ), - DataType::Date32 => Date(VariantToPrimitiveArrowRowBuilder::new( - cast_options, - capacity, - )), - DataType::Utf8 => String(VariantToStringArrowBuilder::new(cast_options, capacity)), - DataType::LargeUtf8 => { - LargeString(VariantToStringArrowBuilder::new(cast_options, capacity)) - } - DataType::Utf8View => StringView(VariantToStringArrowBuilder::new(cast_options, capacity)), - _ if data_type.is_primitive() => { - return Err(ArrowError::NotYetImplemented(format!( - "Primitive data_type {data_type:?} not yet implemented" - ))); - } - _ => { - return Err(ArrowError::InvalidArgumentError(format!( - "Not a primitive type: {data_type:?}" - ))); - } - }; let builder = match data_type { DataType::Boolean => { @@ -359,6 +281,13 @@ pub(crate) fn make_primitive_variant_to_arrow_row_builder<'a>( cast_options, capacity, )), + DataType::Utf8 => String(VariantToStringArrowBuilder::new(cast_options, capacity)), + DataType::LargeUtf8 => { + LargeString(VariantToStringArrowBuilder::new(cast_options, capacity)) + } + DataType::Utf8View => { + StringView(VariantToStringArrowBuilder::new(cast_options, capacity)) + } _ if data_type.is_primitive() => { return Err(ArrowError::NotYetImplemented(format!( "Primitive data_type {data_type:?} not yet implemented" From 642d1929a6127d6242c8a965e02e3f047d5180d1 Mon Sep 17 00:00:00 2001 From: Kosta Tarasov <33369833+sdf-jkl@users.noreply.github.com> Date: Mon, 20 Oct 2025 15:30:21 -0400 Subject: [PATCH 15/36] Update arrow-array/src/builder/generic_bytes_builder.rs Co-authored-by: Ryan Johnson --- arrow-array/src/builder/generic_bytes_builder.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-array/src/builder/generic_bytes_builder.rs b/arrow-array/src/builder/generic_bytes_builder.rs index 5a482f4c254d..bc33d37e542d 100644 --- a/arrow-array/src/builder/generic_bytes_builder.rs +++ b/arrow-array/src/builder/generic_bytes_builder.rs @@ -369,7 +369,7 @@ pub trait StringLikeArrayBuilder: ArrayBuilder { impl StringLikeArrayBuilder for GenericStringBuilder { fn type_name() -> &'static str { - std::any::type_name::>() + std::any::type_name::() } fn with_capacity(capacity: usize) -> Self { Self::with_capacity(capacity, capacity * AVERAGE_STRING_LENGTH) From 76b3c803648c262c21ecdbb5dc989738f0e474b8 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Tue, 21 Oct 2025 10:13:02 -0400 Subject: [PATCH 16/36] Add docs for AVERAGE_STRING_LENGTH const --- arrow-array/src/builder/generic_bytes_builder.rs | 10 ++++++++++ parquet-variant-compute/src/variant_to_arrow.rs | 1 + 2 files changed, 11 insertions(+) diff --git a/arrow-array/src/builder/generic_bytes_builder.rs b/arrow-array/src/builder/generic_bytes_builder.rs index 5a482f4c254d..6127a5cd4ddf 100644 --- a/arrow-array/src/builder/generic_bytes_builder.rs +++ b/arrow-array/src/builder/generic_bytes_builder.rs @@ -348,6 +348,16 @@ impl std::fmt::Write for GenericStringBuilder { } } +/// A byte size value representing the number of bytes to allocate per string in [`GenericStringBuilder`] +/// +/// To create a [`GenericStringBuilder`] using `.with_capacity` we are required to provide: \ +/// - `item_capacity` - the row count \ +/// - `data_capacity` - total string byte count \ +/// +/// We will use the `AVERAGE_STRING_LENGTH` * row_count for `data_capacity`. \ +/// +/// These capacities are preallocation hints used to improve performance, +/// but consuquences of passing a hint too large or too small should be negligible. const AVERAGE_STRING_LENGTH: usize = 16; /// Trait for string-like array builders /// diff --git a/parquet-variant-compute/src/variant_to_arrow.rs b/parquet-variant-compute/src/variant_to_arrow.rs index 8d30a598ddf0..f98c31a2301c 100644 --- a/parquet-variant-compute/src/variant_to_arrow.rs +++ b/parquet-variant-compute/src/variant_to_arrow.rs @@ -71,6 +71,7 @@ pub(crate) enum PrimitiveVariantToArrowRowBuilder<'a> { pub(crate) enum VariantToArrowRowBuilder<'a> { Primitive(PrimitiveVariantToArrowRowBuilder<'a>), BinaryVariant(VariantToBinaryVariantArrowRowBuilder), + // Path extraction wrapper - contains a boxed enum for any of the above WithPath(VariantPathRowBuilder<'a>), } From 5914218915fac05e6583f3cae92244a514831f5d Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Tue, 21 Oct 2025 10:22:29 -0400 Subject: [PATCH 17/36] cargo fmt --- arrow-array/src/builder/generic_bytes_builder.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arrow-array/src/builder/generic_bytes_builder.rs b/arrow-array/src/builder/generic_bytes_builder.rs index 4b9cbfc65e29..f743b3191607 100644 --- a/arrow-array/src/builder/generic_bytes_builder.rs +++ b/arrow-array/src/builder/generic_bytes_builder.rs @@ -349,13 +349,13 @@ impl std::fmt::Write for GenericStringBuilder { } /// A byte size value representing the number of bytes to allocate per string in [`GenericStringBuilder`] -/// +/// /// To create a [`GenericStringBuilder`] using `.with_capacity` we are required to provide: \ /// - `item_capacity` - the row count \ /// - `data_capacity` - total string byte count \ -/// +/// /// We will use the `AVERAGE_STRING_LENGTH` * row_count for `data_capacity`. \ -/// +/// /// These capacities are preallocation hints used to improve performance, /// but consuquences of passing a hint too large or too small should be negligible. const AVERAGE_STRING_LENGTH: usize = 16; From 216d401e620dd4c6b96cab8aa2e7e7be81fc442c Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Tue, 21 Oct 2025 10:23:07 -0400 Subject: [PATCH 18/36] cargo fmt --- parquet-variant-compute/src/variant_to_arrow.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet-variant-compute/src/variant_to_arrow.rs b/parquet-variant-compute/src/variant_to_arrow.rs index f98c31a2301c..7065669982a5 100644 --- a/parquet-variant-compute/src/variant_to_arrow.rs +++ b/parquet-variant-compute/src/variant_to_arrow.rs @@ -71,7 +71,7 @@ pub(crate) enum PrimitiveVariantToArrowRowBuilder<'a> { pub(crate) enum VariantToArrowRowBuilder<'a> { Primitive(PrimitiveVariantToArrowRowBuilder<'a>), BinaryVariant(VariantToBinaryVariantArrowRowBuilder), - + // Path extraction wrapper - contains a boxed enum for any of the above WithPath(VariantPathRowBuilder<'a>), } From 04b9941c61c3ed3f810e927bb16b16758f56acc5 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 24 Oct 2025 11:16:40 -0400 Subject: [PATCH 19/36] Quick fix variant_get --- parquet-variant-compute/src/variant_array.rs | 34 ++++++++++++++++ parquet-variant-compute/src/variant_get.rs | 39 ++++++++++--------- .../src/variant_to_arrow.rs | 4 +- 3 files changed, 56 insertions(+), 21 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 60c5487c512f..d7d4e06eaba9 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -870,6 +870,40 @@ impl From> for ShreddingState { } } +pub enum ShreddingStateCow<'a> { + Owned(ShreddingState), + Borrowed(BorrowedShreddingState<'a>), +} + +impl<'a> From for ShreddingStateCow<'a> { + fn from(s: ShreddingState) -> Self { + Self::Owned(s) + } +} +impl<'a> From> for ShreddingStateCow<'a> { + fn from(s: BorrowedShreddingState<'a>) -> Self { + Self::Borrowed(s) + } +} + +impl<'a> ShreddingStateCow<'a> { + /// Always gives the caller a borrowed view, even if we own internally. + pub fn as_view(&self) -> BorrowedShreddingState<'_> { + match self { + ShreddingStateCow::Borrowed(b) => b.clone(), + ShreddingStateCow::Owned(o) => o.borrow(), + } + } + + /// Materialize ownership when the caller needs to keep it. + pub fn into_owned(self) -> ShreddingState { + match self { + ShreddingStateCow::Borrowed(b) => b.into(), + ShreddingStateCow::Owned(o) => o, + } + } +} + /// Builds struct arrays from component fields /// /// TODO: move to arrow crate diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 79a613757c65..c292d3e65a05 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -16,25 +16,24 @@ // under the License. use arrow::{ array::{self, Array, ArrayRef, BinaryViewArray, GenericListArray, StructArray, UInt32Array}, - compute::{take, CastOptions}, + compute::{CastOptions, take}, datatypes::Field, error::Result, }; use arrow_schema::{ArrowError, DataType, FieldRef}; -use parquet_variant::{VariantPath, VariantPathElement, EMPTY_VARIANT_METADATA_BYTES}; +use parquet_variant::{EMPTY_VARIANT_METADATA_BYTES, VariantPath, VariantPathElement}; -use crate::variant_array::{ShreddingState, StructArrayBuilder}; -use crate::variant_to_arrow::make_variant_to_arrow_row_builder; -use crate::VariantArray; use crate::variant_array::BorrowedShreddingState; use crate::variant_to_arrow::make_variant_to_arrow_row_builder; +use crate::{ShreddingState, variant_array::StructArrayBuilder}; +use crate::{VariantArray, variant_array::ShreddingStateCow}; use arrow::array::AsArray; use std::sync::Arc; pub(crate) enum ShreddedPathStep<'a> { /// Path step succeeded, return the new shredding state - Success(BorrowedShreddingState<'a>), + Success(ShreddingStateCow<'a>), /// The path element is not present in the `typed_value` column and there is no `value` column, /// so we know it does not exist. It, and all paths under it, are all-NULL. Missing, @@ -99,7 +98,7 @@ pub(crate) fn follow_shredded_path_element<'a>( })?; let state = BorrowedShreddingState::try_from(struct_array)?; - Ok(ShreddedPathStep::Success(state)) + Ok(ShreddedPathStep::Success(state.into())) } VariantPathElement::Index { index } => { // TODO: Support array indexing. Among other things, it will require slicing not @@ -126,7 +125,7 @@ pub(crate) fn follow_shredded_path_element<'a>( }; let Some(typed_array) = struct_array.column_by_name("typed_value") else { - return Ok(missing_path_step()) + return Ok(missing_path_step()); }; // Build the list of indices to take @@ -158,7 +157,8 @@ pub(crate) fn follow_shredded_path_element<'a>( .with_field("typed_value", taken, true) .build(); - Ok(ShreddedPathStep::Success(struct_array.into())) + let state = ShreddingState::try_from(struct_array)?; + Ok(ShreddedPathStep::Success(state.into())) } } } @@ -205,20 +205,21 @@ fn shredded_get_path( // Peel away the prefix of path elements that traverses the shredded parts of this variant // column. Shredding will traverse the rest of the path on a per-row basis. - let mut shredding_state = input.shredding_state().borrow(); + let mut shredding_state = ShreddingStateCow::Borrowed(input.shredding_state().borrow()); let mut accumulated_nulls = input.inner().nulls().cloned(); let mut path_index = 0; for path_element in path { - match follow_shredded_path_element(&shredding_state, path_element, cast_options)? { + match follow_shredded_path_element(&shredding_state.as_view(), path_element, cast_options)? + { ShreddedPathStep::Success(state) => { // Union nulls from the typed_value we just accessed - if let Some(typed_value) = shredding_state.typed_value_field() { + if let Some(typed_value) = shredding_state.as_view().typed_value_field() { accumulated_nulls = arrow::buffer::NullBuffer::union( accumulated_nulls.as_ref(), typed_value.nulls(), ); } - shredding_state = state; + shredding_state = ShreddingStateCow::Owned(state.into_owned()); path_index += 1; continue; } @@ -232,7 +233,7 @@ fn shredded_get_path( } ShreddedPathStep::NotShredded => { let target = make_target_variant( - shredding_state.value_field().cloned(), + shredding_state.as_view().value_field().cloned(), None, accumulated_nulls, ); @@ -243,8 +244,8 @@ fn shredded_get_path( // Path exhausted! Create a new `VariantArray` for the location we landed on. let target = make_target_variant( - shredding_state.value_field().cloned(), - shredding_state.typed_value_field().cloned(), + shredding_state.as_view().value_field().cloned(), + shredding_state.as_view().typed_value_field().cloned(), accumulated_nulls, ); @@ -355,9 +356,9 @@ mod test { use crate::{VariantArray, VariantArrayBuilder, VariantValueArrayBuilder, json_to_variant}; use arrow::array::{ Array, ArrayRef, AsArray, BinaryViewArray, BooleanArray, Date32Array, Decimal32Array, - Decimal64Array, Decimal128Array, Decimal256Array, Float32Array, Float64Array, Int8Array, - GenericListArray, Int16Array, Int32Array, Int64Array, NullBuilder, LargeStringArray, StringArray, StringViewArray, - StructArray, Time64MicrosecondArray, + Decimal64Array, Decimal128Array, Decimal256Array, Float32Array, Float64Array, + GenericListArray, Int8Array, Int16Array, Int32Array, Int64Array, LargeStringArray, + NullBuilder, StringArray, StringViewArray, StructArray, Time64MicrosecondArray, }; use arrow::buffer::{NullBuffer, OffsetBuffer}; use arrow::compute::CastOptions; diff --git a/parquet-variant-compute/src/variant_to_arrow.rs b/parquet-variant-compute/src/variant_to_arrow.rs index 3ed272b7ad5c..8e57816a48a8 100644 --- a/parquet-variant-compute/src/variant_to_arrow.rs +++ b/parquet-variant-compute/src/variant_to_arrow.rs @@ -16,8 +16,8 @@ // under the License. use arrow::array::{ - ArrayRef, BinaryViewArray, BooleanBuilder, NullArray, NullBufferBuilder, PrimitiveBuilder, - LargeStringBuilder, StringBuilder, StringLikeArrayBuilder, StringViewBuilder, + ArrayRef, BinaryViewArray, BooleanBuilder, LargeStringBuilder, NullArray, NullBufferBuilder, + PrimitiveBuilder, StringBuilder, StringLikeArrayBuilder, StringViewBuilder, }; use arrow::compute::{CastOptions, DecimalCast}; use arrow::datatypes::{self, DataType, DecimalType}; From 6d6793d17cc2634c8df2de5aeafd2744f5981f6a Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Tue, 11 Nov 2025 00:29:14 -0500 Subject: [PATCH 20/36] fix merge errors --- parquet-variant-compute/src/variant_array.rs | 10 ---------- parquet-variant-compute/src/variant_get.rs | 4 ++-- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index eec8ebf47ccf..9bcdc4762e1c 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -1011,16 +1011,6 @@ fn typed_value_to_variant<'a>( let value = array.value(index); Ok(Variant::from(value)) } - DataType::LargeUtf8 => { - let array = typed_value.as_string::(); - let value = array.value(index); - Variant::from(value) - } - DataType::Utf8View => { - let array = typed_value.as_string_view(); - let value = array.value(index); - Variant::from(value) - } DataType::Int8 => { primitive_conversion_single_value!(Int8Type, typed_value, index) } diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 4f5f119ce144..51b4d1a83c57 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -366,8 +366,8 @@ mod test { use arrow::array::{ Array, ArrayRef, AsArray, BinaryArray, BinaryViewArray, BooleanArray, Date32Array, Decimal32Array, Decimal64Array, Decimal128Array, Decimal256Array, Float32Array, - Float64Array, Int8Array, Int16Array, Int32Array, Int64Array, LargeBinaryArray, - LargeStringArray, NullBuilder, StringArray, StringViewArray, StructArray, + Float64Array, GenericListArray, Int8Array, Int16Array, Int32Array, Int64Array, + LargeBinaryArray, LargeStringArray, NullBuilder, StringArray, StringViewArray, StructArray, Time64MicrosecondArray, }; use arrow::buffer::{NullBuffer, OffsetBuffer}; From 9cd01d2143221be2d95cc9f649d1b46004bb6069 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 20 Feb 2026 18:58:49 -0500 Subject: [PATCH 21/36] Simplify tests using shred_variant --- parquet-variant-compute/src/variant_get.rs | 80 +++++----------------- 1 file changed, 17 insertions(+), 63 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index be0fd2974ef6..7a4c79e58200 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -389,13 +389,13 @@ mod test { use super::{GetOptions, variant_get}; use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder}; - use crate::{VariantArray, VariantArrayBuilder, VariantValueArrayBuilder, json_to_variant}; + use crate::{VariantArray, VariantArrayBuilder, json_to_variant, shred_variant}; use arrow::array::{ Array, ArrayRef, AsArray, BinaryArray, BinaryViewArray, BooleanArray, Date32Array, Date64Array, Decimal32Array, Decimal64Array, Decimal128Array, Decimal256Array, Float32Array, Float64Array, Int8Array, Int16Array, Int32Array, Int64Array, - LargeBinaryArray, LargeListArray, LargeListViewArray, LargeStringArray, ListArray, - ListViewArray, NullBuilder, StringArray, StringViewArray, StructArray, + LargeBinaryArray, LargeListArray, LargeListViewArray, LargeStringArray, + ListArray, ListViewArray, NullBuilder, StringArray, StringViewArray, StructArray, Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, }; use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; @@ -1902,9 +1902,7 @@ mod test { let expected: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), Some(42)])); assert_eq!(&result, &expected); } - /// This test manually constructs a shredded variant array representing lists - /// like ["comedy", "drama"] and ["horror", 123] - /// as VariantArray using variant_get. + /// This test uses a pre-shredded list array and validates index-path access. #[test] fn test_shredded_list_index_access() { let array = shredded_list_variant_array(); @@ -1919,7 +1917,7 @@ mod test { // Row 1: expect 0 index = "horror" assert_eq!(result_variant.value(1), Variant::from("horror")); } - /// Test extracting shredded list field with type conversion + /// Test extracting shredded list field with type conversion. #[test] fn test_shredded_list_as_string() { let array = shredded_list_variant_array(); @@ -1932,65 +1930,21 @@ mod test { let expected: ArrayRef = Arc::new(StringArray::from(vec![Some("comedy"), Some("horror")])); assert_eq!(&result, &expected); } - /// Helper function to create a shredded variant array representing lists - /// - /// This creates an array that represents: - /// Row 0: ["comedy", "drama"] ([0] is shredded, [1] is shredded - perfectly shredded) - /// Row 1: ["horror", 123] ([0] is shredded, [1] is int - partially shredded) + /// Helper to create a shredded list variant array used by list index tests. /// - /// The physical layout follows the shredding spec where: - /// - metadata: contains list metadata - /// - typed_value: StructArray with 0 index value - /// - value: contains fallback for + /// Rows: + /// 1. `["comedy", "drama"]` (fully shred-able as `Utf8`) + /// 2. `["horror", 123]` (partially shredded, with fallback for the numeric element) fn shredded_list_variant_array() -> ArrayRef { - // Create metadata array - let metadata_array = - BinaryViewArray::from_iter_values(std::iter::repeat_n(EMPTY_VARIANT_METADATA_BYTES, 2)); - - // Building the typed_value ListArray - - let mut variant_value_builder = VariantValueArrayBuilder::new(8); - variant_value_builder.append_null(); - variant_value_builder.append_null(); - variant_value_builder.append_null(); - variant_value_builder.append_value(Variant::from(123i32)); - - let struct_array = StructArrayBuilder::new() - .with_field( - "value", - Arc::new(variant_value_builder.build().unwrap()), - true, - ) - .with_field( - "typed_value", - Arc::new(StringArray::from(vec![ - Some("comedy"), - Some("drama"), - Some("horror"), - None, - ])), - true, - ) - .build(); - - let typed_value_array = GenericListArray::::new( - Arc::new(Field::new_list_field( - struct_array.data_type().clone(), - true, - )), - OffsetBuffer::from_lengths([2, 2]), - Arc::new(struct_array), - None, - ); - - // Build the main VariantArray - let main_struct = crate::variant_array::StructArrayBuilder::new() - .with_field("metadata", Arc::new(metadata_array), false) - // .with_field("value", Arc::new(value_array), true) - .with_field("typed_value", Arc::new(typed_value_array), true) - .build(); + let json_rows: ArrayRef = Arc::new(StringArray::from(vec![ + Some(r#"["comedy", "drama"]"#), + Some(r#"["horror", 123]"#), + ])); + let input = json_to_variant(&json_rows).unwrap(); - Arc::new(main_struct) + let list_schema = DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))); + let shredded = shred_variant(&input, &list_schema).unwrap(); + ArrayRef::from(shredded) } /// Helper function to create a shredded variant array representing objects /// From cecd39f8d6020c4569e157c69c30bafbd48dbb62 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 20 Feb 2026 19:17:33 -0500 Subject: [PATCH 22/36] Add tests suggested by @klion26 --- parquet-variant-compute/src/variant_get.rs | 123 ++++++++++++++++++++- 1 file changed, 120 insertions(+), 3 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 7a4c79e58200..fefb4046a3f0 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -389,13 +389,15 @@ mod test { use super::{GetOptions, variant_get}; use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder}; - use crate::{VariantArray, VariantArrayBuilder, json_to_variant, shred_variant}; + use crate::{ + ShreddedSchemaBuilder, VariantArray, VariantArrayBuilder, json_to_variant, shred_variant, + }; use arrow::array::{ Array, ArrayRef, AsArray, BinaryArray, BinaryViewArray, BooleanArray, Date32Array, Date64Array, Decimal32Array, Decimal64Array, Decimal128Array, Decimal256Array, Float32Array, Float64Array, Int8Array, Int16Array, Int32Array, Int64Array, - LargeBinaryArray, LargeListArray, LargeListViewArray, LargeStringArray, - ListArray, ListViewArray, NullBuilder, StringArray, StringViewArray, StructArray, + LargeBinaryArray, LargeListArray, LargeListViewArray, LargeStringArray, ListArray, + ListViewArray, NullBuilder, StringArray, StringViewArray, StructArray, Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, }; use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; @@ -1867,6 +1869,7 @@ mod test { Arc::new(struct_array) } + /// This test manually constructs a shredded variant array representing objects /// like {"x": 1, "y": "foo"} and {"x": 42} and tests extracting the "x" field /// as VariantArray using variant_get. @@ -1902,6 +1905,7 @@ mod test { let expected: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), Some(42)])); assert_eq!(&result, &expected); } + /// This test uses a pre-shredded list array and validates index-path access. #[test] fn test_shredded_list_index_access() { @@ -1917,6 +1921,7 @@ mod test { // Row 1: expect 0 index = "horror" assert_eq!(result_variant.value(1), Variant::from("horror")); } + /// Test extracting shredded list field with type conversion. #[test] fn test_shredded_list_as_string() { @@ -1930,6 +1935,75 @@ mod test { let expected: ArrayRef = Arc::new(StringArray::from(vec![Some("comedy"), Some("horror")])); assert_eq!(&result, &expected); } + + #[test] + fn test_shredded_list_index_access_from_value_field() { + let array = shredded_list_variant_array(); + // Index 1 maps to "drama" for row 0, and to fallback value 123 for row 1. + let options = GetOptions::new_with_path(VariantPath::from(1)); + let result = variant_get(&array, options).unwrap(); + let result_variant = VariantArray::try_new(&result).unwrap(); + + assert_eq!(result_variant.value(0), Variant::from("drama")); + assert_eq!(result_variant.value(1).as_int64(), Some(123)); + } + + #[test] + fn test_shredded_list_index_access_from_value_field_as_int64() { + let array = shredded_list_variant_array(); + let field = Field::new("typed_value", DataType::Int64, true); + let options = GetOptions::new_with_path(VariantPath::from(1)) + .with_as_type(Some(FieldRef::from(field))); + let result = variant_get(&array, options).unwrap(); + + // "drama" -> NULL, 123 -> 123. + let expected: ArrayRef = Arc::new(Int64Array::from(vec![None, Some(123)])); + assert_eq!(&result, &expected); + } + + #[test] + fn test_shredded_list_in_struct_index_access() { + let array = shredded_struct_with_list_variant_array(); + let options = GetOptions::new_with_path(VariantPath::try_from("a").unwrap().join(1)); + let result = variant_get(&array, options).unwrap(); + let result_variant = VariantArray::try_new(&result).unwrap(); + + assert_eq!(result_variant.value(0), Variant::from("drama")); + assert_eq!(result_variant.value(1).as_int64(), Some(123)); + } + + #[test] + fn test_shredded_struct_in_list_field_access() { + let array = shredded_list_of_struct_variant_array(); + let field = Field::new("x", DataType::Int32, true); + let path = VariantPath::from(0).join("x"); + let options = GetOptions::new_with_path(path).with_as_type(Some(FieldRef::from(field))); + let result = variant_get(&array, options).unwrap(); + + let expected: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), Some(3)])); + assert_eq!(&result, &expected); + } + + #[test] + fn test_shredded_list_of_lists_index_access() { + let array = shredded_list_of_lists_variant_array(); + let path = VariantPath::from(0).join(1); + + let result = variant_get(&array, GetOptions::new_with_path(path.clone())).unwrap(); + let result_variant = VariantArray::try_new(&result).unwrap(); + assert_eq!(result_variant.value(0), Variant::from("b")); + assert_eq!(result_variant.value(1).as_int64(), Some(123)); + + let field = Field::new("typed_value", DataType::Int64, true); + let casted = variant_get( + &array, + GetOptions::new_with_path(path).with_as_type(Some(FieldRef::from(field))), + ) + .unwrap(); + let expected: ArrayRef = Arc::new(Int64Array::from(vec![None, Some(123)])); + assert_eq!(&casted, &expected); + } + /// Helper to create a shredded list variant array used by list index tests. /// /// Rows: @@ -1946,6 +2020,49 @@ mod test { let shredded = shred_variant(&input, &list_schema).unwrap(); ArrayRef::from(shredded) } + + fn shredded_struct_with_list_variant_array() -> ArrayRef { + let json_rows: ArrayRef = Arc::new(StringArray::from(vec![ + Some(r#"{"a": ["comedy", "drama"]}"#), + Some(r#"{"a": ["horror", 123]}"#), + ])); + let input = json_to_variant(&json_rows).unwrap(); + + let list_schema = DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))); + let shredding_schema = ShreddedSchemaBuilder::default() + .with_path("a", &list_schema) + .unwrap() + .build(); + let shredded = shred_variant(&input, &shredding_schema).unwrap(); + ArrayRef::from(shredded) + } + + fn shredded_list_of_struct_variant_array() -> ArrayRef { + let json_rows: ArrayRef = Arc::new(StringArray::from(vec![ + Some(r#"[{"x": 1}, {"x": 2}]"#), + Some(r#"[{"x": 3}, {"y": 4}]"#), + ])); + let input = json_to_variant(&json_rows).unwrap(); + + let struct_type = + DataType::Struct(Fields::from(vec![Field::new("x", DataType::Int32, true)])); + let list_schema = DataType::List(Arc::new(Field::new("item", struct_type, true))); + let shredded = shred_variant(&input, &list_schema).unwrap(); + ArrayRef::from(shredded) + } + + fn shredded_list_of_lists_variant_array() -> ArrayRef { + let json_rows: ArrayRef = Arc::new(StringArray::from(vec![ + Some(r#"[["a", "b"], ["c", "d"]]"#), + Some(r#"[["x", 123], ["y", "z"]]"#), + ])); + let input = json_to_variant(&json_rows).unwrap(); + + let inner_list = DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))); + let outer_list = DataType::List(Arc::new(Field::new("item", inner_list, true))); + let shredded = shred_variant(&input, &outer_list).unwrap(); + ArrayRef::from(shredded) + } /// Helper function to create a shredded variant array representing objects /// /// This creates an array that represents: From a776982e6824055e1f79c75ef53b1ec956bb8751 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 20 Feb 2026 19:32:11 -0500 Subject: [PATCH 23/36] Fix typed and untyped values logic --- parquet-variant-compute/src/variant_get.rs | 34 +++++++++++++++------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index fefb4046a3f0..4702e17e9f40 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -101,8 +101,6 @@ pub(crate) fn follow_shredded_path_element<'a>( Ok(ShreddedPathStep::Success(state.into())) } VariantPathElement::Index { index } => { - // TODO: Support array indexing. Among other things, it will require slicing not - // only the array we have here, but also the corresponding metadata and null masks. let Some(list_array) = typed_value.as_any().downcast_ref::>() else { // Downcast failure - if strict cast options are enabled, this should be an error @@ -124,9 +122,13 @@ pub(crate) fn follow_shredded_path_element<'a>( return Ok(missing_path_step()); }; - let Some(typed_array) = struct_array.column_by_name("typed_value") else { + let value_array = struct_array.column_by_name("value"); + let typed_array = struct_array.column_by_name("typed_value"); + + // If list elements have neither typed nor fallback value, this path step is missing. + if value_array.is_none() && typed_array.is_none() { return Ok(missing_path_step()); - }; + } // Build the list of indices to take let mut take_indices = Vec::with_capacity(list_array.len()); @@ -144,18 +146,28 @@ pub(crate) fn follow_shredded_path_element<'a>( let index_array = UInt32Array::from(take_indices); - // Use Arrow compute kernel to gather elements - let taken = take(typed_array, &index_array, None)?; + // Gather both typed and fallback values at the requested element index. + let taken_value = value_array + .map(|value| take(value, &index_array, None)) + .transpose()?; + let taken_typed = typed_array + .map(|typed| take(typed, &index_array, None)) + .transpose()?; let metadata_array = BinaryViewArray::from_iter_values(std::iter::repeat_n( EMPTY_VARIANT_METADATA_BYTES, - taken.len(), + index_array.len(), )); - let struct_array = &StructArrayBuilder::new() - .with_field("metadata", Arc::new(metadata_array), false) - .with_field("typed_value", taken, true) - .build(); + let mut builder = + StructArrayBuilder::new().with_field("metadata", Arc::new(metadata_array), false); + if let Some(taken_value) = taken_value { + builder = builder.with_field("value", taken_value, true); + } + if let Some(taken_typed) = taken_typed { + builder = builder.with_field("typed_value", taken_typed, true); + } + let struct_array = &builder.build(); let state = ShreddingState::try_from(struct_array)?; Ok(ShreddedPathStep::Success(state.into())) From cfe7c00c41ad9565078561a7eb4d7ce50d4a3cd1 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Sat, 21 Feb 2026 13:40:31 -0500 Subject: [PATCH 24/36] Add support for LargeListArray + OBB err when safe_cast --- parquet-variant-compute/src/variant_get.rs | 231 +++++++++++++++------ 1 file changed, 170 insertions(+), 61 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 4702e17e9f40..8712243a7044 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -15,7 +15,10 @@ // specific language governing permissions and limitations // under the License. use arrow::{ - array::{self, Array, ArrayRef, BinaryViewArray, GenericListArray, StructArray, UInt32Array}, + array::{ + self, Array, ArrayRef, BinaryViewArray, GenericListArray, OffsetSizeTrait, StructArray, + UInt64Array, + }, compute::{CastOptions, take}, datatypes::Field, error::Result, @@ -43,11 +46,79 @@ pub(crate) enum ShreddedPathStep<'a> { NotShredded, } +fn take_list_index_as_shredding_state( + list_array: &GenericListArray, + index: usize, + cast_options: &CastOptions, +) -> Result> { + let offsets = list_array.offsets(); + let values = list_array.values(); + + let Some(struct_array) = values.as_any().downcast_ref::() else { + return Ok(None); + }; + + let value_array = struct_array.column_by_name("value"); + let typed_array = struct_array.column_by_name("typed_value"); + + // If list elements have neither typed nor fallback value, this path step is missing. + if value_array.is_none() && typed_array.is_none() { + return Ok(None); + } + + let mut take_indices = Vec::with_capacity(list_array.len()); + for row in 0..list_array.len() { + let start = offsets[row].as_usize(); + let end = offsets[row + 1].as_usize(); + let len = end - start; + + if index < len { + let absolute_index = start.checked_add(index).ok_or_else(|| { + ArrowError::ComputeError("List index overflow while building take indices".into()) + })?; + let absolute_index = u64::try_from(absolute_index) + .map_err(|_| ArrowError::ComputeError("List index does not fit into u64".into()))?; + take_indices.push(Some(absolute_index)); + } else if cast_options.safe { + take_indices.push(None); + } else { + return Err(ArrowError::CastError(format!( + "Cannot access index '{}' for row {} with list length {}", + index, row, len + ))); + } + } + + let index_array = UInt64Array::from(take_indices); + + // Gather both typed and fallback values at the requested element index. + let taken_value = value_array + .map(|value| take(value, &index_array, None)) + .transpose()?; + let taken_typed = typed_array + .map(|typed| take(typed, &index_array, None)) + .transpose()?; + + let metadata_array = BinaryViewArray::from_iter_values(std::iter::repeat_n( + EMPTY_VARIANT_METADATA_BYTES, + index_array.len(), + )); + + let mut builder = + StructArrayBuilder::new().with_field("metadata", Arc::new(metadata_array), false); + if let Some(taken_value) = taken_value { + builder = builder.with_field("value", taken_value, true); + } + if let Some(taken_typed) = taken_typed { + builder = builder.with_field("typed_value", taken_typed, true); + } + + Ok(Some(ShreddingState::try_from(&builder.build())?)) +} + /// Given a shredded variant field -- a `(value?, typed_value?)` pair -- try to take one path step /// deeper. For a `VariantPathElement::Field`, the step fails if there is no `typed_value` at this /// level, or if `typed_value` is not a struct, or if the requested field name does not exist. -/// -/// TODO: Support `VariantPathElement::Index`? It wouldn't be easy, and maybe not even possible. pub(crate) fn follow_shredded_path_element<'a>( shredding_state: &BorrowedShreddingState<'a>, path_element: &VariantPathElement<'_>, @@ -101,8 +172,15 @@ pub(crate) fn follow_shredded_path_element<'a>( Ok(ShreddedPathStep::Success(state.into())) } VariantPathElement::Index { index } => { - let Some(list_array) = typed_value.as_any().downcast_ref::>() - else { + let state = if let Some(list_array) = + typed_value.as_any().downcast_ref::>() + { + take_list_index_as_shredding_state(list_array, *index, cast_options)? + } else if let Some(list_array) = + typed_value.as_any().downcast_ref::>() + { + take_list_index_as_shredding_state(list_array, *index, cast_options)? + } else { // Downcast failure - if strict cast options are enabled, this should be an error if !cast_options.safe { return Err(ArrowError::CastError(format!( @@ -115,62 +193,10 @@ pub(crate) fn follow_shredded_path_element<'a>( return Ok(missing_path_step()); }; - let offsets = list_array.offsets(); - let values = list_array.values(); // This is a StructArray - - let Some(struct_array) = values.as_any().downcast_ref::() else { - return Ok(missing_path_step()); - }; - - let value_array = struct_array.column_by_name("value"); - let typed_array = struct_array.column_by_name("typed_value"); - - // If list elements have neither typed nor fallback value, this path step is missing. - if value_array.is_none() && typed_array.is_none() { - return Ok(missing_path_step()); - } - - // Build the list of indices to take - let mut take_indices = Vec::with_capacity(list_array.len()); - for i in 0..list_array.len() { - let start = offsets[i] as usize; - let end = offsets[i + 1] as usize; - let len = end - start; - - if *index < len { - take_indices.push(Some((start + index) as u32)); - } else { - take_indices.push(None); - } - } - - let index_array = UInt32Array::from(take_indices); - - // Gather both typed and fallback values at the requested element index. - let taken_value = value_array - .map(|value| take(value, &index_array, None)) - .transpose()?; - let taken_typed = typed_array - .map(|typed| take(typed, &index_array, None)) - .transpose()?; - - let metadata_array = BinaryViewArray::from_iter_values(std::iter::repeat_n( - EMPTY_VARIANT_METADATA_BYTES, - index_array.len(), - )); - - let mut builder = - StructArrayBuilder::new().with_field("metadata", Arc::new(metadata_array), false); - if let Some(taken_value) = taken_value { - builder = builder.with_field("value", taken_value, true); - } - if let Some(taken_typed) = taken_typed { - builder = builder.with_field("typed_value", taken_typed, true); + match state { + Some(state) => Ok(ShreddedPathStep::Success(state.into())), + None => Ok(missing_path_step()), } - let struct_array = &builder.build(); - - let state = ShreddingState::try_from(struct_array)?; - Ok(ShreddedPathStep::Success(state.into())) } } } @@ -399,7 +425,7 @@ mod test { use std::str::FromStr; use std::sync::Arc; - use super::{GetOptions, variant_get}; + use super::{GetOptions, take_list_index_as_shredding_state, variant_get}; use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder}; use crate::{ ShreddedSchemaBuilder, VariantArray, VariantArrayBuilder, json_to_variant, shred_variant, @@ -1973,6 +1999,54 @@ mod test { assert_eq!(&result, &expected); } + // The tests below are temp before: https://github.com/apache/arrow-rs/issues/9455 + #[test] + fn test_shredded_list_index_out_of_bounds_unsafe_cast_errors() { + let options = + GetOptions::new_with_path(VariantPath::from(10)).with_cast_options(CastOptions { + safe: false, + ..Default::default() + }); + + let err = variant_get(&shredded_list_variant_array(), options.clone()).unwrap_err(); + assert!(err.to_string().contains("Cannot access index '10'")); + } + + #[test] + fn test_large_list_index_path_helper() { + let large_list = large_list_of_shredded_elements(); + let state = take_list_index_as_shredding_state(&large_list, 1, &CastOptions::default()) + .unwrap() + .unwrap(); + + let typed = state + .typed_value_field() + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let value = state.value_field().unwrap(); + assert_eq!(typed.value(0), "drama"); + assert!(typed.is_null(1)); + assert!(value.is_null(0)); + assert!(value.is_valid(1)); + } + + #[test] + fn test_large_list_index_out_of_bounds_unsafe_cast_errors() { + let large_list = large_list_of_shredded_elements(); + let err = take_list_index_as_shredding_state( + &large_list, + 10, + &CastOptions { + safe: false, + ..Default::default() + }, + ) + .unwrap_err(); + assert!(err.to_string().contains("Cannot access index '10'")); + } + #[test] fn test_shredded_list_in_struct_index_access() { let array = shredded_struct_with_list_variant_array(); @@ -2075,6 +2149,41 @@ mod test { let shredded = shred_variant(&input, &outer_list).unwrap(); ArrayRef::from(shredded) } + + fn large_list_of_shredded_elements() -> LargeListArray { + let fallback_array: ArrayRef = { + let mut builder = VariantBuilder::new(); + builder.append_value(Variant::from(123i32)); + let (_, value_bytes) = builder.finish(); + Arc::new(BinaryViewArray::from(vec![ + None, + None, + None, + Some(value_bytes.as_slice()), + ])) + }; + let typed_array: ArrayRef = Arc::new(StringArray::from(vec![ + Some("comedy"), + Some("drama"), + Some("horror"), + None, + ])); + + let struct_array = StructArrayBuilder::new() + .with_field("value", fallback_array, true) + .with_field("typed_value", typed_array, true) + .build(); + + LargeListArray::new( + Arc::new(Field::new_list_field( + struct_array.data_type().clone(), + true, + )), + OffsetBuffer::new(ScalarBuffer::from(vec![0_i64, 2, 4])), + Arc::new(struct_array), + None, + ) + } /// Helper function to create a shredded variant array representing objects /// /// This creates an array that represents: From fc99bf0c7fc4ef30be0d889c96e39ca148a3c5d0 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Sat, 21 Feb 2026 17:06:36 -0500 Subject: [PATCH 25/36] Use ShreddingState instead of BorrowedShreddingState in ShreddedPathStep::Success --- parquet-variant-compute/src/variant_array.rs | 34 -------------------- parquet-variant-compute/src/variant_get.rs | 34 +++++++++----------- 2 files changed, 16 insertions(+), 52 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 8ae786047309..250852d021bd 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -894,40 +894,6 @@ impl From> for ShreddingState { } } -pub enum ShreddingStateCow<'a> { - Owned(ShreddingState), - Borrowed(BorrowedShreddingState<'a>), -} - -impl<'a> From for ShreddingStateCow<'a> { - fn from(s: ShreddingState) -> Self { - Self::Owned(s) - } -} -impl<'a> From> for ShreddingStateCow<'a> { - fn from(s: BorrowedShreddingState<'a>) -> Self { - Self::Borrowed(s) - } -} - -impl<'a> ShreddingStateCow<'a> { - /// Always gives the caller a borrowed view, even if we own internally. - pub fn as_view(&self) -> BorrowedShreddingState<'_> { - match self { - ShreddingStateCow::Borrowed(b) => b.clone(), - ShreddingStateCow::Owned(o) => o.borrow(), - } - } - - /// Materialize ownership when the caller needs to keep it. - pub fn into_owned(self) -> ShreddingState { - match self { - ShreddingStateCow::Borrowed(b) => b.into(), - ShreddingStateCow::Owned(o) => o, - } - } -} - /// Builds struct arrays from component fields /// /// TODO: move to arrow crate diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 8712243a7044..fbefe35525a4 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -26,17 +26,16 @@ use arrow::{ use arrow_schema::{ArrowError, DataType, FieldRef}; use parquet_variant::{EMPTY_VARIANT_METADATA_BYTES, VariantPath, VariantPathElement}; -use crate::variant_array::BorrowedShreddingState; +use crate::VariantArray; use crate::variant_to_arrow::make_variant_to_arrow_row_builder; use crate::{ShreddingState, variant_array::StructArrayBuilder}; -use crate::{VariantArray, variant_array::ShreddingStateCow}; use arrow::array::AsArray; use std::sync::Arc; -pub(crate) enum ShreddedPathStep<'a> { +pub(crate) enum ShreddedPathStep { /// Path step succeeded, return the new shredding state - Success(ShreddingStateCow<'a>), + Success(ShreddingState), /// The path element is not present in the `typed_value` column and there is no `value` column, /// so we know it does not exist. It, and all paths under it, are all-NULL. Missing, @@ -119,11 +118,11 @@ fn take_list_index_as_shredding_state( /// Given a shredded variant field -- a `(value?, typed_value?)` pair -- try to take one path step /// deeper. For a `VariantPathElement::Field`, the step fails if there is no `typed_value` at this /// level, or if `typed_value` is not a struct, or if the requested field name does not exist. -pub(crate) fn follow_shredded_path_element<'a>( - shredding_state: &BorrowedShreddingState<'a>, +pub(crate) fn follow_shredded_path_element( + shredding_state: &ShreddingState, path_element: &VariantPathElement<'_>, cast_options: &CastOptions, -) -> Result> { +) -> Result { // If the requested path element is not present in `typed_value`, and `value` is missing, then // we know it does not exist; it, and all paths under it, are all-NULL. let missing_path_step = || match shredding_state.value_field() { @@ -168,8 +167,8 @@ pub(crate) fn follow_shredded_path_element<'a>( )) })?; - let state = BorrowedShreddingState::try_from(struct_array)?; - Ok(ShreddedPathStep::Success(state.into())) + let state = ShreddingState::try_from(struct_array)?; + Ok(ShreddedPathStep::Success(state)) } VariantPathElement::Index { index } => { let state = if let Some(list_array) = @@ -194,7 +193,7 @@ pub(crate) fn follow_shredded_path_element<'a>( }; match state { - Some(state) => Ok(ShreddedPathStep::Success(state.into())), + Some(state) => Ok(ShreddedPathStep::Success(state)), None => Ok(missing_path_step()), } } @@ -252,21 +251,20 @@ fn shredded_get_path( // Peel away the prefix of path elements that traverses the shredded parts of this variant // column. Shredding will traverse the rest of the path on a per-row basis. - let mut shredding_state = ShreddingStateCow::Borrowed(input.shredding_state().borrow()); + let mut shredding_state = input.shredding_state().clone(); let mut accumulated_nulls = input.inner().nulls().cloned(); let mut path_index = 0; for path_element in path { - match follow_shredded_path_element(&shredding_state.as_view(), path_element, cast_options)? - { + match follow_shredded_path_element(&shredding_state, path_element, cast_options)? { ShreddedPathStep::Success(state) => { // Union nulls from the typed_value we just accessed - if let Some(typed_value) = shredding_state.as_view().typed_value_field() { + if let Some(typed_value) = shredding_state.typed_value_field() { accumulated_nulls = arrow::buffer::NullBuffer::union( accumulated_nulls.as_ref(), typed_value.nulls(), ); } - shredding_state = ShreddingStateCow::Owned(state.into_owned()); + shredding_state = state; path_index += 1; continue; } @@ -280,7 +278,7 @@ fn shredded_get_path( } ShreddedPathStep::NotShredded => { let target = make_target_variant( - shredding_state.as_view().value_field().cloned(), + shredding_state.value_field().cloned(), None, accumulated_nulls, ); @@ -291,8 +289,8 @@ fn shredded_get_path( // Path exhausted! Create a new `VariantArray` for the location we landed on. let target = make_target_variant( - shredding_state.as_view().value_field().cloned(), - shredding_state.as_view().typed_value_field().cloned(), + shredding_state.value_field().cloned(), + shredding_state.typed_value_field().cloned(), accumulated_nulls, ); From ccbf59b2e40f80274e5e5350b2b49d1c175cce5a Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Mon, 23 Feb 2026 15:40:08 -0500 Subject: [PATCH 26/36] Reuse ShreddingState methods --- parquet-variant-compute/src/variant_get.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index fbefe35525a4..4efa6c519e65 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -56,9 +56,10 @@ fn take_list_index_as_shredding_state( let Some(struct_array) = values.as_any().downcast_ref::() else { return Ok(None); }; + let shredding_state = ShreddingState::try_from(struct_array)?; - let value_array = struct_array.column_by_name("value"); - let typed_array = struct_array.column_by_name("typed_value"); + let value_array = shredding_state.value_field(); + let typed_array = shredding_state.typed_value_field(); // If list elements have neither typed nor fallback value, this path step is missing. if value_array.is_none() && typed_array.is_none() { From cbfa058d527ed460bfd66f327624c26953c146ed Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Wed, 25 Feb 2026 11:21:00 -0500 Subject: [PATCH 27/36] nit fix --- parquet-variant-compute/src/variant_get.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 4efa6c519e65..1f96a5c6294a 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -518,7 +518,7 @@ mod test { fn get_primitive_variant_inside_object_of_list() { single_variant_get_test( r#"{"some_field": [1234]}"#, - VariantPath::try_from("some_field").unwrap().join(0), + VariantPath::try_from("some_field[0]").unwrap(), "1234", ); } @@ -2049,7 +2049,7 @@ mod test { #[test] fn test_shredded_list_in_struct_index_access() { let array = shredded_struct_with_list_variant_array(); - let options = GetOptions::new_with_path(VariantPath::try_from("a").unwrap().join(1)); + let options = GetOptions::new_with_path(VariantPath::try_from("a[1]").unwrap()); let result = variant_get(&array, options).unwrap(); let result_variant = VariantArray::try_new(&result).unwrap(); From cf94d439ed910197c0780c73c8b42eabf16db51d Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Wed, 25 Feb 2026 11:32:24 -0500 Subject: [PATCH 28/36] use else if chain --- parquet-variant-compute/src/variant_get.rs | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 1f96a5c6294a..b760146e3a58 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -180,17 +180,16 @@ pub(crate) fn follow_shredded_path_element( typed_value.as_any().downcast_ref::>() { take_list_index_as_shredding_state(list_array, *index, cast_options)? - } else { - // Downcast failure - if strict cast options are enabled, this should be an error - if !cast_options.safe { - return Err(ArrowError::CastError(format!( - "Cannot access index '{}' on non-list type: {}", - index, - typed_value.data_type() - ))); - } + } else if cast_options.safe { // With safe cast options, return NULL (missing_path_step) return Ok(missing_path_step()); + } else { + // Downcast failure - if strict cast options are enabled, this should be an error + return Err(ArrowError::CastError(format!( + "Cannot access index '{}' on non-list type: {}", + index, + typed_value.data_type() + ))); }; match state { From 91589adc8fb7812d37e1cb0525072b1d6ea56354 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Wed, 25 Feb 2026 11:36:16 -0500 Subject: [PATCH 29/36] add cast_options.safe docs --- parquet-variant-compute/src/variant_get.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index b760146e3a58..c8f8ab3bda2b 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -45,6 +45,10 @@ pub(crate) enum ShreddedPathStep { NotShredded, } +/// Build the next shredding state by taking one list element (at `index`) per input row. +/// +/// With `cast_options.safe = true`, out-of-bounds indices become nulls for those rows. +/// With `cast_options.safe = false`, out-of-bounds indices return [`ArrowError::CastError`]. fn take_list_index_as_shredding_state( list_array: &GenericListArray, index: usize, @@ -119,6 +123,16 @@ fn take_list_index_as_shredding_state( /// Given a shredded variant field -- a `(value?, typed_value?)` pair -- try to take one path step /// deeper. For a `VariantPathElement::Field`, the step fails if there is no `typed_value` at this /// level, or if `typed_value` is not a struct, or if the requested field name does not exist. +/// +/// Safe-cast behavior (`cast_options.safe = true`): +/// - Type mismatch during path traversal (for example field access on non-struct, index access on +/// non-list) returns [`ShreddedPathStep::Missing`] or [`ShreddedPathStep::NotShredded`], allowing +/// the caller to continue with null/fallback semantics. +/// - List index out-of-bounds produces nulls for the corresponding rows. +/// +/// Unsafe-cast behavior (`cast_options.safe = false`): +/// - Type mismatch during path traversal returns [`ArrowError::CastError`]. +/// - List index out-of-bounds returns [`ArrowError::CastError`]. pub(crate) fn follow_shredded_path_element( shredding_state: &ShreddingState, path_element: &VariantPathElement<'_>, From 28ec53c328d66f6d931c28513816a8135bd39985 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Wed, 25 Feb 2026 22:52:17 -0500 Subject: [PATCH 30/36] support list-like arrays --- parquet-variant-compute/src/variant_get.rs | 192 +++++++++++++-------- 1 file changed, 120 insertions(+), 72 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index c8f8ab3bda2b..6a1c4a419f23 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -16,8 +16,8 @@ // under the License. use arrow::{ array::{ - self, Array, ArrayRef, BinaryViewArray, GenericListArray, OffsetSizeTrait, StructArray, - UInt64Array, + self, Array, ArrayRef, BinaryViewArray, GenericListArray, GenericListViewArray, + StructArray, UInt64Array, }, compute::{CastOptions, take}, datatypes::Field, @@ -27,6 +27,7 @@ use arrow_schema::{ArrowError, DataType, FieldRef}; use parquet_variant::{EMPTY_VARIANT_METADATA_BYTES, VariantPath, VariantPathElement}; use crate::VariantArray; +use crate::arrow_to_variant::ListLikeArray; use crate::variant_to_arrow::make_variant_to_arrow_row_builder; use crate::{ShreddingState, variant_array::StructArrayBuilder}; @@ -45,16 +46,15 @@ pub(crate) enum ShreddedPathStep { NotShredded, } -/// Build the next shredding state by taking one list element (at `index`) per input row. +/// Build the next shredding state by taking one list-like element (at `index`) per input row. /// /// With `cast_options.safe = true`, out-of-bounds indices become nulls for those rows. /// With `cast_options.safe = false`, out-of-bounds indices return [`ArrowError::CastError`]. -fn take_list_index_as_shredding_state( - list_array: &GenericListArray, +fn take_list_like_index_as_shredding_state( + list_array: &L, index: usize, cast_options: &CastOptions, ) -> Result> { - let offsets = list_array.offsets(); let values = list_array.values(); let Some(struct_array) = values.as_any().downcast_ref::() else { @@ -72,16 +72,18 @@ fn take_list_index_as_shredding_state( let mut take_indices = Vec::with_capacity(list_array.len()); for row in 0..list_array.len() { - let start = offsets[row].as_usize(); - let end = offsets[row + 1].as_usize(); - let len = end - start; + let row_range = list_array.element_range(row); + let len = row_range.len(); if index < len { - let absolute_index = start.checked_add(index).ok_or_else(|| { - ArrowError::ComputeError("List index overflow while building take indices".into()) + let absolute_index = row_range.start.checked_add(index).ok_or_else(|| { + ArrowError::ComputeError( + "List-like index overflow while building take indices".into(), + ) + })?; + let absolute_index = u64::try_from(absolute_index).map_err(|_| { + ArrowError::ComputeError("List-like index does not fit into u64".into()) })?; - let absolute_index = u64::try_from(absolute_index) - .map_err(|_| ArrowError::ComputeError("List index does not fit into u64".into()))?; take_indices.push(Some(absolute_index)); } else if cast_options.safe { take_indices.push(None); @@ -189,11 +191,21 @@ pub(crate) fn follow_shredded_path_element( let state = if let Some(list_array) = typed_value.as_any().downcast_ref::>() { - take_list_index_as_shredding_state(list_array, *index, cast_options)? + take_list_like_index_as_shredding_state(list_array, *index, cast_options)? } else if let Some(list_array) = typed_value.as_any().downcast_ref::>() { - take_list_index_as_shredding_state(list_array, *index, cast_options)? + take_list_like_index_as_shredding_state(list_array, *index, cast_options)? + } else if let Some(list_view_array) = typed_value + .as_any() + .downcast_ref::>() + { + take_list_like_index_as_shredding_state(list_view_array, *index, cast_options)? + } else if let Some(list_view_array) = typed_value + .as_any() + .downcast_ref::>() + { + take_list_like_index_as_shredding_state(list_view_array, *index, cast_options)? } else if cast_options.safe { // With safe cast options, return NULL (missing_path_step) return Ok(missing_path_step()); @@ -437,7 +449,7 @@ mod test { use std::str::FromStr; use std::sync::Arc; - use super::{GetOptions, take_list_index_as_shredding_state, variant_get}; + use super::{GetOptions, variant_get}; use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder}; use crate::{ ShreddedSchemaBuilder, VariantArray, VariantArrayBuilder, json_to_variant, shred_variant, @@ -2011,7 +2023,6 @@ mod test { assert_eq!(&result, &expected); } - // The tests below are temp before: https://github.com/apache/arrow-rs/issues/9455 #[test] fn test_shredded_list_index_out_of_bounds_unsafe_cast_errors() { let options = @@ -2025,37 +2036,72 @@ mod test { } #[test] - fn test_large_list_index_path_helper() { - let large_list = large_list_of_shredded_elements(); - let state = take_list_index_as_shredding_state(&large_list, 1, &CastOptions::default()) - .unwrap() - .unwrap(); + fn test_shredded_large_list_index_access_from_value_field() { + let array = shredded_large_list_variant_array(); + // Index 1 maps to "drama" for row 0, and to fallback value 123 for row 1. + let options = GetOptions::new_with_path(VariantPath::from(1)); + let result = variant_get(&array, options).unwrap(); + let result_variant = VariantArray::try_new(&result).unwrap(); - let typed = state - .typed_value_field() - .unwrap() - .as_any() - .downcast_ref::() - .unwrap(); - let value = state.value_field().unwrap(); - assert_eq!(typed.value(0), "drama"); - assert!(typed.is_null(1)); - assert!(value.is_null(0)); - assert!(value.is_valid(1)); + assert_eq!(result_variant.value(0), Variant::from("drama")); + assert_eq!(result_variant.value(1).as_int64(), Some(123)); } #[test] - fn test_large_list_index_out_of_bounds_unsafe_cast_errors() { - let large_list = large_list_of_shredded_elements(); - let err = take_list_index_as_shredding_state( - &large_list, - 10, - &CastOptions { + fn test_shredded_large_list_index_out_of_bounds_unsafe_cast_errors() { + let options = + GetOptions::new_with_path(VariantPath::from(10)).with_cast_options(CastOptions { safe: false, ..Default::default() - }, - ) - .unwrap_err(); + }); + + let err = variant_get(&shredded_large_list_variant_array(), options).unwrap_err(); + assert!(err.to_string().contains("Cannot access index '10'")); + } + + #[test] + fn test_shredded_list_view_index_access_from_value_field() { + let array = shredded_list_view_variant_array(); + let options = GetOptions::new_with_path(VariantPath::from(1)); + let result = variant_get(&array, options).unwrap(); + let result_variant = VariantArray::try_new(&result).unwrap(); + + assert_eq!(result_variant.value(0), Variant::from("drama")); + assert_eq!(result_variant.value(1).as_int64(), Some(123)); + } + + #[test] + fn test_shredded_list_view_index_out_of_bounds_unsafe_cast_errors() { + let options = + GetOptions::new_with_path(VariantPath::from(10)).with_cast_options(CastOptions { + safe: false, + ..Default::default() + }); + + let err = variant_get(&shredded_list_view_variant_array(), options).unwrap_err(); + assert!(err.to_string().contains("Cannot access index '10'")); + } + + #[test] + fn test_shredded_large_list_view_index_access_from_value_field() { + let array = shredded_large_list_view_variant_array(); + let options = GetOptions::new_with_path(VariantPath::from(1)); + let result = variant_get(&array, options).unwrap(); + let result_variant = VariantArray::try_new(&result).unwrap(); + + assert_eq!(result_variant.value(0), Variant::from("drama")); + assert_eq!(result_variant.value(1).as_int64(), Some(123)); + } + + #[test] + fn test_shredded_large_list_view_index_out_of_bounds_unsafe_cast_errors() { + let options = + GetOptions::new_with_path(VariantPath::from(10)).with_cast_options(CastOptions { + safe: false, + ..Default::default() + }); + + let err = variant_get(&shredded_large_list_view_variant_array(), options).unwrap_err(); assert!(err.to_string().contains("Cannot access index '10'")); } @@ -2162,39 +2208,41 @@ mod test { ArrayRef::from(shredded) } - fn large_list_of_shredded_elements() -> LargeListArray { - let fallback_array: ArrayRef = { - let mut builder = VariantBuilder::new(); - builder.append_value(Variant::from(123i32)); - let (_, value_bytes) = builder.finish(); - Arc::new(BinaryViewArray::from(vec![ - None, - None, - None, - Some(value_bytes.as_slice()), - ])) - }; - let typed_array: ArrayRef = Arc::new(StringArray::from(vec![ - Some("comedy"), - Some("drama"), - Some("horror"), - None, + fn shredded_large_list_variant_array() -> ArrayRef { + let json_rows: ArrayRef = Arc::new(StringArray::from(vec![ + Some(r#"["comedy", "drama"]"#), + Some(r#"["horror", 123]"#), ])); + let input = json_to_variant(&json_rows).unwrap(); - let struct_array = StructArrayBuilder::new() - .with_field("value", fallback_array, true) - .with_field("typed_value", typed_array, true) - .build(); + let list_schema = DataType::LargeList(Arc::new(Field::new("item", DataType::Utf8, true))); + let shredded = shred_variant(&input, &list_schema).unwrap(); + ArrayRef::from(shredded) + } - LargeListArray::new( - Arc::new(Field::new_list_field( - struct_array.data_type().clone(), - true, - )), - OffsetBuffer::new(ScalarBuffer::from(vec![0_i64, 2, 4])), - Arc::new(struct_array), - None, - ) + fn shredded_list_view_variant_array() -> ArrayRef { + let json_rows: ArrayRef = Arc::new(StringArray::from(vec![ + Some(r#"["comedy", "drama"]"#), + Some(r#"["horror", 123]"#), + ])); + let input = json_to_variant(&json_rows).unwrap(); + + let list_schema = DataType::ListView(Arc::new(Field::new("item", DataType::Utf8, true))); + let shredded = shred_variant(&input, &list_schema).unwrap(); + ArrayRef::from(shredded) + } + + fn shredded_large_list_view_variant_array() -> ArrayRef { + let json_rows: ArrayRef = Arc::new(StringArray::from(vec![ + Some(r#"["comedy", "drama"]"#), + Some(r#"["horror", 123]"#), + ])); + let input = json_to_variant(&json_rows).unwrap(); + + let list_schema = + DataType::LargeListView(Arc::new(Field::new("item", DataType::Utf8, true))); + let shredded = shred_variant(&input, &list_schema).unwrap(); + ArrayRef::from(shredded) } /// Helper function to create a shredded variant array representing objects /// From 279b6347e92a3beeb046ea13678a7714e32b0acc Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Mon, 2 Mar 2026 14:09:30 -0500 Subject: [PATCH 31/36] match typed value instead of donwcast attempts --- parquet-variant-compute/src/variant_get.rs | 71 +++++++++++++--------- 1 file changed, 41 insertions(+), 30 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 6a1c4a419f23..13a7d4b30bcf 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -50,11 +50,19 @@ pub(crate) enum ShreddedPathStep { /// /// With `cast_options.safe = true`, out-of-bounds indices become nulls for those rows. /// With `cast_options.safe = false`, out-of-bounds indices return [`ArrowError::CastError`]. -fn take_list_like_index_as_shredding_state( - list_array: &L, +fn take_list_like_index_as_shredding_state( + typed_value: &dyn Array, index: usize, cast_options: &CastOptions, ) -> Result> { + let list_array = typed_value.as_any().downcast_ref::().ok_or_else(|| { + ArrowError::ComputeError(format!( + "Expected array type '{}' while handling list-like path step, got '{}'", + std::any::type_name::(), + typed_value.data_type() + )) + })?; + let values = list_array.values(); let Some(struct_array) = values.as_any().downcast_ref::() else { @@ -188,34 +196,37 @@ pub(crate) fn follow_shredded_path_element( Ok(ShreddedPathStep::Success(state)) } VariantPathElement::Index { index } => { - let state = if let Some(list_array) = - typed_value.as_any().downcast_ref::>() - { - take_list_like_index_as_shredding_state(list_array, *index, cast_options)? - } else if let Some(list_array) = - typed_value.as_any().downcast_ref::>() - { - take_list_like_index_as_shredding_state(list_array, *index, cast_options)? - } else if let Some(list_view_array) = typed_value - .as_any() - .downcast_ref::>() - { - take_list_like_index_as_shredding_state(list_view_array, *index, cast_options)? - } else if let Some(list_view_array) = typed_value - .as_any() - .downcast_ref::>() - { - take_list_like_index_as_shredding_state(list_view_array, *index, cast_options)? - } else if cast_options.safe { - // With safe cast options, return NULL (missing_path_step) - return Ok(missing_path_step()); - } else { - // Downcast failure - if strict cast options are enabled, this should be an error - return Err(ArrowError::CastError(format!( - "Cannot access index '{}' on non-list type: {}", - index, - typed_value.data_type() - ))); + let state = match typed_value.data_type() { + DataType::List(_) => take_list_like_index_as_shredding_state::< + GenericListArray, + >(typed_value.as_ref(), *index, cast_options)?, + DataType::LargeList(_) => take_list_like_index_as_shredding_state::< + GenericListArray, + >( + typed_value.as_ref(), *index, cast_options + )?, + DataType::ListView(_) => take_list_like_index_as_shredding_state::< + GenericListViewArray, + >( + typed_value.as_ref(), *index, cast_options + )?, + DataType::LargeListView(_) => take_list_like_index_as_shredding_state::< + GenericListViewArray, + >( + typed_value.as_ref(), *index, cast_options + )?, + _ if cast_options.safe => { + // With safe cast options, return NULL (missing_path_step) + return Ok(missing_path_step()); + } + _ => { + // Downcast failure - if strict cast options are enabled, this should be an error + return Err(ArrowError::CastError(format!( + "Cannot access index '{}' on non-list type: {}", + index, + typed_value.data_type() + ))); + } }; match state { From 18d04d718103fd1334b54bc01cad24c8dcb9d132 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Tue, 10 Mar 2026 11:25:15 -0400 Subject: [PATCH 32/36] clean up tests --- parquet-variant-compute/src/variant_get.rs | 253 ++++++++------------- 1 file changed, 91 insertions(+), 162 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 13a7d4b30bcf..1fd65d25b7d8 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -1979,141 +1979,75 @@ mod test { assert_eq!(&result, &expected); } - /// This test uses a pre-shredded list array and validates index-path access. - #[test] - fn test_shredded_list_index_access() { - let array = shredded_list_variant_array(); - // Test: Extract the 0 index field as VariantArray first - let options = GetOptions::new_with_path(VariantPath::from(0)); - let result = variant_get(&array, options).unwrap(); - let result_variant = VariantArray::try_new(&result).unwrap(); - assert_eq!(result_variant.len(), 2); - - // Row 0: expect 0 index = "comedy" - assert_eq!(result_variant.value(0), Variant::from("comedy")); - // Row 1: expect 0 index = "horror" - assert_eq!(result_variant.value(1), Variant::from("horror")); - } - - /// Test extracting shredded list field with type conversion. - #[test] - fn test_shredded_list_as_string() { - let array = shredded_list_variant_array(); - // Test: Extract the 0 index values as StringArray (type conversion) - let field = Field::new("typed_value", DataType::Utf8, false); - let options = GetOptions::new_with_path(VariantPath::from(0)) - .with_as_type(Some(FieldRef::from(field))); - let result = variant_get(&array, options).unwrap(); - // Should get StringArray - let expected: ArrayRef = Arc::new(StringArray::from(vec![Some("comedy"), Some("horror")])); - assert_eq!(&result, &expected); - } - - #[test] - fn test_shredded_list_index_access_from_value_field() { - let array = shredded_list_variant_array(); - // Index 1 maps to "drama" for row 0, and to fallback value 123 for row 1. - let options = GetOptions::new_with_path(VariantPath::from(1)); - let result = variant_get(&array, options).unwrap(); - let result_variant = VariantArray::try_new(&result).unwrap(); - - assert_eq!(result_variant.value(0), Variant::from("drama")); - assert_eq!(result_variant.value(1).as_int64(), Some(123)); - } - - #[test] - fn test_shredded_list_index_access_from_value_field_as_int64() { - let array = shredded_list_variant_array(); - let field = Field::new("typed_value", DataType::Int64, true); - let options = GetOptions::new_with_path(VariantPath::from(1)) - .with_as_type(Some(FieldRef::from(field))); - let result = variant_get(&array, options).unwrap(); - - // "drama" -> NULL, 123 -> 123. - let expected: ArrayRef = Arc::new(Int64Array::from(vec![None, Some(123)])); - assert_eq!(&result, &expected); - } - - #[test] - fn test_shredded_list_index_out_of_bounds_unsafe_cast_errors() { - let options = - GetOptions::new_with_path(VariantPath::from(10)).with_cast_options(CastOptions { - safe: false, - ..Default::default() - }); + type ShreddedListLikeArrayGen = fn() -> ArrayRef; + type ShreddedListLikeCase = (&'static str, ShreddedListLikeArrayGen); - let err = variant_get(&shredded_list_variant_array(), options.clone()).unwrap_err(); - assert!(err.to_string().contains("Cannot access index '10'")); + fn shredded_list_like_cases() -> [ShreddedListLikeCase; 4] { + [ + ("list", shredded_list_variant_array), + ("large_list", shredded_large_list_variant_array), + ("list_view", shredded_list_view_variant_array), + ("large_list_view", shredded_large_list_view_variant_array), + ] } #[test] - fn test_shredded_large_list_index_access_from_value_field() { - let array = shredded_large_list_variant_array(); - // Index 1 maps to "drama" for row 0, and to fallback value 123 for row 1. + fn test_shredded_list_like_index_access_from_value_field() { let options = GetOptions::new_with_path(VariantPath::from(1)); - let result = variant_get(&array, options).unwrap(); - let result_variant = VariantArray::try_new(&result).unwrap(); - - assert_eq!(result_variant.value(0), Variant::from("drama")); - assert_eq!(result_variant.value(1).as_int64(), Some(123)); - } - - #[test] - fn test_shredded_large_list_index_out_of_bounds_unsafe_cast_errors() { - let options = - GetOptions::new_with_path(VariantPath::from(10)).with_cast_options(CastOptions { - safe: false, - ..Default::default() - }); - - let err = variant_get(&shredded_large_list_variant_array(), options).unwrap_err(); - assert!(err.to_string().contains("Cannot access index '10'")); - } - #[test] - fn test_shredded_list_view_index_access_from_value_field() { - let array = shredded_list_view_variant_array(); - let options = GetOptions::new_with_path(VariantPath::from(1)); - let result = variant_get(&array, options).unwrap(); - let result_variant = VariantArray::try_new(&result).unwrap(); + for (case, array_gen) in shredded_list_like_cases() { + let array = array_gen(); + let result = variant_get(&array, options.clone()).unwrap(); + let result_variant = VariantArray::try_new(&result).unwrap(); - assert_eq!(result_variant.value(0), Variant::from("drama")); - assert_eq!(result_variant.value(1).as_int64(), Some(123)); + assert_eq!(result_variant.value(0), Variant::from("drama"), "{case}"); + assert_eq!(result_variant.value(1).as_int64(), Some(123), "{case}"); + } } #[test] - fn test_shredded_list_view_index_out_of_bounds_unsafe_cast_errors() { + fn test_shredded_list_like_index_out_of_bounds_unsafe_cast_errors() { let options = GetOptions::new_with_path(VariantPath::from(10)).with_cast_options(CastOptions { safe: false, ..Default::default() }); - let err = variant_get(&shredded_list_view_variant_array(), options).unwrap_err(); - assert!(err.to_string().contains("Cannot access index '10'")); + for (case, array_gen) in shredded_list_like_cases() { + let err = variant_get(&array_gen(), options.clone()).unwrap_err(); + assert!( + err.to_string().contains("Cannot access index '10'"), + "{case}" + ); + } } + /// Test extracting shredded list-like field with type conversion. #[test] - fn test_shredded_large_list_view_index_access_from_value_field() { - let array = shredded_large_list_view_variant_array(); - let options = GetOptions::new_with_path(VariantPath::from(1)); - let result = variant_get(&array, options).unwrap(); - let result_variant = VariantArray::try_new(&result).unwrap(); + fn test_shredded_list_like_as_string() { + let field = Field::new("typed_value", DataType::Utf8, false); + let options = GetOptions::new_with_path(VariantPath::from(0)) + .with_as_type(Some(FieldRef::from(field))); + let expected: ArrayRef = Arc::new(StringArray::from(vec![Some("comedy"), Some("horror")])); - assert_eq!(result_variant.value(0), Variant::from("drama")); - assert_eq!(result_variant.value(1).as_int64(), Some(123)); + for (case, array_gen) in shredded_list_like_cases() { + let result = variant_get(&array_gen(), options.clone()).unwrap(); + assert_eq!(&result, &expected, "{case}"); + } } #[test] - fn test_shredded_large_list_view_index_out_of_bounds_unsafe_cast_errors() { - let options = - GetOptions::new_with_path(VariantPath::from(10)).with_cast_options(CastOptions { - safe: false, - ..Default::default() - }); + fn test_shredded_list_like_index_access_from_value_field_as_int64() { + let field = Field::new("typed_value", DataType::Int64, true); + let options = GetOptions::new_with_path(VariantPath::from(1)) + .with_as_type(Some(FieldRef::from(field))); + let expected: ArrayRef = Arc::new(Int64Array::from(vec![None, Some(123)])); - let err = variant_get(&shredded_large_list_view_variant_array(), options).unwrap_err(); - assert!(err.to_string().contains("Cannot access index '10'")); + for (case, array_gen) in shredded_list_like_cases() { + let result = variant_get(&array_gen(), options.clone()).unwrap(); + // "drama" -> NULL, 123 -> 123. + assert_eq!(&result, &expected, "{case}"); + } } #[test] @@ -2159,49 +2093,18 @@ mod test { assert_eq!(&casted, &expected); } - /// Helper to create a shredded list variant array used by list index tests. + /// Helper to create a shredded list-like variant array used by list index tests. /// /// Rows: /// 1. `["comedy", "drama"]` (fully shred-able as `Utf8`) /// 2. `["horror", 123]` (partially shredded, with fallback for the numeric element) - fn shredded_list_variant_array() -> ArrayRef { + fn shredded_list_like_variant_array(list_schema: DataType) -> ArrayRef { let json_rows: ArrayRef = Arc::new(StringArray::from(vec![ Some(r#"["comedy", "drama"]"#), Some(r#"["horror", 123]"#), ])); let input = json_to_variant(&json_rows).unwrap(); - let list_schema = DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))); - let shredded = shred_variant(&input, &list_schema).unwrap(); - ArrayRef::from(shredded) - } - - fn shredded_struct_with_list_variant_array() -> ArrayRef { - let json_rows: ArrayRef = Arc::new(StringArray::from(vec![ - Some(r#"{"a": ["comedy", "drama"]}"#), - Some(r#"{"a": ["horror", 123]}"#), - ])); - let input = json_to_variant(&json_rows).unwrap(); - - let list_schema = DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))); - let shredding_schema = ShreddedSchemaBuilder::default() - .with_path("a", &list_schema) - .unwrap() - .build(); - let shredded = shred_variant(&input, &shredding_schema).unwrap(); - ArrayRef::from(shredded) - } - - fn shredded_list_of_struct_variant_array() -> ArrayRef { - let json_rows: ArrayRef = Arc::new(StringArray::from(vec![ - Some(r#"[{"x": 1}, {"x": 2}]"#), - Some(r#"[{"x": 3}, {"y": 4}]"#), - ])); - let input = json_to_variant(&json_rows).unwrap(); - - let struct_type = - DataType::Struct(Fields::from(vec![Field::new("x", DataType::Int32, true)])); - let list_schema = DataType::List(Arc::new(Field::new("item", struct_type, true))); let shredded = shred_variant(&input, &list_schema).unwrap(); ArrayRef::from(shredded) } @@ -2219,42 +2122,68 @@ mod test { ArrayRef::from(shredded) } - fn shredded_large_list_variant_array() -> ArrayRef { - let json_rows: ArrayRef = Arc::new(StringArray::from(vec![ - Some(r#"["comedy", "drama"]"#), - Some(r#"["horror", 123]"#), - ])); - let input = json_to_variant(&json_rows).unwrap(); + fn shredded_list_variant_array() -> ArrayRef { + shredded_list_like_variant_array(DataType::List(Arc::new(Field::new( + "item", + DataType::Utf8, + true, + )))) + } - let list_schema = DataType::LargeList(Arc::new(Field::new("item", DataType::Utf8, true))); - let shredded = shred_variant(&input, &list_schema).unwrap(); - ArrayRef::from(shredded) + fn shredded_large_list_variant_array() -> ArrayRef { + shredded_list_like_variant_array(DataType::LargeList(Arc::new(Field::new( + "item", + DataType::Utf8, + true, + )))) } fn shredded_list_view_variant_array() -> ArrayRef { + shredded_list_like_variant_array(DataType::ListView(Arc::new(Field::new( + "item", + DataType::Utf8, + true, + )))) + } + + fn shredded_large_list_view_variant_array() -> ArrayRef { + shredded_list_like_variant_array(DataType::LargeListView(Arc::new(Field::new( + "item", + DataType::Utf8, + true, + )))) + } + + fn shredded_struct_with_list_variant_array() -> ArrayRef { let json_rows: ArrayRef = Arc::new(StringArray::from(vec![ - Some(r#"["comedy", "drama"]"#), - Some(r#"["horror", 123]"#), + Some(r#"{"a": ["comedy", "drama"]}"#), + Some(r#"{"a": ["horror", 123]}"#), ])); let input = json_to_variant(&json_rows).unwrap(); - let list_schema = DataType::ListView(Arc::new(Field::new("item", DataType::Utf8, true))); - let shredded = shred_variant(&input, &list_schema).unwrap(); + let list_schema = DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))); + let shredding_schema = ShreddedSchemaBuilder::default() + .with_path("a", &list_schema) + .unwrap() + .build(); + let shredded = shred_variant(&input, &shredding_schema).unwrap(); ArrayRef::from(shredded) } - fn shredded_large_list_view_variant_array() -> ArrayRef { + fn shredded_list_of_struct_variant_array() -> ArrayRef { let json_rows: ArrayRef = Arc::new(StringArray::from(vec![ - Some(r#"["comedy", "drama"]"#), - Some(r#"["horror", 123]"#), + Some(r#"[{"x": 1}, {"x": 2}]"#), + Some(r#"[{"x": 3}, {"y": 4}]"#), ])); let input = json_to_variant(&json_rows).unwrap(); - let list_schema = - DataType::LargeListView(Arc::new(Field::new("item", DataType::Utf8, true))); + let struct_type = + DataType::Struct(Fields::from(vec![Field::new("x", DataType::Int32, true)])); + let list_schema = DataType::List(Arc::new(Field::new("item", struct_type, true))); let shredded = shred_variant(&input, &list_schema).unwrap(); ArrayRef::from(shredded) } + /// Helper function to create a shredded variant array representing objects /// /// This creates an array that represents: From 8d838cc3208164bb240e1dc5e173f4e564747b47 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Tue, 10 Mar 2026 11:30:44 -0400 Subject: [PATCH 33/36] ListLikeArray is now in arrow::array --- parquet-variant-compute/src/variant_get.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 1fd65d25b7d8..ae8e6ea16f8b 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -17,7 +17,7 @@ use arrow::{ array::{ self, Array, ArrayRef, BinaryViewArray, GenericListArray, GenericListViewArray, - StructArray, UInt64Array, + ListLikeArray, StructArray, UInt64Array, }, compute::{CastOptions, take}, datatypes::Field, @@ -27,7 +27,6 @@ use arrow_schema::{ArrowError, DataType, FieldRef}; use parquet_variant::{EMPTY_VARIANT_METADATA_BYTES, VariantPath, VariantPathElement}; use crate::VariantArray; -use crate::arrow_to_variant::ListLikeArray; use crate::variant_to_arrow::make_variant_to_arrow_row_builder; use crate::{ShreddingState, variant_array::StructArrayBuilder}; From b178554f35ad69a5e54f6845d7e8bab8ed0f2ce5 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Thu, 19 Mar 2026 10:58:52 -0400 Subject: [PATCH 34/36] fmt --- parquet-variant-compute/src/variant_get.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 96da692f524a..e6ac5ae00a37 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -462,9 +462,8 @@ mod test { use super::{GetOptions, variant_get}; use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder}; use crate::{ - - ShreddedSchemaBuilder, VariantArray, VariantArrayBuilder, cast_to_variant, json_to_variant, shred_variant, - , shred_variant, + ShreddedSchemaBuilder, VariantArray, VariantArrayBuilder, cast_to_variant, json_to_variant, + shred_variant, }; use arrow::array::{ Array, ArrayRef, AsArray, BinaryArray, BinaryViewArray, BooleanArray, Date32Array, From 66873a2bb4ae1b86c8c70831912b60790dd1fb61 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Tue, 24 Mar 2026 17:10:15 -0400 Subject: [PATCH 35/36] JSONpath semantics --- parquet-variant-compute/src/variant_get.rs | 79 ++++++++++++---------- 1 file changed, 42 insertions(+), 37 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index e6ac5ae00a37..afd7025092ba 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -47,12 +47,9 @@ pub(crate) enum ShreddedPathStep { /// Build the next shredding state by taking one list-like element (at `index`) per input row. /// -/// With `cast_options.safe = true`, out-of-bounds indices become nulls for those rows. -/// With `cast_options.safe = false`, out-of-bounds indices return [`ArrowError::CastError`]. fn take_list_like_index_as_shredding_state( typed_value: &dyn Array, index: usize, - cast_options: &CastOptions, ) -> Result> { let list_array = typed_value.as_any().downcast_ref::().ok_or_else(|| { ArrowError::ComputeError(format!( @@ -92,13 +89,8 @@ fn take_list_like_index_as_shredding_state( ArrowError::ComputeError("List-like index does not fit into u64".into()) })?; take_indices.push(Some(absolute_index)); - } else if cast_options.safe { - take_indices.push(None); } else { - return Err(ArrowError::CastError(format!( - "Cannot access index '{}' for row {} with list length {}", - index, row, len - ))); + take_indices.push(None); } } @@ -140,8 +132,9 @@ fn take_list_like_index_as_shredding_state( /// - List index out-of-bounds produces nulls for the corresponding rows. /// /// Unsafe-cast behavior (`cast_options.safe = false`): -/// - Type mismatch during path traversal returns [`ArrowError::CastError`]. -/// - List index out-of-bounds returns [`ArrowError::CastError`]. +/// - Field access on non-struct returns [`ArrowError::CastError`]. +/// - List index path steps follow JSONPath semantics and return missing/null for non-list or +/// out-of-bounds rows. pub(crate) fn follow_shredded_path_element( shredding_state: &ShreddingState, path_element: &VariantPathElement<'_>, @@ -198,33 +191,19 @@ pub(crate) fn follow_shredded_path_element( let state = match typed_value.data_type() { DataType::List(_) => take_list_like_index_as_shredding_state::< GenericListArray, - >(typed_value.as_ref(), *index, cast_options)?, + >(typed_value.as_ref(), *index)?, DataType::LargeList(_) => take_list_like_index_as_shredding_state::< GenericListArray, - >( - typed_value.as_ref(), *index, cast_options - )?, + >(typed_value.as_ref(), *index)?, DataType::ListView(_) => take_list_like_index_as_shredding_state::< GenericListViewArray, - >( - typed_value.as_ref(), *index, cast_options - )?, + >(typed_value.as_ref(), *index)?, DataType::LargeListView(_) => take_list_like_index_as_shredding_state::< GenericListViewArray, - >( - typed_value.as_ref(), *index, cast_options - )?, - _ if cast_options.safe => { - // With safe cast options, return NULL (missing_path_step) - return Ok(missing_path_step()); - } + >(typed_value.as_ref(), *index)?, _ => { - // Downcast failure - if strict cast options are enabled, this should be an error - return Err(ArrowError::CastError(format!( - "Cannot access index '{}' on non-list type: {}", - index, - typed_value.data_type() - ))); + // JSONPath semantics: indexing a non-list yields no match. + return Ok(missing_path_step()); } }; @@ -1906,7 +1885,7 @@ mod test { } #[test] - fn test_shredded_list_like_index_out_of_bounds_unsafe_cast_errors() { + fn test_shredded_list_like_index_out_of_bounds_unsafe_cast_returns_null() { let options = GetOptions::new_with_path(VariantPath::from(10)).with_cast_options(CastOptions { safe: false, @@ -1914,11 +1893,10 @@ mod test { }); for (case, array_gen) in shredded_list_like_cases() { - let err = variant_get(&array_gen(), options.clone()).unwrap_err(); - assert!( - err.to_string().contains("Cannot access index '10'"), - "{case}" - ); + let result = variant_get(&array_gen(), options.clone()).unwrap(); + let result_variant = VariantArray::try_new(&result).unwrap(); + assert_eq!(result_variant.value(0), Variant::Null, "{case}"); + assert_eq!(result_variant.value(1), Variant::Null, "{case}"); } } @@ -2828,6 +2806,33 @@ mod test { ); } + #[test] + fn test_strict_cast_options_index_on_non_list_returns_null() { + use arrow::compute::CastOptions; + use arrow::datatypes::{DataType, Field}; + use parquet_variant::VariantPath; + use std::sync::Arc; + + // Use existing test data that has Int32 typed_value at the top level. + let variant_array = perfectly_shredded_int32_variant_array(); + let options = GetOptions { + path: VariantPath::from(0), + as_type: Some(Arc::new(Field::new("result", DataType::Int32, true))), + cast_options: CastOptions { + safe: false, + ..Default::default() + }, + }; + + let variant_array_ref: Arc = variant_array.clone(); + let result = variant_get(&variant_array_ref, options).unwrap(); + + assert_eq!(result.len(), 3); + assert!(result.is_null(0)); + assert!(result.is_null(1)); + assert!(result.is_null(2)); + } + #[test] fn test_error_message_boolean_type_display() { let mut builder = VariantArrayBuilder::new(1); From 2135838994216e059c3c3ddc50f7f6dca30893f6 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Wed, 25 Mar 2026 11:33:47 -0400 Subject: [PATCH 36/36] fix index overflow by @scovich --- parquet-variant-compute/src/variant_get.rs | 17 ++--------------- parquet-variant/src/path.rs | 9 +++++++++ 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index afd7025092ba..205b001d07ea 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -77,21 +77,8 @@ fn take_list_like_index_as_shredding_state( let mut take_indices = Vec::with_capacity(list_array.len()); for row in 0..list_array.len() { let row_range = list_array.element_range(row); - let len = row_range.len(); - - if index < len { - let absolute_index = row_range.start.checked_add(index).ok_or_else(|| { - ArrowError::ComputeError( - "List-like index overflow while building take indices".into(), - ) - })?; - let absolute_index = u64::try_from(absolute_index).map_err(|_| { - ArrowError::ComputeError("List-like index does not fit into u64".into()) - })?; - take_indices.push(Some(absolute_index)); - } else { - take_indices.push(None); - } + let take_index = (index < row_range.len()).then(|| (row_range.start + index) as u64); + take_indices.push(take_index); } let index_array = UInt64Array::from(take_indices); diff --git a/parquet-variant/src/path.rs b/parquet-variant/src/path.rs index 8e68d9efadf2..0837e49f8abf 100644 --- a/parquet-variant/src/path.rs +++ b/parquet-variant/src/path.rs @@ -346,5 +346,14 @@ mod tests { err.to_string(), "Parser error: Invalid token in bracket request: `abc`. Expected a quoted string or a number(e.g., `['field']` or `[123]`)" ); + + // Out-of-range integer indexes are invalid path tokens. + let too_large_index = (usize::MAX as u128) + 1; + let err = VariantPath::try_from(format!("[{too_large_index}]").as_str()).unwrap_err(); + assert!( + err.to_string() + .contains("Parser error: Invalid token in bracket request"), + "{err}" + ); } }