Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 96 additions & 65 deletions parquet-variant-compute/src/shred_variant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ pub fn shred_variant(array: &VariantArray, as_type: &DataType) -> Result<Variant
let (value, typed_value, nulls) = builder.finish()?;
Ok(VariantArray::from_parts(
array.metadata_field().clone(),
Some(value),
Some(Arc::new(value) as ArrayRef),
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: I'm pretty sure the cast is unnecessary (automatic coercion by compiler)

(several more below)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've cleaned up all the ones I've added, but without tooling support for this lint its mostly a style thing

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: The main reason I pick on casts is because they're too powerful and can easily be quietly misused. Ideally, a cast is only present if actually required, as a clear marker to reviewer/reader that something unusual is going on.

Similar story for unnecessary type annotations, except those are just noise/bloat rather than a correctness risk.

My AI coding assistant sure loves splattering both all over the code it writes tho, super annoying.

Some(typed_value),
nulls,
))
Expand Down Expand Up @@ -443,8 +443,11 @@ impl<'a> VariantToShreddedObjectVariantRowBuilder<'a> {
let mut builder = StructArrayBuilder::new();
for (field_name, typed_value_builder) in self.typed_value_builders {
let (value, typed_value, nulls) = typed_value_builder.finish()?;
let array =
ShreddedVariantFieldArray::from_parts(Some(value), Some(typed_value), nulls);
let array = ShreddedVariantFieldArray::from_parts(
Some(Arc::new(value) as ArrayRef),
Some(typed_value),
nulls,
);
builder = builder.with_field(field_name, ArrayRef::from(array), false);
}
if let Some(nulls) = self.typed_value_nulls.finish() {
Expand Down Expand Up @@ -689,6 +692,7 @@ impl VariantSchemaNode {
mod tests {
use super::*;
use crate::VariantArrayBuilder;
use crate::variant_array::binary_array_value;
use arrow::array::{
Array, BinaryViewArray, FixedSizeBinaryArray, Float64Array, GenericListArray,
GenericListViewArray, Int64Array, LargeBinaryArray, LargeStringArray, ListArray,
Expand Down Expand Up @@ -867,7 +871,8 @@ mod tests {
) {
assert_eq!(array.len(), expected_len);

let fallbacks = (array.value_field().unwrap(), Some(array.metadata_field()));
let fallback_value = array.value_field().unwrap();
let fallback_metadata = array.metadata_field();
let array = downcast_list_like_array::<O>(array);

assert_eq!(
Expand All @@ -887,7 +892,7 @@ mod tests {
);
assert_eq!(
array.len(),
fallbacks.0.len(),
fallback_value.len(),
"fallbacks value field should match array length"
);

Expand All @@ -902,28 +907,33 @@ mod tests {
// Successfully shredded: typed list value present, no fallback value
assert!(array.is_valid(idx));
assert_eq!(array.value_size(idx), *len);
assert!(fallbacks.0.is_null(idx));
assert!(fallback_value.is_null(idx));
}
None => {
// Unable to shred: typed list value absent, fallback should carry the variant
assert!(array.is_null(idx));
assert_eq!(array.value_size(idx), O::zero());
match expected_fallback {
Some(expected_variant) => {
assert!(fallbacks.0.is_valid(idx));
let metadata_bytes = fallbacks
.1
.filter(|m| m.is_valid(idx))
.map(|m| m.value(idx))
.filter(|bytes| !bytes.is_empty())
.unwrap_or(EMPTY_VARIANT_METADATA_BYTES);
assert!(fallback_value.is_valid(idx));
let metadata_bytes =
binary_array_value(fallback_metadata.as_ref(), idx).unwrap();
let metadata_bytes =
if fallback_metadata.is_valid(idx) && !metadata_bytes.is_empty() {
metadata_bytes
} else {
EMPTY_VARIANT_METADATA_BYTES
};
assert_eq!(
Variant::new(metadata_bytes, fallbacks.0.value(idx)),
Variant::new(
metadata_bytes,
binary_array_value(fallback_value.as_ref(), idx).unwrap()
),
expected_variant.clone()
);
}
None => {
assert!(fallbacks.0.is_null(idx));
assert!(fallback_value.is_null(idx));
}
}
}
Expand Down Expand Up @@ -983,7 +993,10 @@ mod tests {
Some(expected_variant) => {
assert!(element_fallbacks.is_valid(idx));
assert_eq!(
Variant::new(EMPTY_VARIANT_METADATA_BYTES, element_fallbacks.value(idx)),
Variant::new(
EMPTY_VARIANT_METADATA_BYTES,
binary_array_value(element_fallbacks.as_ref(), idx).unwrap()
),
expected_variant.clone()
);
}
Expand Down Expand Up @@ -1129,7 +1142,7 @@ mod tests {
#[test]
fn test_all_null_input() {
// Create VariantArray with no value field (all null case)
let metadata = BinaryViewArray::from_iter_values([&[1u8, 0u8]]); // minimal valid metadata
let metadata = Arc::new(BinaryViewArray::from_iter_values([&[1u8, 0u8]])) as ArrayRef; // minimal valid metadata
let all_null_array = VariantArray::from_parts(metadata, None, None, None);
let result = shred_variant(&all_null_array, &DataType::Int64).unwrap();

Expand Down Expand Up @@ -1243,7 +1256,10 @@ mod tests {
assert!(!value_field.is_null(1)); // value should contain original
assert!(typed_value_field.is_null(1)); // typed_value should be null
assert_eq!(
Variant::new(metadata_field.value(1), value_field.value(1)),
Variant::new(
binary_array_value(metadata_field.as_ref(), 1).unwrap(),
binary_array_value(value_field.as_ref(), 1).unwrap()
),
Variant::from("hello")
);

Expand All @@ -1259,7 +1275,10 @@ mod tests {
assert!(!result.is_null(4));
assert!(!value_field.is_null(4)); // should contain Variant::Null
assert_eq!(
Variant::new(metadata_field.value(4), value_field.value(4)),
Variant::new(
binary_array_value(metadata_field.as_ref(), 4).unwrap(),
binary_array_value(value_field.as_ref(), 4).unwrap()
),
Variant::Null
);
assert!(typed_value_field.is_null(4));
Expand Down Expand Up @@ -1336,7 +1355,10 @@ mod tests {
assert!(value.is_valid(1));
assert!(typed_value.is_null(1));
assert_eq!(
Variant::new(metadata.value(1), value.value(1)),
Variant::new(
binary_array_value(metadata.as_ref(), 1).unwrap(),
binary_array_value(value.as_ref(), 1).unwrap()
),
Variant::from(42i64)
);

Expand All @@ -1350,7 +1372,10 @@ mod tests {
assert!(value.is_valid(3));
assert!(typed_value.is_null(3));
assert_eq!(
Variant::new(metadata.value(3), value.value(3)),
Variant::new(
binary_array_value(metadata.as_ref(), 3).unwrap(),
binary_array_value(value.as_ref(), 3).unwrap()
),
Variant::Null
);

Expand Down Expand Up @@ -1392,7 +1417,10 @@ mod tests {
assert!(value.is_valid(1));
assert!(typed_value.is_null(1));
assert_eq!(
Variant::new(metadata.value(1), value.value(1)),
Variant::new(
binary_array_value(metadata.as_ref(), 1).unwrap(),
binary_array_value(value.as_ref(), 1).unwrap()
),
Variant::from("not_binary")
);

Expand All @@ -1406,7 +1434,10 @@ mod tests {
assert!(value.is_valid(3));
assert!(typed_value.is_null(3));
assert_eq!(
Variant::new(metadata.value(3), value.value(3)),
Variant::new(
binary_array_value(metadata.as_ref(), 3).unwrap(),
binary_array_value(value.as_ref(), 3).unwrap()
),
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are a lot of these, spread across several test modules. Do you think it would be helpful to define a test helper function that takes a pair of arrays (applying deref) plus an index, and encapsulates the binary array value+unwrap calls?

assert_eq!(
    variant_from_arrays_at(metadata, value, 3), 
    Ok(Variant::Null)
);

Variant::Null
);

Expand Down Expand Up @@ -1682,14 +1713,14 @@ mod tests {
.unwrap();
let outer_fallbacks = outer_elements.value_field().unwrap();

let outer_metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(
let outer_metadata = Arc::new(BinaryViewArray::from_iter_values(std::iter::repeat_n(
EMPTY_VARIANT_METADATA_BYTES,
outer_elements.len(),
));
))) as ArrayRef;
let outer_variant = VariantArray::from_parts(
outer_metadata,
Some(outer_fallbacks.clone()),
Some(Arc::new(outer_values.clone())),
Some(Arc::new(outer_values.clone()) as ArrayRef),
None,
);

Expand Down Expand Up @@ -1792,7 +1823,10 @@ mod tests {
// null is stored as Variant::Null in values
assert!(id_values.is_valid(1));
assert_eq!(
Variant::new(EMPTY_VARIANT_METADATA_BYTES, id_values.value(1)),
Variant::new(
EMPTY_VARIANT_METADATA_BYTES,
binary_array_value(id_values.as_ref(), 1).unwrap()
),
Variant::Null
);
assert!(id_typed_values.is_null(1));
Expand Down Expand Up @@ -1866,7 +1900,6 @@ mod tests {
assert_eq!(result.len(), 9);

let metadata = result.metadata_field();

let value = result.value_field().unwrap();
let typed_value = result
.typed_value_field()
Expand All @@ -1882,24 +1915,14 @@ mod tests {
let age_field =
ShreddedVariantFieldArray::try_new(typed_value.column_by_name("age").unwrap()).unwrap();

let score_value = score_field
.value_field()
.unwrap()
.as_any()
.downcast_ref::<BinaryViewArray>()
.unwrap();
let score_value = score_field.value_field().unwrap();
let score_typed_value = score_field
.typed_value_field()
.unwrap()
.as_any()
.downcast_ref::<Float64Array>()
.unwrap();
let age_value = age_field
.value_field()
.unwrap()
.as_any()
.downcast_ref::<BinaryViewArray>()
.unwrap();
let age_value = age_field.value_field().unwrap();
let age_typed_value = age_field
.typed_value_field()
.unwrap()
Expand All @@ -1918,10 +1941,13 @@ mod tests {
}
fn get_value<'m, 'v>(
i: usize,
metadata: &'m BinaryViewArray,
value: &'v BinaryViewArray,
metadata: &'m dyn Array,
value: &'v dyn Array,
) -> Variant<'m, 'v> {
Variant::new(metadata.value(i), value.value(i))
Variant::new(
binary_array_value(metadata, i).unwrap(),
binary_array_value(value, i).unwrap(),
)
}
let expect = |i, expected_result: Option<ShreddedValue<ShreddedStruct>>| {
match expected_result {
Expand All @@ -1933,7 +1959,10 @@ mod tests {
match expected_value {
Some(expected_value) => {
assert!(value.is_valid(i));
assert_eq!(expected_value, get_value(i, metadata, value));
assert_eq!(
expected_value,
get_value(i, metadata.as_ref(), value.as_ref())
);
}
None => {
assert!(value.is_null(i));
Expand All @@ -1952,7 +1981,7 @@ mod tests {
assert!(score_value.is_valid(i));
assert_eq!(
expected_score_value,
get_value(i, metadata, score_value)
get_value(i, metadata.as_ref(), score_value.as_ref())
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does deref coercion somehow not kick in here?
metadata: &Arc<dyn Array> (returned by VariantArray::metadata_field) should coerce automatically to &dyn Array.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah... that only works for concrete types, not dyn trait refs. Blech.

);
}
None => {
Expand All @@ -1973,7 +2002,7 @@ mod tests {
assert!(age_value.is_valid(i));
assert_eq!(
expected_age_value,
get_value(i, metadata, age_value)
get_value(i, metadata.as_ref(), age_value.as_ref())
);
}
None => {
Expand Down Expand Up @@ -2114,7 +2143,7 @@ mod tests {
// Helper to correctly create a variant object using a row's existing metadata
let object_with_foo_field = |i| {
use parquet_variant::{ParentState, ValueBuilder, VariantMetadata};
let metadata = VariantMetadata::new(metadata.value(i));
let metadata = VariantMetadata::new(binary_array_value(metadata.as_ref(), i).unwrap());
let mut metadata_builder = ReadOnlyMetadataBuilder::new(&metadata);
let mut value_builder = ValueBuilder::new();
let state = ParentState::variant(&mut value_builder, &mut metadata_builder);
Expand Down Expand Up @@ -2213,7 +2242,10 @@ mod tests {
assert!(value_field.is_null(2));
assert!(value_field.is_valid(3));
assert_eq!(
Variant::new(result.metadata_field().value(3), value_field.value(3)),
Variant::new(
binary_array_value(result.metadata_field().as_ref(), 3).unwrap(),
binary_array_value(value_field.as_ref(), 3).unwrap()
),
Comment thread
AdamGS marked this conversation as resolved.
Outdated
Variant::from("not an object")
);
assert!(value_field.is_null(4));
Expand All @@ -2231,10 +2263,10 @@ mod tests {
.unwrap();
assert_list_structure_and_elements::<Int64Type, i32>(
&VariantArray::from_parts(
BinaryViewArray::from_iter_values(std::iter::repeat_n(
Arc::new(BinaryViewArray::from_iter_values(std::iter::repeat_n(
EMPTY_VARIANT_METADATA_BYTES,
scores_field.len(),
)),
))) as ArrayRef,
Some(scores_field.value_field().unwrap().clone()),
Some(scores_field.typed_value_field().unwrap().clone()),
None,
Expand Down Expand Up @@ -2350,24 +2382,14 @@ mod tests {
ShreddedVariantFieldArray::try_new(typed_value.column_by_name("session_id").unwrap())
.unwrap();

let id_value = id_field
.value_field()
.unwrap()
.as_any()
.downcast_ref::<BinaryViewArray>()
.unwrap();
let id_value = id_field.value_field().unwrap();
let id_typed_value = id_field
.typed_value_field()
.unwrap()
.as_any()
.downcast_ref::<FixedSizeBinaryArray>()
.unwrap();
let session_id_value = session_id_field
.value_field()
.unwrap()
.as_any()
.downcast_ref::<BinaryViewArray>()
.unwrap();
let session_id_value = session_id_field.value_field().unwrap();
let session_id_typed_value = session_id_field
.typed_value_field()
.unwrap()
Expand Down Expand Up @@ -2404,7 +2426,10 @@ mod tests {
assert_eq!(session_id_typed_value.value(1), mock_uuid_3.as_bytes());

// Verify the value field contains the name field
let row_1_variant = Variant::new(metadata.value(1), value.value(1));
let row_1_variant = Variant::new(
binary_array_value(metadata.as_ref(), 1).unwrap(),
binary_array_value(value.as_ref(), 1).unwrap(),
);
let Variant::Object(obj) = row_1_variant else {
panic!("Expected object");
};
Expand Down Expand Up @@ -2436,7 +2461,10 @@ mod tests {

assert!(session_id_value.is_valid(3)); // type mismatch, stored in value
assert!(session_id_typed_value.is_null(3));
let session_id_variant = Variant::new(metadata.value(3), session_id_value.value(3));
let session_id_variant = Variant::new(
binary_array_value(metadata.as_ref(), 3).unwrap(),
binary_array_value(session_id_value.as_ref(), 3).unwrap(),
);
assert_eq!(session_id_variant, Variant::from("not-a-uuid"));

// Row 4: Type mismatch - id is int64, not UUID
Expand All @@ -2447,7 +2475,10 @@ mod tests {

assert!(id_value.is_valid(4)); // type mismatch, stored in value
assert!(id_typed_value.is_null(4));
let id_variant = Variant::new(metadata.value(4), id_value.value(4));
let id_variant = Variant::new(
binary_array_value(metadata.as_ref(), 4).unwrap(),
binary_array_value(id_value.as_ref(), 4).unwrap(),
);
assert_eq!(id_variant, Variant::from(12345i64));

assert!(session_id_value.is_null(4));
Expand Down
Loading
Loading