Skip to content

Commit be21abf

Browse files
committed
Initial work
Signed-off-by: Adam Gutglick <adam@spiraldb.com>
1 parent aa9432c commit be21abf

6 files changed

Lines changed: 333 additions & 229 deletions

File tree

parquet-variant-compute/src/shred_variant.rs

Lines changed: 95 additions & 64 deletions
Original file line number | Diff line number | Diff line change
@@ -96,7 +96,7 @@ pub fn shred_variant(array: &VariantArray, as_type: &DataType) -> Result<Variant
9696
let (value, typed_value, nulls) = builder.finish()?;
9797
Ok(VariantArray::from_parts(
9898
array.metadata_field().clone(),
99-
Some(value),
99+
Some(Arc::new(value) as ArrayRef),
100100
Some(typed_value),
101101
nulls,
102102
))
@@ -408,8 +408,11 @@ impl<'a> VariantToShreddedObjectVariantRowBuilder<'a> {
408408
let mut builder = StructArrayBuilder::new();
409409
for (field_name, typed_value_builder) in self.typed_value_builders {
410410
let (value, typed_value, nulls) = typed_value_builder.finish()?;
411-
let array =
412-
ShreddedVariantFieldArray::from_parts(Some(value), Some(typed_value), nulls);
411+
let array = ShreddedVariantFieldArray::from_parts(
412+
Some(Arc::new(value) as ArrayRef),
413+
Some(typed_value),
414+
nulls,
415+
);
413416
builder = builder.with_field(field_name, ArrayRef::from(array), false);
414417
}
415418
if let Some(nulls) = self.typed_value_nulls.finish() {
@@ -654,6 +657,7 @@ impl VariantSchemaNode {
654657
mod tests {
655658
use super::*;
656659
use crate::VariantArrayBuilder;
660+
use crate::variant_array::binary_array_value;
657661
use arrow::array::{
658662
Array, BinaryViewArray, FixedSizeBinaryArray, Float64Array, GenericListArray,
659663
GenericListViewArray, Int64Array, LargeBinaryArray, LargeStringArray, ListArray,
@@ -826,7 +830,8 @@ mod tests {
826830
) {
827831
assert_eq!(array.len(), expected_len);
828832

829-
let fallbacks = (array.value_field().unwrap(), Some(array.metadata_field()));
833+
let fallback_value = array.value_field().unwrap();
834+
let fallback_metadata = array.metadata_field();
830835
let array = downcast_list_like_array::<O>(array);
831836

832837
assert_eq!(
@@ -846,7 +851,7 @@ mod tests {
846851
);
847852
assert_eq!(
848853
array.len(),
849-
fallbacks.0.len(),
854+
fallback_value.len(),
850855
"fallbacks value field should match array length"
851856
);
852857

@@ -861,23 +866,28 @@ mod tests {
861866
// Successfully shredded: typed list value present, no fallback value
862867
assert!(array.is_valid(idx));
863868
assert_eq!(array.value_size(idx), *len);
864-
assert!(fallbacks.0.is_null(idx));
869+
assert!(fallback_value.is_null(idx));
865870
}
866871
None => {
867872
// Unable to shred: typed list value absent, fallback should carry the variant
868873
assert!(array.is_null(idx));
869874
assert_eq!(array.value_size(idx), O::zero());
870875
match expected_fallback {
871876
Some(expected_variant) => {
872-
assert!(fallbacks.0.is_valid(idx));
873-
let metadata_bytes = fallbacks
874-
.1
875-
.filter(|m| m.is_valid(idx))
876-
.map(|m| m.value(idx))
877-
.filter(|bytes| !bytes.is_empty())
878-
.unwrap_or(EMPTY_VARIANT_METADATA_BYTES);
877+
assert!(fallback_value.is_valid(idx));
878+
let metadata_bytes =
879+
binary_array_value(fallback_metadata.as_ref(), idx);
880+
let metadata_bytes =
881+
if fallback_metadata.is_valid(idx) && !metadata_bytes.is_empty() {
882+
metadata_bytes
883+
} else {
884+
EMPTY_VARIANT_METADATA_BYTES
885+
};
879886
assert_eq!(
880-
Variant::new(metadata_bytes, fallbacks.0.value(idx)),
887+
Variant::new(
888+
metadata_bytes,
889+
binary_array_value(fallback_value.as_ref(), idx)
890+
),
881891
expected_variant.clone()
882892
);
883893
}
@@ -940,7 +950,10 @@ mod tests {
940950
Some(expected_variant) => {
941951
assert!(element_fallbacks.is_valid(idx));
942952
assert_eq!(
943-
Variant::new(EMPTY_VARIANT_METADATA_BYTES, element_fallbacks.value(idx)),
953+
Variant::new(
954+
EMPTY_VARIANT_METADATA_BYTES,
955+
binary_array_value(element_fallbacks.as_ref(), idx)
956+
),
944957
expected_variant.clone()
945958
);
946959
}
@@ -971,7 +984,7 @@ mod tests {
971984
#[test]
972985
fn test_all_null_input() {
973986
// Create VariantArray with no value field (all null case)
974-
let metadata = BinaryViewArray::from_iter_values([&[1u8, 0u8]]); // minimal valid metadata
987+
let metadata = Arc::new(BinaryViewArray::from_iter_values([&[1u8, 0u8]])) as ArrayRef; // minimal valid metadata
975988
let all_null_array = VariantArray::from_parts(metadata, None, None, None);
976989
let result = shred_variant(&all_null_array, &DataType::Int64).unwrap();
977990

@@ -1085,7 +1098,10 @@ mod tests {
10851098
assert!(!value_field.is_null(1)); // value should contain original
10861099
assert!(typed_value_field.is_null(1)); // typed_value should be null
10871100
assert_eq!(
1088-
Variant::new(metadata_field.value(1), value_field.value(1)),
1101+
Variant::new(
1102+
binary_array_value(metadata_field.as_ref(), 1),
1103+
binary_array_value(value_field.as_ref(), 1)
1104+
),
10891105
Variant::from("hello")
10901106
);
10911107

@@ -1101,7 +1117,10 @@ mod tests {
11011117
assert!(!result.is_null(4));
11021118
assert!(!value_field.is_null(4)); // should contain Variant::Null
11031119
assert_eq!(
1104-
Variant::new(metadata_field.value(4), value_field.value(4)),
1120+
Variant::new(
1121+
binary_array_value(metadata_field.as_ref(), 4),
1122+
binary_array_value(value_field.as_ref(), 4)
1123+
),
11051124
Variant::Null
11061125
);
11071126
assert!(typed_value_field.is_null(4));
@@ -1178,7 +1197,10 @@ mod tests {
11781197
assert!(value.is_valid(1));
11791198
assert!(typed_value.is_null(1));
11801199
assert_eq!(
1181-
Variant::new(metadata.value(1), value.value(1)),
1200+
Variant::new(
1201+
binary_array_value(metadata.as_ref(), 1),
1202+
binary_array_value(value.as_ref(), 1)
1203+
),
11821204
Variant::from(42i64)
11831205
);
11841206

@@ -1192,7 +1214,10 @@ mod tests {
11921214
assert!(value.is_valid(3));
11931215
assert!(typed_value.is_null(3));
11941216
assert_eq!(
1195-
Variant::new(metadata.value(3), value.value(3)),
1217+
Variant::new(
1218+
binary_array_value(metadata.as_ref(), 3),
1219+
binary_array_value(value.as_ref(), 3)
1220+
),
11961221
Variant::Null
11971222
);
11981223

@@ -1234,7 +1259,10 @@ mod tests {
12341259
assert!(value.is_valid(1));
12351260
assert!(typed_value.is_null(1));
12361261
assert_eq!(
1237-
Variant::new(metadata.value(1), value.value(1)),
1262+
Variant::new(
1263+
binary_array_value(metadata.as_ref(), 1),
1264+
binary_array_value(value.as_ref(), 1)
1265+
),
12381266
Variant::from("not_binary")
12391267
);
12401268

@@ -1248,7 +1276,10 @@ mod tests {
12481276
assert!(value.is_valid(3));
12491277
assert!(typed_value.is_null(3));
12501278
assert_eq!(
1251-
Variant::new(metadata.value(3), value.value(3)),
1279+
Variant::new(
1280+
binary_array_value(metadata.as_ref(), 3),
1281+
binary_array_value(value.as_ref(), 3)
1282+
),
12521283
Variant::Null
12531284
);
12541285

@@ -1541,14 +1572,14 @@ mod tests {
15411572
.unwrap();
15421573
let outer_fallbacks = outer_elements.value_field().unwrap();
15431574

1544-
let outer_metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(
1575+
let outer_metadata = Arc::new(BinaryViewArray::from_iter_values(std::iter::repeat_n(
15451576
EMPTY_VARIANT_METADATA_BYTES,
15461577
outer_elements.len(),
1547-
));
1578+
))) as ArrayRef;
15481579
let outer_variant = VariantArray::from_parts(
15491580
outer_metadata,
15501581
Some(outer_fallbacks.clone()),
1551-
Some(Arc::new(outer_values.clone())),
1582+
Some(Arc::new(outer_values.clone()) as ArrayRef),
15521583
None,
15531584
);
15541585

@@ -1651,7 +1682,10 @@ mod tests {
16511682
// null is stored as Variant::Null in values
16521683
assert!(id_values.is_valid(1));
16531684
assert_eq!(
1654-
Variant::new(EMPTY_VARIANT_METADATA_BYTES, id_values.value(1)),
1685+
Variant::new(
1686+
EMPTY_VARIANT_METADATA_BYTES,
1687+
binary_array_value(id_values.as_ref(), 1)
1688+
),
16551689
Variant::Null
16561690
);
16571691
assert!(id_typed_values.is_null(1));
@@ -1725,7 +1759,6 @@ mod tests {
17251759
assert_eq!(result.len(), 9);
17261760

17271761
let metadata = result.metadata_field();
1728-
17291762
let value = result.value_field().unwrap();
17301763
let typed_value = result
17311764
.typed_value_field()
@@ -1741,24 +1774,14 @@ mod tests {
17411774
let age_field =
17421775
ShreddedVariantFieldArray::try_new(typed_value.column_by_name("age").unwrap()).unwrap();
17431776

1744-
let score_value = score_field
1745-
.value_field()
1746-
.unwrap()
1747-
.as_any()
1748-
.downcast_ref::<BinaryViewArray>()
1749-
.unwrap();
1777+
let score_value = score_field.value_field().unwrap();
17501778
let score_typed_value = score_field
17511779
.typed_value_field()
17521780
.unwrap()
17531781
.as_any()
17541782
.downcast_ref::<Float64Array>()
17551783
.unwrap();
1756-
let age_value = age_field
1757-
.value_field()
1758-
.unwrap()
1759-
.as_any()
1760-
.downcast_ref::<BinaryViewArray>()
1761-
.unwrap();
1784+
let age_value = age_field.value_field().unwrap();
17621785
let age_typed_value = age_field
17631786
.typed_value_field()
17641787
.unwrap()
@@ -1777,10 +1800,13 @@ mod tests {
17771800
}
17781801
fn get_value<'m, 'v>(
17791802
i: usize,
1780-
metadata: &'m BinaryViewArray,
1781-
value: &'v BinaryViewArray,
1803+
metadata: &'m dyn Array,
1804+
value: &'v dyn Array,
17821805
) -> Variant<'m, 'v> {
1783-
Variant::new(metadata.value(i), value.value(i))
1806+
Variant::new(
1807+
binary_array_value(metadata, i),
1808+
binary_array_value(value, i),
1809+
)
17841810
}
17851811
let expect = |i, expected_result: Option<ShreddedValue<ShreddedStruct>>| {
17861812
match expected_result {
@@ -1792,7 +1818,10 @@ mod tests {
17921818
match expected_value {
17931819
Some(expected_value) => {
17941820
assert!(value.is_valid(i));
1795-
assert_eq!(expected_value, get_value(i, metadata, value));
1821+
assert_eq!(
1822+
expected_value,
1823+
get_value(i, metadata.as_ref(), value.as_ref())
1824+
);
17961825
}
17971826
None => {
17981827
assert!(value.is_null(i));
@@ -1811,7 +1840,7 @@ mod tests {
18111840
assert!(score_value.is_valid(i));
18121841
assert_eq!(
18131842
expected_score_value,
1814-
get_value(i, metadata, score_value)
1843+
get_value(i, metadata.as_ref(), score_value.as_ref())
18151844
);
18161845
}
18171846
None => {
@@ -1832,7 +1861,7 @@ mod tests {
18321861
assert!(age_value.is_valid(i));
18331862
assert_eq!(
18341863
expected_age_value,
1835-
get_value(i, metadata, age_value)
1864+
get_value(i, metadata.as_ref(), age_value.as_ref())
18361865
);
18371866
}
18381867
None => {
@@ -1973,7 +2002,7 @@ mod tests {
19732002
// Helper to correctly create a variant object using a row's existing metadata
19742003
let object_with_foo_field = |i| {
19752004
use parquet_variant::{ParentState, ValueBuilder, VariantMetadata};
1976-
let metadata = VariantMetadata::new(metadata.value(i));
2005+
let metadata = VariantMetadata::new(binary_array_value(metadata.as_ref(), i));
19772006
let mut metadata_builder = ReadOnlyMetadataBuilder::new(&metadata);
19782007
let mut value_builder = ValueBuilder::new();
19792008
let state = ParentState::variant(&mut value_builder, &mut metadata_builder);
@@ -2072,7 +2101,10 @@ mod tests {
20722101
assert!(value_field.is_null(2));
20732102
assert!(value_field.is_valid(3));
20742103
assert_eq!(
2075-
Variant::new(result.metadata_field().value(3), value_field.value(3)),
2104+
Variant::new(
2105+
binary_array_value(result.metadata_field().as_ref(), 3),
2106+
binary_array_value(value_field.as_ref(), 3)
2107+
),
20762108
Variant::from("not an object")
20772109
);
20782110
assert!(value_field.is_null(4));
@@ -2090,10 +2122,10 @@ mod tests {
20902122
.unwrap();
20912123
assert_list_structure_and_elements::<Int64Type, i32>(
20922124
&VariantArray::from_parts(
2093-
BinaryViewArray::from_iter_values(std::iter::repeat_n(
2125+
Arc::new(BinaryViewArray::from_iter_values(std::iter::repeat_n(
20942126
EMPTY_VARIANT_METADATA_BYTES,
20952127
scores_field.len(),
2096-
)),
2128+
))) as ArrayRef,
20972129
Some(scores_field.value_field().unwrap().clone()),
20982130
Some(scores_field.typed_value_field().unwrap().clone()),
20992131
None,
@@ -2215,24 +2247,14 @@ mod tests {
22152247
ShreddedVariantFieldArray::try_new(typed_value.column_by_name("session_id").unwrap())
22162248
.unwrap();
22172249

2218-
let id_value = id_field
2219-
.value_field()
2220-
.unwrap()
2221-
.as_any()
2222-
.downcast_ref::<BinaryViewArray>()
2223-
.unwrap();
2250+
let id_value = id_field.value_field().unwrap();
22242251
let id_typed_value = id_field
22252252
.typed_value_field()
22262253
.unwrap()
22272254
.as_any()
22282255
.downcast_ref::<FixedSizeBinaryArray>()
22292256
.unwrap();
2230-
let session_id_value = session_id_field
2231-
.value_field()
2232-
.unwrap()
2233-
.as_any()
2234-
.downcast_ref::<BinaryViewArray>()
2235-
.unwrap();
2257+
let session_id_value = session_id_field.value_field().unwrap();
22362258
let session_id_typed_value = session_id_field
22372259
.typed_value_field()
22382260
.unwrap()
@@ -2269,7 +2291,10 @@ mod tests {
22692291
assert_eq!(session_id_typed_value.value(1), mock_uuid_3.as_bytes());
22702292

22712293
// Verify the value field contains the name field
2272-
let row_1_variant = Variant::new(metadata.value(1), value.value(1));
2294+
let row_1_variant = Variant::new(
2295+
binary_array_value(metadata.as_ref(), 1),
2296+
binary_array_value(value.as_ref(), 1),
2297+
);
22732298
let Variant::Object(obj) = row_1_variant else {
22742299
panic!("Expected object");
22752300
};
@@ -2301,7 +2326,10 @@ mod tests {
23012326

23022327
assert!(session_id_value.is_valid(3)); // type mismatch, stored in value
23032328
assert!(session_id_typed_value.is_null(3));
2304-
let session_id_variant = Variant::new(metadata.value(3), session_id_value.value(3));
2329+
let session_id_variant = Variant::new(
2330+
binary_array_value(metadata.as_ref(), 3),
2331+
binary_array_value(session_id_value.as_ref(), 3),
2332+
);
23052333
assert_eq!(session_id_variant, Variant::from("not-a-uuid"));
23062334

23072335
// Row 4: Type mismatch - id is int64, not UUID
@@ -2312,7 +2340,10 @@ mod tests {
23122340

23132341
assert!(id_value.is_valid(4)); // type mismatch, stored in value
23142342
assert!(id_typed_value.is_null(4));
2315-
let id_variant = Variant::new(metadata.value(4), id_value.value(4));
2343+
let id_variant = Variant::new(
2344+
binary_array_value(metadata.as_ref(), 4),
2345+
binary_array_value(id_value.as_ref(), 4),
2346+
);
23162347
assert_eq!(id_variant, Variant::from(12345i64));
23172348

23182349
assert!(session_id_value.is_null(4));

0 commit comments

Comments
 (0)