diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 83df6879aa5..ad299dbc2ab 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. use arrow::{ - array::{self, Array, ArrayRef, BinaryViewArray, StructArray}, + array::{self, Array, ArrayRef, BinaryViewArray, StructArray, make_array}, + buffer::NullBuffer, compute::CastOptions, datatypes::Field, error::Result, @@ -261,8 +262,24 @@ fn try_perfect_shredding(variant_array: &VariantArray, as_field: &Field) -> Opti // 2. If every row in the `value` column is null // This is a perfect shredding, where the value is entirely shredded out, - // so we can just return the typed value. - return Some(typed_value.clone()); + // so we can just return the typed value after merging the accumulated nulls. + let parent_nulls = variant_array.nulls(); + + // If we have no nulls OR the shredded array is `Null`, which doesn't support external nulls. + let target_array = if parent_nulls.is_none() || typed_value.data_type().is_null() { + typed_value.clone() + } else { + let merged_nulls = NullBuffer::union(parent_nulls, typed_value.nulls()); + let data = typed_value + .to_data() + .into_builder() + .nulls(merged_nulls) + .build() + .ok()?; + make_array(data) + }; + + return Some(target_array); } None } @@ -346,7 +363,7 @@ mod test { Date64Array, Decimal32Array, Decimal64Array, Decimal128Array, Decimal256Array, Float32Array, Float64Array, Int8Array, Int16Array, Int32Array, Int64Array, LargeBinaryArray, LargeListArray, LargeListViewArray, LargeStringArray, ListArray, - ListViewArray, NullBuilder, StringArray, StringViewArray, StructArray, + ListViewArray, NullArray, NullBuilder, StringArray, StringViewArray, StructArray, Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, }; use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; @@ -4321,4 +4338,135 @@ mod test { ); } } + + macro_rules! perfectly_shredded_preserves_top_level_nulls_test { + ($name:ident, $result_type:expr, $typed_value:expr, $expected_array:expr) => { + perfectly_shredded_preserves_top_level_nulls_test!( + $name, + $result_type, + $typed_value, + Some(NullBuffer::from(vec![true, false, true])), + $expected_array + ); + }; + ($name:ident, $result_type:expr, $typed_value:expr, $parent_nulls:expr, $expected_array:expr) => { + #[test] + fn $name() { + let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n( + EMPTY_VARIANT_METADATA_BYTES, + 3, + )); + let typed_value: ArrayRef = Arc::new($typed_value); + let variant_array: ArrayRef = + VariantArray::from_parts(metadata, None, Some(typed_value), $parent_nulls) + .into(); + + let result = variant_get( + &variant_array, + GetOptions::new().with_as_type(Some(FieldRef::from(Field::new( + "result", + $result_type, + true, + )))), + ) + .unwrap(); + + let expected_array: ArrayRef = Arc::new($expected_array); + assert_eq!(&result, &expected_array); + } + }; + } + + perfectly_shredded_preserves_top_level_nulls_test!( + test_variant_get_perfectly_shredded_integer_preserves_top_level_nulls, + DataType::Int32, + Int32Array::from(vec![Some(0_i32), Some(1_i32), Some(2_i32)]), + Int32Array::from(vec![Some(0_i32), None, Some(2_i32)]) + ); + + perfectly_shredded_preserves_top_level_nulls_test!( + test_variant_get_perfectly_shredded_integer_unions_child_and_top_level_nulls, + DataType::Int32, + Int32Array::from(vec![None, Some(1_i32), Some(2_i32)]), + Some(NullBuffer::from(vec![true, false, true])), + Int32Array::from(vec![None, None, Some(2_i32)]) + ); + + perfectly_shredded_preserves_top_level_nulls_test!( + test_variant_get_perfectly_shredded_null_preserves_top_level_nulls, + DataType::Null, + NullArray::new(3), + NullArray::new(3) + ); + + perfectly_shredded_preserves_top_level_nulls_test!( + test_variant_get_perfectly_shredded_binary_view_preserves_top_level_nulls, + DataType::BinaryView, + BinaryViewArray::from(vec![ + Some(b"Apache" as &[u8]), + Some(b"masked-null" as &[u8]), + Some(b"Parquet-variant" as &[u8]), + ]), + BinaryViewArray::from(vec![ + Some(b"Apache" as &[u8]), + None, + Some(b"Parquet-variant" as &[u8]), + ]) + ); + + perfectly_shredded_preserves_top_level_nulls_test!( + test_variant_get_perfectly_shredded_binary_preserves_top_level_nulls, + DataType::Binary, + BinaryArray::from(vec![ + Some(b"Apache" as &[u8]), + Some(b"masked-null" as &[u8]), + Some(b"Parquet-variant" as &[u8]), + ]), + BinaryArray::from(vec![ + Some(b"Apache" as &[u8]), + None, + Some(b"Parquet-variant" as &[u8]), + ]) + ); + + perfectly_shredded_preserves_top_level_nulls_test!( + test_variant_get_perfectly_shredded_decimal4_preserves_top_level_nulls, + DataType::Decimal32(5, 2), + Decimal32Array::from(vec![Some(12345), Some(23400), Some(-12342)]) + .with_precision_and_scale(5, 2) + .unwrap(), + Decimal32Array::from(vec![Some(12345), None, Some(-12342)]) + .with_precision_and_scale(5, 2) + .unwrap() + ); + + perfectly_shredded_preserves_top_level_nulls_test!( + test_variant_get_perfectly_shredded_decimal8_preserves_top_level_nulls, + DataType::Decimal64(10, 1), + Decimal64Array::from(vec![Some(1234567809), Some(1456787000), Some(-1234561203)]) + .with_precision_and_scale(10, 1) + .unwrap(), + Decimal64Array::from(vec![Some(1234567809), None, Some(-1234561203)]) + .with_precision_and_scale(10, 1) + .unwrap() + ); + + perfectly_shredded_preserves_top_level_nulls_test!( + test_variant_get_perfectly_shredded_decimal16_preserves_top_level_nulls, + DataType::Decimal128(20, 3), + Decimal128Array::from(vec![ + Some(i128::from_str("12345678901234567899").unwrap()), + Some(i128::from_str("23445677483748324300").unwrap()), + Some(i128::from_str("-12345678901234567899").unwrap()), + ]) + .with_precision_and_scale(20, 3) + .unwrap(), + Decimal128Array::from(vec![ + Some(i128::from_str("12345678901234567899").unwrap()), + None, + Some(i128::from_str("-12345678901234567899").unwrap()), + ]) + .with_precision_and_scale(20, 3) + .unwrap() + ); }