Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 63 additions & 73 deletions arrow-cast/src/cast/decimal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -531,7 +531,7 @@ where

/// Parses given string to specified decimal native (i128/i256) based on given
/// scale. Returns an `Err` if it cannot parse given string.
pub(crate) fn parse_string_to_decimal_native<T: DecimalType>(
pub fn parse_string_to_decimal_native<T: DecimalType>(
value_str: &str,
scale: usize,
) -> Result<T::Native, ArrowError>
Expand Down Expand Up @@ -777,15 +777,15 @@ where
if cast_options.safe {
array
.unary_opt::<_, D>(|v| {
D::Native::from_f64((mul * v.as_()).round())
single_float_to_decimal::<D>(v.as_(), mul)
.filter(|v| D::is_valid_decimal_precision(*v, precision))
})
.with_precision_and_scale(precision, scale)
.map(|a| Arc::new(a) as ArrayRef)
} else {
array
.try_unary::<_, D, _>(|v| {
D::Native::from_f64((mul * v.as_()).round())
single_float_to_decimal::<D>(v.as_(), mul)
.ok_or_else(|| {
ArrowError::CastError(format!(
"Cannot cast to {}({}, {}). Overflowing on {:?}",
Expand All @@ -802,6 +802,17 @@ where
}
}

/// Cast a single floating point value to a decimal native with the given multiple.
/// Returns `None` if the value cannot be represented with the requested precision.
#[inline]
pub fn single_float_to_decimal<D>(input: f64, mul: f64) -> Option<D::Native>
where
D: DecimalType + ArrowPrimitiveType,
<D as ArrowPrimitiveType>::Native: DecimalCast,
{
D::Native::from_f64((mul * input).round())
Comment thread
scovich marked this conversation as resolved.
}

pub(crate) fn cast_decimal_to_integer<D, T>(
array: &dyn Array,
base: D::Native,
Expand All @@ -826,84 +837,63 @@ where

let mut value_builder = PrimitiveBuilder::<T>::with_capacity(array.len());

if scale < 0 {
match cast_options.safe {
true => {
for i in 0..array.len() {
if array.is_null(i) {
value_builder.append_null();
} else {
let v = array
.value(i)
.mul_checked(div)
.ok()
.and_then(<T::Native as NumCast>::from::<D::Native>);
value_builder.append_option(v);
}
}
}
false => {
for i in 0..array.len() {
if array.is_null(i) {
value_builder.append_null();
} else {
let v = array.value(i).mul_checked(div)?;

let value =
<T::Native as NumCast>::from::<D::Native>(v).ok_or_else(|| {
ArrowError::CastError(format!(
"value of {:?} is out of range {}",
v,
T::DATA_TYPE
))
})?;

value_builder.append_value(value);
}
}
}
}
} else {
match cast_options.safe {
true => {
for i in 0..array.len() {
if array.is_null(i) {
value_builder.append_null();
} else {
let v = array
.value(i)
.div_checked(div)
.ok()
.and_then(<T::Native as NumCast>::from::<D::Native>);
value_builder.append_option(v);
}
for i in 0..array.len() {
if array.is_null(i) {
value_builder.append_null();
} else {
match cast_options.safe {
true => {
let v = cast_single_decimal_to_integer::<D, T::Native>(
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The original code hoisted checks for scale < 0 (mul_checked vs. div_checked) and cast_options.safe (NULL vs. error) outside the loop, producing four simple loop bodies. This was presumably done for performance reasons (minimizing branching inside the loop).

The new code pushes the cast_options.safe check inside a single loop and pushes scale < 0 check all the way down inside cast_single_decimal_to_integer. That triples the number of branches inside the loop body (the null check is per-row and so is always stuck inside the loop). Performance will almost certainly be impacted, possibly significantly.

It would be safer to just preserve the replication (even tho it duplicates logic with the new helper), and rely on the compiler's inlining and "jump threading" optimizations to eliminate that redundancy:

code snippet
if scale < 0 {
    if cast_options.safe {
        for i in 0..array.len() {
            if array.is_null(i) {
                value_builder.append_null();
            } else {
                let v = cast_single_decimal_to_integer::<D, T::Native>(...);
                value_builder.append_option(v.ok());
            }
        }
    } else {
        for i in 0..array.len() {
            if array.is_null(i) {
                value_builder.append_null();
            } else {
                let v = cast_single_decimal_to_integer::<D, T::Native>(...);
                value_builder.append_value(v?);
            }
        }
    }
} else {
    if cast_options.safe {
        for i in 0..array.len() {
            if array.is_null(i) {
                value_builder.append_null();
            } else {
                let v = cast_single_decimal_to_integer::<D, T::Native>(...);
                value_builder.append_option(v.ok());
            }
        }
    } else {
        for i in 0..array.len() {
            if array.is_null(i) {
                value_builder.append_null();
            } else {
                let v = cast_single_decimal_to_integer::<D, T::Native>(...);
                value_builder.append_value(v?);
            }
        }
    }
}

If you wanted to simplify a bit, you could define and use a local macro inside this function:

// Helper macro for emitting nearly the same loop every time, so we can hoist branches out.
// The compiler will specialize the resulting code (inlining and jump threading)
macro_rules! cast_loop {
    (|$v:ident| $body:expr) => {{
        for i in 0..array.len() {
            if array.is_null(i) {
                value_builder.append_null();
            } else {
                let $v = cast_single_decimal_to_integer::<D, T::Native>(...);
                $body
            }
        }
    }};
}

if scale < 0 {
    if cast_options.safe {
        cast_loop!(|v| value_builder.append_option(v.ok()));
    } else {
        cast_loop!(|v| value_builder.append_value(v?));
    }
} else {
    if cast_options.safe {
        cast_loop!(|v| value_builder.append_option(v.ok()));
    } else {
        cast_loop!(|v| value_builder.append_value(v?));
    }
}

Note that the four loop bodies are almost syntactically identical -- differing only in whether they append_option(v.ok()) or append_value(v?) -- but the inlined body of cast_single_decimal_to_integer inside each loop will be specialized based on the scale < 0 check we already performed. Result: stand-alone calls to the helper function are always safe, but we still get maximum performance here.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the detailed explain. fixed.

array.value(i),
div,
scale as _,
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are we casting? Isn't it a trivial i16 -> i16 cast?
(again below)

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To avoid the overlfow cast_single_decimal_to_integer receives i16, the scale of arrow decimal is i8 and VariantDecimal is u8.

T::DATA_TYPE,
)
.ok();
value_builder.append_option(v);
}
}
false => {
for i in 0..array.len() {
if array.is_null(i) {
value_builder.append_null();
} else {
let v = array.value(i).div_checked(div)?;

let value =
<T::Native as NumCast>::from::<D::Native>(v).ok_or_else(|| {
ArrowError::CastError(format!(
"value of {:?} is out of range {}",
v,
T::DATA_TYPE
))
})?;

value_builder.append_value(value);
}
false => {
let value = cast_single_decimal_to_integer::<D, T::Native>(
array.value(i),
div,
scale as _,
T::DATA_TYPE,
)?;

value_builder.append_value(value);
}
}
}
}

Ok(Arc::new(value_builder.finish()))
}

/// Casting a given decimal to an integer based on given div and scale.
/// The value is scaled by multiplying or dividing with the div based on the scale sign.
/// Returns `Err` if the value is overflow or cannot be represented with the requested precision.
pub fn cast_single_decimal_to_integer<D, T>(
value: D::Native,
div: D::Native,
scale: i16,
type_name: DataType,
) -> Result<T, ArrowError>
where
T: NumCast + ToPrimitive,
D: DecimalType + ArrowPrimitiveType,
<D as ArrowPrimitiveType>::Native: ToPrimitive,
{
let v = if scale < 0 {
value.mul_checked(div)?
} else {
value.div_checked(div)?
};

T::from::<D::Native>(v).ok_or_else(|| {
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did not unify these two functions, because if I unify them with a common function like

fn cast_single_decimal_to_integer<D, T>(...) -> Result<Option<T>, ArrowError>> {
let v = if negative {
        value.mul_checked(div)?
    } else {
        value.div_checked(div)?
    };
OK(T::from::<D::Native>(v))
}

Then, in the caller function, I can't the value of v above, this make the error msg in cast_single_decimal_to_integer_result wrong.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah tricky indeed.

ArrowError::CastError(format!("value of {:?} is out of range {:?}", v, type_name))
})
}

/// Cast a decimal array to a floating point array.
///
/// Conversion is lossy and follows standard floating point semantics. Values
Expand Down
23 changes: 20 additions & 3 deletions arrow-cast/src/cast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,26 @@ use arrow_schema::*;
use arrow_select::take::take;
use num_traits::{NumCast, ToPrimitive, cast::AsPrimitive};

pub use decimal::{DecimalCast, rescale_decimal};
pub use decimal::{
DecimalCast, cast_single_decimal_to_integer, parse_string_to_decimal_native, rescale_decimal,
single_float_to_decimal,
};
pub use string::cast_single_string_to_boolean_default;

/// Lossy conversion from decimal to float.
///
/// Conversion is lossy and follows standard floating point semantics. Values
/// that exceed the representable range become `INFINITY` or `-INFINITY` without
/// returning an error.
#[inline]
pub fn single_decimal_to_float_lossy<D, F>(f: &F, x: D::Native, scale: i32) -> f64
where
D: DecimalType,
F: Fn(D::Native) -> f64,
{
f(x) / 10_f64.powi(scale)
Comment thread
klion26 marked this conversation as resolved.
}

/// CastOptions provides a way to override the default cast behaviors
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct CastOptions<'a> {
Expand Down Expand Up @@ -2314,10 +2331,10 @@ where
Int32 => cast_decimal_to_integer::<D, Int32Type>(array, base, *scale, cast_options),
Int64 => cast_decimal_to_integer::<D, Int64Type>(array, base, *scale, cast_options),
Float32 => cast_decimal_to_float::<D, Float32Type, _>(array, |x| {
(as_float(x) / 10_f64.powi(*scale as i32)) as f32
single_decimal_to_float_lossy::<D, F>(&as_float, x, *scale as _) as f32
}),
Float64 => cast_decimal_to_float::<D, Float64Type, _>(array, |x| {
as_float(x) / 10_f64.powi(*scale as i32)
single_decimal_to_float_lossy::<D, F>(&as_float, x, *scale as _)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we're anyway changing the code, i32::from(*scale) makes clear that this is a lossless conversion

(a bunch more similarly lossless as _ below)

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, i32::from(*scale) is better, fixed.

}),
Utf8View => value_to_string_view(array, cast_options),
Utf8 => value_to_string::<i32>(array, cast_options),
Expand Down
16 changes: 14 additions & 2 deletions parquet-variant-compute/src/type_conversion.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,10 @@

//! Module for transforming a typed arrow `Array` to `VariantArray`.

use arrow::compute::{CastOptions, DecimalCast, rescale_decimal};
use arrow::compute::{
CastOptions, DecimalCast, parse_string_to_decimal_native, rescale_decimal,
single_float_to_decimal,
};
use arrow::datatypes::{
self, ArrowPrimitiveType, ArrowTimestampType, Decimal32Type, Decimal64Type, Decimal128Type,
DecimalType,
Expand Down Expand Up @@ -204,9 +207,12 @@ impl_timestamp_from_variant!(
///
/// - `precision` and `scale` specify the target Arrow decimal parameters
/// - Integer variants (`Int8/16/32/64`) are treated as decimals with scale 0
/// - Floating point variants (`Float/Double`) are converted to decimals with the given scale
/// - String variants (`String/ShortString`) are parsed as decimals with the given scale
/// - Decimal variants (`Decimal4/8/16`) use their embedded precision and scale
///
/// The value is rescaled to (`precision`, `scale`) using `rescale_decimal` and
/// The value is rescaled to (`precision`, `scale`) using `rescale_decimal` for integers,
/// `single_float_to_decimal` for floats, and `parse_string_to_decimal_native` for strings.
/// returns `None` if it cannot fit the requested precision.
pub(crate) fn variant_to_unscaled_decimal<O>(
variant: &Variant<'_, '_>,
Expand All @@ -217,6 +223,8 @@ where
O: DecimalType,
O::Native: DecimalCast,
{
let mul = 10_f64.powi(scale as i32);

match variant {
Variant::Int8(i) => rescale_decimal::<Decimal32Type, O>(
*i as i32,
Expand Down Expand Up @@ -246,6 +254,10 @@ where
precision,
scale,
),
Variant::Float(f) => single_float_to_decimal::<O>(*f as _, mul),
Variant::Double(f) => single_float_to_decimal::<O>(*f, mul),
Variant::String(v) => parse_string_to_decimal_native::<O>(v, scale as _).ok(),
Variant::ShortString(v) => parse_string_to_decimal_native::<O>(v, scale as _).ok(),
Variant::Decimal4(d) => rescale_decimal::<Decimal32Type, O>(
d.integer(),
VariantDecimal4::MAX_PRECISION,
Expand Down
Loading
Loading