Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 81 additions & 35 deletions arrow-cast/src/cast/decimal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -531,7 +531,7 @@ where

/// Parses given string to specified decimal native (i128/i256) based on given
/// scale. Returns an `Err` if it cannot parse given string.
pub(crate) fn parse_string_to_decimal_native<T: DecimalType>(
pub fn parse_string_to_decimal_native<T: DecimalType>(
value_str: &str,
scale: usize,
) -> Result<T::Native, ArrowError>
Expand Down Expand Up @@ -777,15 +777,15 @@ where
if cast_options.safe {
array
.unary_opt::<_, D>(|v| {
D::Native::from_f64((mul * v.as_()).round())
single_float_to_decimal::<D>(v.as_(), mul)
.filter(|v| D::is_valid_decimal_precision(*v, precision))
})
.with_precision_and_scale(precision, scale)
.map(|a| Arc::new(a) as ArrayRef)
} else {
array
.try_unary::<_, D, _>(|v| {
D::Native::from_f64((mul * v.as_()).round())
single_float_to_decimal::<D>(v.as_(), mul)
.ok_or_else(|| {
ArrowError::CastError(format!(
"Cannot cast to {}({}, {}). Overflowing on {:?}",
Expand All @@ -802,6 +802,17 @@ where
}
}

/// Converts a single floating point value to a decimal native value.
///
/// The input is multiplied by `mul`, rounded with `f64::round`, and the result is
/// converted with `D::Native::from_f64`.
///
/// Returns `None` if that conversion fails. This function takes no precision argument
/// and performs no precision check; a caller that needs one must apply it separately.
#[inline]
pub fn single_float_to_decimal<D>(input: f64, mul: f64) -> Option<D::Native>
where
D: DecimalType + ArrowPrimitiveType,
<D as ArrowPrimitiveType>::Native: DecimalCast,
{
D::Native::from_f64((mul * input).round())
Comment thread
scovich marked this conversation as resolved.
}

pub(crate) fn cast_decimal_to_integer<D, T>(
array: &dyn Array,
base: D::Native,
Expand Down Expand Up @@ -833,11 +844,11 @@ where
if array.is_null(i) {
value_builder.append_null();
} else {
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changed back to the if/else and match pattern, because

  1. We need to distinguish the logic in the safe and non-safe paths because of a performance problem: in the last commit, we constructed an ArrowError (which calls format!) and dropped it in safe mode, which caused a 50%+ performance regression in the benchmark.
  2. After step 1, there seems to be little gain in unifying the logic here

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I totally missed the spurious error allocation pitfall 🤦. Glad your benchmarking uncovered it!

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you really wanted to unify without the overhead, a helper that returns Result<T, D::Native> should do the trick: Ok(v) is a valid value, and Err(v) is the out of gamut value. The value would be super cheap, and the safe path does .ok() while the unsafe path does .map_err(|v| ArrowError::CastError(...)).

But probably not worth it, especially given that the checked mul/div also produce ArrowError via ?.

let v = array
.value(i)
.mul_checked(div)
.ok()
.and_then(<T::Native as NumCast>::from::<D::Native>);
let v = cast_single_decimal_to_integer_opt::<D, T::Native>(
array.value(i),
div,
true,
);
value_builder.append_option(v);
}
}
Expand All @@ -847,17 +858,12 @@ where
if array.is_null(i) {
value_builder.append_null();
} else {
let v = array.value(i).mul_checked(div)?;

let value =
<T::Native as NumCast>::from::<D::Native>(v).ok_or_else(|| {
ArrowError::CastError(format!(
"value of {:?} is out of range {}",
v,
T::DATA_TYPE
))
})?;

let value = cast_single_decimal_to_integer_result::<D, T::Native>(
array.value(i),
div,
true,
T::DATA_TYPE,
)?;
value_builder.append_value(value);
}
}
Expand All @@ -870,11 +876,11 @@ where
if array.is_null(i) {
value_builder.append_null();
} else {
let v = array
.value(i)
.div_checked(div)
.ok()
.and_then(<T::Native as NumCast>::from::<D::Native>);
let v = cast_single_decimal_to_integer_opt::<D, T::Native>(
array.value(i),
div,
false,
);
value_builder.append_option(v);
}
}
Expand All @@ -884,26 +890,66 @@ where
if array.is_null(i) {
value_builder.append_null();
} else {
let v = array.value(i).div_checked(div)?;

let value =
<T::Native as NumCast>::from::<D::Native>(v).ok_or_else(|| {
ArrowError::CastError(format!(
"value of {:?} is out of range {}",
v,
T::DATA_TYPE
))
})?;

let value = cast_single_decimal_to_integer_result::<D, T::Native>(
array.value(i),
div,
false,
T::DATA_TYPE,
)?;
value_builder.append_value(value);
}
}
}
}
}

Ok(Arc::new(value_builder.finish()))
}

/// Converts one decimal native value to an integer after applying the scale factor `div`.
///
/// When `negative` is `true` the value is multiplied by `div`; otherwise it is divided
/// by `div`. Both operations are the checked variants.
///
/// Returns `None` if the checked multiplication or division fails, or if the scaled
/// value cannot be converted to `T`.
#[inline]
pub fn cast_single_decimal_to_integer_opt<D, T>(
    value: D::Native,
    div: D::Native,
    negative: bool,
) -> Option<T>
where
    T: NumCast + ToPrimitive,
    D: DecimalType + ArrowPrimitiveType,
    <D as ArrowPrimitiveType>::Native: ToPrimitive,
{
    // Pick the scaling direction first; any arithmetic error collapses to `None` below.
    let scaled = match negative {
        true => value.mul_checked(div),
        false => value.div_checked(div),
    };
    scaled.ok().and_then(<T as NumCast>::from::<D::Native>)
}

/// Converts one decimal native value to an integer, reporting failures as errors.
///
/// The value is multiplied by `div` when `negative` is `true` and divided by `div`
/// otherwise, using the checked operations; an error from either is propagated with `?`.
///
/// # Errors
///
/// Returns `ArrowError::CastError` naming the scaled value and `type_name` when the
/// scaled value cannot be converted to `T`.
#[inline]
fn cast_single_decimal_to_integer_result<D, T>(
value: D::Native,
div: D::Native,
negative: bool,
type_name: DataType,
) -> Result<T, ArrowError>
where
T: NumCast + ToPrimitive,
D: DecimalType + ArrowPrimitiveType,
<D as ArrowPrimitiveType>::Native: ToPrimitive,
{
let v = if negative {
value.mul_checked(div)?
} else {
value.div_checked(div)?
};
T::from::<D::Native>(v).ok_or_else(|| {
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did not unify these two functions, because if I unify them with a common function like

fn cast_single_decimal_to_integer<D, T>(...) -> Result<Option<T>, ArrowError>> {
let v = if negative {
        value.mul_checked(div)?
    } else {
        value.div_checked(div)?
    };
OK(T::from::<D::Native>(v))
}

Then, in the caller function, I can't the value of v above, this make the error msg in cast_single_decimal_to_integer_result wrong.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah tricky indeed.

ArrowError::CastError(format!("value of {:?} is out of range {:?}", v, type_name))
})
}

/// Cast a decimal array to a floating point array.
///
/// Conversion is lossy and follows standard floating point semantics. Values
Expand Down
24 changes: 21 additions & 3 deletions arrow-cast/src/cast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,26 @@ use arrow_schema::*;
use arrow_select::take::take;
use num_traits::{NumCast, ToPrimitive, cast::AsPrimitive};

pub use decimal::{DecimalCast, rescale_decimal};
pub use decimal::{
DecimalCast, cast_single_decimal_to_integer_opt, parse_string_to_decimal_native,
rescale_decimal, single_float_to_decimal,
};
pub use string::cast_single_string_to_boolean_default;

/// Converts one decimal native value to `f64`, dividing out the decimal scale.
///
/// `f` turns the raw native value into an `f64`; that result is then divided by
/// `10^scale`. The conversion is lossy and follows standard floating point
/// semantics: values that exceed the representable range become `INFINITY` or
/// `-INFINITY` without returning an error.
#[inline]
pub fn single_decimal_to_float_lossy<D, F>(f: &F, x: D::Native, scale: i32) -> f64
where
    D: DecimalType,
    F: Fn(D::Native) -> f64,
{
    let unscaled = f(x);
    let divisor = 10_f64.powi(scale);
    unscaled / divisor
}

/// CastOptions provides a way to override the default cast behaviors
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct CastOptions<'a> {
Expand Down Expand Up @@ -2314,10 +2331,11 @@ where
Int32 => cast_decimal_to_integer::<D, Int32Type>(array, base, *scale, cast_options),
Int64 => cast_decimal_to_integer::<D, Int64Type>(array, base, *scale, cast_options),
Float32 => cast_decimal_to_float::<D, Float32Type, _>(array, |x| {
(as_float(x) / 10_f64.powi(*scale as i32)) as f32
single_decimal_to_float_lossy::<D, F>(&as_float, x, <i32 as From<i8>>::from(*scale))
as f32
}),
Float64 => cast_decimal_to_float::<D, Float64Type, _>(array, |x| {
as_float(x) / 10_f64.powi(*scale as i32)
single_decimal_to_float_lossy::<D, F>(&as_float, x, <i32 as From<i8>>::from(*scale))
}),
Utf8View => value_to_string_view(array, cast_options),
Utf8 => value_to_string::<i32>(array, cast_options),
Expand Down
20 changes: 18 additions & 2 deletions parquet-variant-compute/src/type_conversion.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,10 @@

//! Module for transforming a typed arrow `Array` to `VariantArray`.

use arrow::compute::{CastOptions, DecimalCast, rescale_decimal};
use arrow::compute::{
CastOptions, DecimalCast, parse_string_to_decimal_native, rescale_decimal,
single_float_to_decimal,
};
use arrow::datatypes::{
self, ArrowPrimitiveType, ArrowTimestampType, Decimal32Type, Decimal64Type, Decimal128Type,
DecimalType,
Expand Down Expand Up @@ -204,9 +207,12 @@ impl_timestamp_from_variant!(
///
/// - `precision` and `scale` specify the target Arrow decimal parameters
/// - Integer variants (`Int8/16/32/64`) are treated as decimals with scale 0
/// - Floating point variants (`Float/Double`) are converted to decimals with the given scale
/// - String variants (`String/ShortString`) are parsed as decimals with the given scale
/// - Decimal variants (`Decimal4/8/16`) use their embedded precision and scale
///
/// The value is rescaled to (`precision`, `scale`) using `rescale_decimal` and
/// The value is rescaled to (`precision`, `scale`) using `rescale_decimal` for integers,
/// `single_float_to_decimal` for floats, and `parse_string_to_decimal_native` for strings.
/// Returns `None` if it cannot fit the requested precision.
pub(crate) fn variant_to_unscaled_decimal<O>(
variant: &Variant<'_, '_>,
Expand All @@ -217,6 +223,8 @@ where
O: DecimalType,
O::Native: DecimalCast,
{
let mul = 10_f64.powi(scale as i32);

match variant {
Variant::Int8(i) => rescale_decimal::<Decimal32Type, O>(
*i as i32,
Expand Down Expand Up @@ -246,6 +254,14 @@ where
precision,
scale,
),
Variant::Float(f) => single_float_to_decimal::<O>(f64::from(*f), mul),
Variant::Double(f) => single_float_to_decimal::<O>(*f, mul),
// arrow-cast only supports casting strings to decimals with scale >= 0 for now.
// Please see `cast_string_to_decimal` in arrow-cast/src/cast/decimal.rs for more details.
Variant::String(v) if scale >= 0 => parse_string_to_decimal_native::<O>(v, scale as _).ok(),
Variant::ShortString(v) if scale >= 0 => {
parse_string_to_decimal_native::<O>(v, scale as _).ok()
}
Variant::Decimal4(d) => rescale_decimal::<Decimal32Type, O>(
d.integer(),
VariantDecimal4::MAX_PRECISION,
Expand Down
Loading
Loading