From ab622ec899ebe6e447247d61eddb51f48d2e4cb3 Mon Sep 17 00:00:00 2001 From: mxwli Date: Fri, 26 Jul 2024 14:05:49 -0400 Subject: [PATCH 1/5] initial --- src/common/types/value/value.cpp | 20 +++++++++++++++++++ src/include/common/types/decimal_t.h | 20 +++++++++++++++++++ src/include/common/types/value/value.h | 5 +++++ src/parser/transform/transform_expression.cpp | 1 + 4 files changed, 46 insertions(+) create mode 100644 src/include/common/types/decimal_t.h diff --git a/src/common/types/value/value.cpp b/src/common/types/value/value.cpp index 9ff18a2ad05..ada46d2b2a7 100644 --- a/src/common/types/value/value.cpp +++ b/src/common/types/value/value.cpp @@ -264,6 +264,26 @@ Value::Value(double val_) : isNull_{false}, childrenSize{0} { val.doubleVal = val_; } +Value::Value(decimal_t val_) : isNull_{false}, childrenSize{0} { + dataType = LogicalType::DECIMAL(val_.precision, val_.scale); + switch (dataType.getPhysicalType()) { + case PhysicalTypeID::INT16: + val.int16Val = (int16_t)(val_.val); + break; + case PhysicalTypeID::INT32: + val.int32Val = (int32_t)(val_.val); + break; + case PhysicalTypeID::INT64: + val.int64Val = (int64_t)(val_.val); + break; + case PhysicalTypeID::INT128: + val.int128Val = val_.val; + break; + default: + KU_UNREACHABLE; + } +} + Value::Value(date_t val_) : isNull_{false}, childrenSize{0} { dataType = LogicalType::DATE(); val.int32Val = val_.days; diff --git a/src/include/common/types/decimal_t.h b/src/include/common/types/decimal_t.h new file mode 100644 index 00000000000..c011075f32a --- /dev/null +++ b/src/include/common/types/decimal_t.h @@ -0,0 +1,20 @@ +#pragma once + +#include "int128_t.h" + +namespace kuzu { +namespace common { + +struct KUZU_API decimal_t { + + int128_t val = 0; + uint32_t precision = 18; + uint32_t scale = 3; + + decimal_t() {} + decimal_t(int128_t val, uint32_t prec, uint32_t scale): + val(val), precision(prec), scale(scale) {} +}; + +} +} diff --git a/src/include/common/types/value/value.h b/src/include/common/types/value/value.h index 46fa717e2bc..bdb9b05ed25 100644 --- a/src/include/common/types/value/value.h +++ b/src/include/common/types/value/value.h @@ -4,6 +4,7 @@ #include "common/api.h" #include "common/types/date_t.h" +#include "common/types/decimal_t.h" #include "common/types/int128_t.h" #include "common/types/internal_id_t.h" #include "common/types/interval_t.h" @@ -99,6 +100,10 @@ class Value { * @param val_ the float value to set. */ KUZU_API explicit Value(float val_); + /** + * @param val_ the decimal_t value to set + */ + KUZU_API explicit Value(decimal_t val_); /** * @param val_ the date value to set. */ diff --git a/src/parser/transform/transform_expression.cpp b/src/parser/transform/transform_expression.cpp index ad557ca3203..b8bf8a00f12 100644 --- a/src/parser/transform/transform_expression.cpp +++ b/src/parser/transform/transform_expression.cpp @@ -1,6 +1,7 @@ #include "function/aggregate/count_star.h" #include "function/arithmetic/vector_arithmetic_functions.h" #include "function/cast/functions/cast_from_string_functions.h" +#include "function/cast/functions/cast_string_non_nested_functions.h" #include "function/list/vector_list_functions.h" #include "function/string/vector_string_functions.h" #include "function/struct/vector_struct_functions.h" From 3ae9ebe44baad6d364d1a30be60c7cc9d06ebbf0 Mon Sep 17 00:00:00 2001 From: mxwli Date: Fri, 26 Jul 2024 14:09:42 -0400 Subject: [PATCH 2/5] progress --- .../cast_string_non_nested_functions.cpp | 141 ++++++++++++++++++ .../cast_string_non_nested_functions.h | 5 + 2 files changed, 146 insertions(+) diff --git a/src/function/cast_string_non_nested_functions.cpp b/src/function/cast_string_non_nested_functions.cpp index 5ec298a8bb8..284ec6302cc 100644 --- a/src/function/cast_string_non_nested_functions.cpp +++ b/src/function/cast_string_non_nested_functions.cpp @@ -1,5 +1,10 @@ #include "function/cast/functions/cast_string_non_nested_functions.h" +#include "common/constants.h" +#include "common/types/timestamp_t.h" +#include "function/cast/functions/numeric_limits.h" +#include "re2.h" + namespace kuzu { namespace function { @@ -83,5 +88,141 @@ bool TryCastStringToTimestamp::tryCast(const char* input, uint6 return true; } +static bool isDate(const std::string& str) { + return RE2::FullMatch(str, "\\d{4}/\\d{1,2}/\\d{1,2}") || + RE2::FullMatch(str, "\\d{4}-\\d{1,2}-\\d{1,2}") || + RE2::FullMatch(str, "\\d{4} \\d{1,2} \\d{1,2}") || + RE2::FullMatch(str, "\\d{4}\\\\d{1,2}\\\\d{1,2}"); +} + +LogicalType inferMinimalTypeFromString(const std::string& str) { + constexpr char array_begin = common::CopyConstants::DEFAULT_CSV_LIST_BEGIN_CHAR; + constexpr char array_end = common::CopyConstants::DEFAULT_CSV_LIST_END_CHAR; + auto cpy = StringUtils::ltrim(StringUtils::rtrim(str)); + StringUtils::toUpper(cpy); + if (cpy.size() == 0) { + return LogicalType::STRING(); + } + // Boolean + if (cpy == "TRUE" || cpy == "FALSE") { + return LogicalType::BOOL(); + } + // Unsigned number + if (RE2::FullMatch(cpy, "(0|[1-9]\\d*)")) { + if (cpy.size() >= NumericLimits::digits()) { + return LogicalType::DOUBLE(); + } + int128_t val; + if (!trySimpleInt128Cast(cpy.c_str(), cpy.length(), val)) { + return LogicalType::STRING(); + } + if (val <= NumericLimits::maximum()) { + return LogicalType::UINT8(); + } + if (val <= NumericLimits::maximum()) { + return LogicalType::UINT16(); + } + if (val <= NumericLimits::maximum()) { + return LogicalType::UINT32(); + } + if (val <= NumericLimits::maximum()) { + return LogicalType::UINT64(); + } + return LogicalType::INT128(); + } + // Signed number + if (RE2::FullMatch(cpy, "-(0|[1-9]\\d*)")) { + if (cpy.size() >= 1 + NumericLimits::digits()) { + return LogicalType::DOUBLE(); + } + int128_t val; + if (!trySimpleInt128Cast(cpy.c_str(), cpy.length(), val)) { + return LogicalType::STRING(); + } + if (val >= NumericLimits::minimum()) { + return LogicalType::INT8(); + } + if (val >= NumericLimits::minimum()) { + return LogicalType::INT16(); + } + if (val >= NumericLimits::minimum()) { + return LogicalType::INT32(); + } + if (val >= NumericLimits::minimum()) { + return LogicalType::INT64(); + } + return LogicalType::INT128(); + } + // Real value checking + if (RE2::FullMatch(cpy, "-?(0|[1-9]\\d*)?\\.\\d*")) { + if (cpy[0] == '-') { + cpy.erase(cpy.begin()); + } + if (cpy.size() <= DECIMAL_PRECISION_LIMIT) { + auto decimalPoint = cpy.find('.'); + KU_ASSERT(decimalPoint != std::string::npos); + return LogicalType::DECIMAL(cpy.size() - 1, cpy.size() - decimalPoint - 1); + } else { + return LogicalType::DOUBLE(); + } + } + // date + if (isDate(cpy)) { + return LogicalType::DATE(); + } + // it might just be quicker to try cast to timestamp + timestamp_t tmp; + if (common::Timestamp::tryConvertTimestamp(cpy.c_str(), cpy.length(), tmp)) { + return LogicalType::TIMESTAMP(); + } + + if (cpy.front() == array_begin && cpy.back() == array_end) { + auto split = StringUtils::split(cpy.substr(1, cpy.size() - 2), ",", false); + auto childType = LogicalType::STRING(); + for (auto& ele : split) { + LogicalType combinedType; + if (!LogicalTypeUtils::tryGetMaxLogicalType(childType, inferMinimalTypeFromString(ele), + combinedType)) { + childType = LogicalType::STRING(); + break; + } + childType = std::move(combinedType); + } + return LogicalType::LIST(std::move(childType)); + } + + if (cpy.front() == '{' && cpy.back() == '}') { + auto split = StringUtils::split(cpy.substr(1, cpy.size() - 2), ",", false); + auto childKeyType = LogicalType::STRING(); + bool cannotResolveKey = false; + auto childValueType = LogicalType::STRING(); + bool cannotResolveValue = false; + for (auto& ele : split) { + LogicalType combinedKey = LogicalType::STRING(), combinedValue = LogicalType::STRING(); + auto splitEle = StringUtils::split(ele, "=", false); + if (splitEle.size() != 2) { + // invalid map; give string + return LogicalType::STRING(); + } + if (!cannotResolveKey && !LogicalTypeUtils::tryGetMaxLogicalType(childKeyType, + inferMinimalTypeFromString(splitEle[0]), combinedKey)) { + cannotResolveKey = true; + } + if (!cannotResolveValue && + !LogicalTypeUtils::tryGetMaxLogicalType(childValueType, + inferMinimalTypeFromString(splitEle[1]), combinedValue)) { + cannotResolveValue = true; + } + childKeyType = std::move(combinedKey); + childValueType = std::move(combinedValue); + if (cannotResolveKey && cannotResolveValue) { + break; + } + } + return LogicalType::MAP(std::move(childKeyType), std::move(childValueType)); + } + return LogicalType::STRING(); +} + } // namespace function } // namespace kuzu diff --git a/src/include/function/cast/functions/cast_string_non_nested_functions.h b/src/include/function/cast/functions/cast_string_non_nested_functions.h index c308b5e4b5d..1ee8fc84f05 100644 --- a/src/include/function/cast/functions/cast_string_non_nested_functions.h +++ b/src/include/function/cast/functions/cast_string_non_nested_functions.h @@ -15,6 +15,11 @@ using namespace kuzu::common; namespace kuzu { namespace function { +LogicalType inferMinimalTypeFromString(const std::string& str); +// Infer the type that the string represents +// Returns minimal type (eg. '123' returns uint8 and '1234' returns uint16) +// Used for sniffing + // cast string to numerical template struct IntegerCastData { From d563a01b75a40ad89a9a9ad50983c016f00fe431 Mon Sep 17 00:00:00 2001 From: mxwli Date: Fri, 26 Jul 2024 14:42:26 -0400 Subject: [PATCH 3/5] progress --- .../cast_string_non_nested_functions.h | 1 - src/parser/transform/transform_expression.cpp | 17 +++++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/include/function/cast/functions/cast_string_non_nested_functions.h b/src/include/function/cast/functions/cast_string_non_nested_functions.h index 1ee8fc84f05..d19b2d9a1d2 100644 --- a/src/include/function/cast/functions/cast_string_non_nested_functions.h +++ b/src/include/function/cast/functions/cast_string_non_nested_functions.h @@ -18,7 +18,6 @@ namespace function { LogicalType inferMinimalTypeFromString(const std::string& str); // Infer the type that the string represents // Returns minimal type (eg. '123' returns uint8 and '1234' returns uint16) -// Used for sniffing // cast string to numerical template diff --git a/src/parser/transform/transform_expression.cpp b/src/parser/transform/transform_expression.cpp index b8bf8a00f12..c77032d3324 100644 --- a/src/parser/transform/transform_expression.cpp +++ b/src/parser/transform/transform_expression.cpp @@ -642,10 +642,19 @@ std::unique_ptr Transformer::transformIntegerLiteral( std::unique_ptr Transformer::transformDoubleLiteral( CypherParser::OC_DoubleLiteralContext& ctx) { auto text = ctx.RegularDecimalReal()->getText(); - ku_string_t literal{text.c_str(), text.length()}; - double result; - function::CastString::operation(literal, result); - return std::make_unique(Value(result), ctx.getText()); + auto type = inferMinimalTypeFromString(text); + if (type.getLogicalTypeID() == LogicalTypeID::DECIMAL) { + int128_t val; + decimalCast(text.c_str(), text.length(), val, type); + decimal_t result(val, DecimalType::getPrecision(type), + DecimalType::getScale(type)); + return std::make_unique(Value(result), ctx.getText()); + } else { + ku_string_t literal{text.c_str(), text.length()}; + double result; + function::CastString::operation(literal, result); + return std::make_unique(Value(result), ctx.getText()); + } } } // namespace parser From ea45e255be64d40e69403d7b4e90d89757965f23 Mon Sep 17 00:00:00 2001 From: CI Bot Date: Fri, 26 Jul 2024 18:47:07 +0000 Subject: [PATCH 4/5] Run clang-format --- src/include/common/types/decimal_t.h | 8 ++++---- src/parser/transform/transform_expression.cpp | 3 +-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/include/common/types/decimal_t.h b/src/include/common/types/decimal_t.h index c011075f32a..2f0aee2a83d 100644 --- a/src/include/common/types/decimal_t.h +++ b/src/include/common/types/decimal_t.h @@ -12,9 +12,9 @@ struct KUZU_API decimal_t { uint32_t scale = 3; decimal_t() {} - decimal_t(int128_t val, uint32_t prec, uint32_t scale): - val(val), precision(prec), scale(scale) {} + decimal_t(int128_t val, uint32_t prec, uint32_t scale) + : val(val), precision(prec), scale(scale) {} }; -} -} +} // namespace common +} // namespace kuzu diff --git a/src/parser/transform/transform_expression.cpp b/src/parser/transform/transform_expression.cpp index c77032d3324..945aab186e0 100644 --- a/src/parser/transform/transform_expression.cpp +++ b/src/parser/transform/transform_expression.cpp @@ -646,8 +646,7 @@ std::unique_ptr Transformer::transformDoubleLiteral( if (type.getLogicalTypeID() == LogicalTypeID::DECIMAL) { int128_t val; decimalCast(text.c_str(), text.length(), val, type); - decimal_t result(val, DecimalType::getPrecision(type), - DecimalType::getScale(type)); + decimal_t result(val, DecimalType::getPrecision(type), DecimalType::getScale(type)); return std::make_unique(Value(result), ctx.getText()); } else { ku_string_t literal{text.c_str(), text.length()}; From 50512c0d267d5be851270473fe64ad25a1e71227 Mon Sep 17 00:00:00 2001 From: mxwli Date: Mon, 29 Jul 2024 14:19:26 -0400 Subject: [PATCH 5/5] progress --- .../cast_string_non_nested_functions.cpp | 141 ------------------ .../cast_string_non_nested_functions.h | 4 - src/parser/transform/transform_expression.cpp | 11 +- 3 files changed, 10 insertions(+), 146 deletions(-) diff --git a/src/function/cast_string_non_nested_functions.cpp b/src/function/cast_string_non_nested_functions.cpp index 284ec6302cc..5ec298a8bb8 100644 --- a/src/function/cast_string_non_nested_functions.cpp +++ b/src/function/cast_string_non_nested_functions.cpp @@ -1,10 +1,5 @@ #include "function/cast/functions/cast_string_non_nested_functions.h" -#include "common/constants.h" -#include "common/types/timestamp_t.h" -#include "function/cast/functions/numeric_limits.h" -#include "re2.h" - namespace kuzu { namespace function { @@ -88,141 +83,5 @@ bool TryCastStringToTimestamp::tryCast(const char* input, uint6 return true; } -static bool isDate(const std::string& str) { - return RE2::FullMatch(str, "\\d{4}/\\d{1,2}/\\d{1,2}") || - RE2::FullMatch(str, "\\d{4}-\\d{1,2}-\\d{1,2}") || - RE2::FullMatch(str, "\\d{4} \\d{1,2} \\d{1,2}") || - RE2::FullMatch(str, "\\d{4}\\\\d{1,2}\\\\d{1,2}"); -} - -LogicalType inferMinimalTypeFromString(const std::string& str) { - constexpr char array_begin = common::CopyConstants::DEFAULT_CSV_LIST_BEGIN_CHAR; - constexpr char array_end = common::CopyConstants::DEFAULT_CSV_LIST_END_CHAR; - auto cpy = StringUtils::ltrim(StringUtils::rtrim(str)); - StringUtils::toUpper(cpy); - if (cpy.size() == 0) { - return LogicalType::STRING(); - } - // Boolean - if (cpy == "TRUE" || cpy == "FALSE") { - return LogicalType::BOOL(); - } - // Unsigned number - if (RE2::FullMatch(cpy, "(0|[1-9]\\d*)")) { - if (cpy.size() >= NumericLimits::digits()) { - return LogicalType::DOUBLE(); - } - int128_t val; - if (!trySimpleInt128Cast(cpy.c_str(), cpy.length(), val)) { - return LogicalType::STRING(); - } - if (val <= NumericLimits::maximum()) { - return LogicalType::UINT8(); - } - if (val <= NumericLimits::maximum()) { - return LogicalType::UINT16(); - } - if (val <= NumericLimits::maximum()) { - return LogicalType::UINT32(); - } - if (val <= NumericLimits::maximum()) { - return LogicalType::UINT64(); - } - return LogicalType::INT128(); - } - // Signed number - if (RE2::FullMatch(cpy, "-(0|[1-9]\\d*)")) { - if (cpy.size() >= 1 + NumericLimits::digits()) { - return LogicalType::DOUBLE(); - } - int128_t val; - if (!trySimpleInt128Cast(cpy.c_str(), cpy.length(), val)) { - return LogicalType::STRING(); - } - if (val >= NumericLimits::minimum()) { - return LogicalType::INT8(); - } - if (val >= NumericLimits::minimum()) { - return LogicalType::INT16(); - } - if (val >= NumericLimits::minimum()) { - return LogicalType::INT32(); - } - if (val >= NumericLimits::minimum()) { - return LogicalType::INT64(); - } - return LogicalType::INT128(); - } - // Real value checking - if (RE2::FullMatch(cpy, "-?(0|[1-9]\\d*)?\\.\\d*")) { - if (cpy[0] == '-') { - cpy.erase(cpy.begin()); - } - if (cpy.size() <= DECIMAL_PRECISION_LIMIT) { - auto decimalPoint = cpy.find('.'); - KU_ASSERT(decimalPoint != std::string::npos); - return LogicalType::DECIMAL(cpy.size() - 1, cpy.size() - decimalPoint - 1); - } else { - return LogicalType::DOUBLE(); - } - } - // date - if (isDate(cpy)) { - return LogicalType::DATE(); - } - // it might just be quicker to try cast to timestamp - timestamp_t tmp; - if (common::Timestamp::tryConvertTimestamp(cpy.c_str(), cpy.length(), tmp)) { - return LogicalType::TIMESTAMP(); - } - - if (cpy.front() == array_begin && cpy.back() == array_end) { - auto split = StringUtils::split(cpy.substr(1, cpy.size() - 2), ",", false); - auto childType = LogicalType::STRING(); - for (auto& ele : split) { - LogicalType combinedType; - if (!LogicalTypeUtils::tryGetMaxLogicalType(childType, inferMinimalTypeFromString(ele), - combinedType)) { - childType = LogicalType::STRING(); - break; - } - childType = std::move(combinedType); - } - return LogicalType::LIST(std::move(childType)); - } - - if (cpy.front() == '{' && cpy.back() == '}') { - auto split = StringUtils::split(cpy.substr(1, cpy.size() - 2), ",", false); - auto childKeyType = LogicalType::STRING(); - bool cannotResolveKey = false; - auto childValueType = LogicalType::STRING(); - bool cannotResolveValue = false; - for (auto& ele : split) { - LogicalType combinedKey = LogicalType::STRING(), combinedValue = LogicalType::STRING(); - auto splitEle = StringUtils::split(ele, "=", false); - if (splitEle.size() != 2) { - // invalid map; give string - return LogicalType::STRING(); - } - if (!cannotResolveKey && !LogicalTypeUtils::tryGetMaxLogicalType(childKeyType, - inferMinimalTypeFromString(splitEle[0]), combinedKey)) { - cannotResolveKey = true; - } - if (!cannotResolveValue && - !LogicalTypeUtils::tryGetMaxLogicalType(childValueType, - inferMinimalTypeFromString(splitEle[1]), combinedValue)) { - cannotResolveValue = true; - } - childKeyType = std::move(combinedKey); - childValueType = std::move(combinedValue); - if (cannotResolveKey && cannotResolveValue) { - break; - } - } - return LogicalType::MAP(std::move(childKeyType), std::move(childValueType)); - } - return LogicalType::STRING(); -} - } // namespace function } // namespace kuzu diff --git a/src/include/function/cast/functions/cast_string_non_nested_functions.h b/src/include/function/cast/functions/cast_string_non_nested_functions.h index d19b2d9a1d2..c308b5e4b5d 100644 --- a/src/include/function/cast/functions/cast_string_non_nested_functions.h +++ b/src/include/function/cast/functions/cast_string_non_nested_functions.h @@ -15,10 +15,6 @@ using namespace kuzu::common; namespace kuzu { namespace function { -LogicalType inferMinimalTypeFromString(const std::string& str); -// Infer the type that the string represents -// Returns minimal type (eg. '123' returns uint8 and '1234' returns uint16) - // cast string to numerical template struct IntegerCastData { diff --git a/src/parser/transform/transform_expression.cpp b/src/parser/transform/transform_expression.cpp index 945aab186e0..cfe21179a53 100644 --- a/src/parser/transform/transform_expression.cpp +++ b/src/parser/transform/transform_expression.cpp @@ -642,7 +642,16 @@ std::unique_ptr Transformer::transformIntegerLiteral( std::unique_ptr Transformer::transformDoubleLiteral( CypherParser::OC_DoubleLiteralContext& ctx) { auto text = ctx.RegularDecimalReal()->getText(); - auto type = inferMinimalTypeFromString(text); + if (text[0] == '-') { + text.erase(text.begin()); + } + auto type = LogicalType::DOUBLE(); + if (text.size() - 1 <= DECIMAL_PRECISION_LIMIT) { + auto decimalPoint = text.find('.'); + KU_ASSERT(decimalPoint != std::string::npos); + type = LogicalType::DECIMAL(text.size() - 1, text.size() - decimalPoint - 1); + } + text = ctx.RegularDecimalReal()->getText(); // undo changes if (type.getLogicalTypeID() == LogicalTypeID::DECIMAL) { int128_t val; decimalCast(text.c_str(), text.length(), val, type);