Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -489,6 +489,9 @@ endif()

if(VELOX_ENABLE_WAVE OR VELOX_ENABLE_CUDF)
enable_language(CUDA)
# Use same C++ standard throughout
set(CMAKE_CUDA_STANDARD ${CMAKE_CXX_STANDARD})
set(CMAKE_CUDA_STANDARD_REQUIRED ${CMAKE_CXX_STANDARD_REQUIRED})

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure why this was deemed necessary. It might have been a leftover from Part 2, although Part 2 obviously works just fine without it. Should it still be moved here (from the CMakeLists.txt in cudf/expression)? Will it break Wave, or at least offend those devs?

# Determine CUDA_ARCHITECTURES automatically.
cmake_policy(SET CMP0104 NEW)
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
Expand Down
4 changes: 1 addition & 3 deletions velox/experimental/cudf/expression/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ add_library(
AstExpression.cpp
CommonFunctions.cpp
DecimalExpressionKernels.cpp
DecimalExpressionKernels.cu
DecimalExpressionKernelsGpu.cu
ExpressionEvaluator.cpp
JitExpression.cpp
PrestoFunctions.cpp
Expand All @@ -36,5 +36,3 @@ target_link_libraries(
)

target_compile_options(velox_cudf_expression PRIVATE -Wno-missing-field-initializers)

set_target_properties(velox_cudf_expression PROPERTIES CUDA_STANDARD 20 CUDA_STANDARD_REQUIRED ON)

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moved to the root CMakeLists.txt (possibly contentious).

212 changes: 212 additions & 0 deletions velox/experimental/cudf/expression/DecimalExpressionKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,51 @@
*/
#include "velox/experimental/cudf/expression/AstPrinter.h"
#include "velox/experimental/cudf/expression/DecimalExpressionKernels.h"
#include "velox/experimental/cudf/expression/DecimalExpressionKernelsGpu.h"

#include "velox/common/base/Exceptions.h"
#include "velox/type/DecimalUtil.h"
#include "velox/type/Type.h"

#include <cudf/binaryop.hpp>
#include <cudf/column/column_factories.hpp>
#include <cudf/copying.hpp>
#include <cudf/fixed_point/fixed_point.hpp>
#include <cudf/null_mask.hpp>
#include <cudf/scalar/scalar_factories.hpp>
#include <cudf/table/table_view.hpp>

namespace facebook::velox::cudf_velox {
namespace {

/// Reads the stored value of a decimal scalar and widens it to 128 bits.
///
/// @param s Scalar to read; its type id must be DECIMAL64 or DECIMAL128.
/// @param stream CUDA stream handed to the scalar's value() accessor.
/// @return The result of value() converted to __int128_t. DECIMAL64 values go
/// through int64_t first so the widening is sign-preserving.
///
/// Fails a VELOX_CHECK for any other scalar type instead of reinterpreting the
/// scalar as a decimal128 scalar, which would be an invalid downcast.
__int128_t getDecimalScalarValue(
    const cudf::scalar& s,
    rmm::cuda_stream_view stream) {
  const auto typeId = s.type().id();
  if (typeId == cudf::type_id::DECIMAL64) {
    auto const& dec =
        static_cast<cudf::fixed_point_scalar<numeric::decimal64> const&>(s);
    return static_cast<__int128_t>(static_cast<int64_t>(dec.value(stream)));
  }
  // Plain VELOX_CHECK (not _EQ): cudf::type_id must not be passed to fmt.
  VELOX_CHECK(
      typeId == cudf::type_id::DECIMAL128,
      "Expected a DECIMAL64 or DECIMAL128 scalar");
  auto const& dec =
      static_cast<cudf::fixed_point_scalar<numeric::decimal128> const&>(s);
  return static_cast<__int128_t>(dec.value(stream));
}

/// Builds a column of \p outputType with \p size rows in which every row is
/// null (used, for example, when a scalar operand is NULL).
/// A \p size of zero produces an empty column of \p outputType instead.
std::unique_ptr<cudf::column> makeAllNullDecimalColumn(
    cudf::data_type outputType,
    cudf::size_type size,
    rmm::cuda_stream_view stream,
    rmm::device_async_resource_ref mr) {
  const bool hasRows = size != 0;
  if (hasRows) {
    return cudf::make_fixed_width_column(
        outputType, size, cudf::mask_state::ALL_NULL, stream, mr);
  }
  // Nothing to allocate for zero rows.
  return cudf::make_empty_column(outputType);
}

} // namespace

// Scatters null values to positions where the divisor is zero.
// Returns a new column with nulls at zero-divisor positions.
Expand Down Expand Up @@ -68,4 +105,179 @@ std::unique_ptr<cudf::column> scatterNullsAtZeroDivisor(
*nullScalar, *result, divisorIsZero->view(), stream, mr);
}

/// Column / column decimal division.
///
/// @param lhs Dividend column (DECIMAL64 or DECIMAL128).
/// @param rhs Divisor column; must have the same size and type id as \p lhs.
/// @param outputType Result type. DECIMAL64 inputs may produce DECIMAL64 or
/// DECIMAL128; DECIMAL128 inputs must produce DECIMAL128.
/// @param aRescale Index into DecimalUtil::kPowersOfTen selecting the factor
/// forwarded to the kernel; must be in [0, LongDecimalType::kMaxPrecision].
/// @param stream CUDA stream used for allocations and the kernel call.
/// @param mr Device memory resource for the returned column.
/// @return A column whose null mask is the AND of both input masks, with
/// additional nulls scattered at rows where the divisor is zero.
///
/// Raises a user error "Decimal overflow" when \p aRescale exceeds the bound
/// or when the kernel wrapper returns false.
std::unique_ptr<cudf::column> decimalDivide(
    const cudf::column_view& lhs,
    const cudf::column_view& rhs,
    cudf::data_type outputType,
    int32_t aRescale,
    rmm::cuda_stream_view stream,
    rmm::device_async_resource_ref mr) {
  VELOX_CHECK(lhs.size() == rhs.size(), "Decimal divide requires equal sizes");
  // cudf::type_id has no fmt formatter, so the comparison stays inside a plain
  // VELOX_CHECK rather than VELOX_CHECK_EQ.
  VELOX_CHECK(
      lhs.type().id() == rhs.type().id(),
      "Decimal divide requires matching input types");
  VELOX_CHECK_GE(
      aRescale, 0, "Decimal divide requires non-negative rescale factor");
  // aRescale indexes DecimalUtil::kPowersOfTen below; the bound is the same
  // one Presto divide init uses.
  VELOX_USER_CHECK_LE(
      aRescale, LongDecimalType::kMaxPrecision, "Decimal overflow");

  const auto inType = lhs.type().id();
  const auto outType = outputType.id();
  VELOX_CHECK(
      inType == cudf::type_id::DECIMAL64 || inType == cudf::type_id::DECIMAL128,
      "Unsupported input type for decimal divide");
  if (inType == cudf::type_id::DECIMAL128) {
    // Long decimal inputs can only produce a long decimal result.
    VELOX_CHECK(
        outType == cudf::type_id::DECIMAL128,
        "Unexpected output type for decimal divide");
  } else {
    // Short decimal inputs may stay short or widen.
    VELOX_CHECK(
        outType == cudf::type_id::DECIMAL64 ||
            outType == cudf::type_id::DECIMAL128,
        "Unexpected output type for decimal divide");
  }

  // A row is null in the result if it is null in either operand.
  auto combinedNulls =
      cudf::bitmask_and(cudf::table_view({lhs, rhs}), stream, mr);

  // Allocate the result with the merged mask; the kernel fills in the data.
  auto out = cudf::make_fixed_width_column(
      outputType,
      lhs.size(),
      std::move(combinedNulls.first),
      combinedNulls.second,
      stream,
      mr);

  const __int128_t rescaleFactor = DecimalUtil::kPowersOfTen[aRescale];
  VELOX_USER_CHECK(
      detail::decimalDivideColumnColumn(
          inType,
          outType,
          lhs,
          rhs,
          out->mutable_view(),
          rescaleFactor,
          stream),
      "Decimal overflow");

  // Rows with a zero divisor become null.
  return scatterNullsAtZeroDivisor(std::move(out), rhs, stream, mr);
}

/// Column / scalar decimal division.
///
/// @param lhs Dividend column (DECIMAL64 or DECIMAL128).
/// @param rhs Divisor scalar; read through getDecimalScalarValue.
/// @param outputType Result type. DECIMAL64 input may produce DECIMAL64 or
/// DECIMAL128; DECIMAL128 input must produce DECIMAL128.
/// @param aRescale Index into DecimalUtil::kPowersOfTen selecting the factor
/// forwarded to the kernel; must be in [0, LongDecimalType::kMaxPrecision].
/// @param stream CUDA stream used for allocations and the kernel call.
/// @param mr Device memory resource for the returned column.
/// @return A column of \p outputType with lhs.size() rows. Every row is null
/// when \p rhs is NULL or zero; otherwise the null mask is a copy of the
/// mask of \p lhs.
///
/// Raises a user error "Decimal overflow" when \p aRescale exceeds the bound
/// or when the kernel wrapper returns false.
std::unique_ptr<cudf::column> decimalDivide(
    const cudf::column_view& lhs,
    const cudf::scalar& rhs,
    cudf::data_type outputType,
    int32_t aRescale,
    rmm::cuda_stream_view stream,
    rmm::device_async_resource_ref mr) {
  VELOX_CHECK_GE(
      aRescale, 0, "Decimal divide requires non-negative rescale factor");
  // Rescale indexes DecimalUtil::kPowersOfTen; same bound as Presto divide
  // init.
  VELOX_USER_CHECK_LE(
      aRescale, LongDecimalType::kMaxPrecision, "Decimal overflow");

  // NULL divisor: every row of the result is null.
  if (!rhs.is_valid(stream)) {
    return makeAllNullDecimalColumn(outputType, lhs.size(), stream, mr);
  }

  // Validate the type combination before allocating device memory or reading
  // the scalar, so invalid calls fail without doing device work.
  const auto inType = lhs.type().id();
  const auto outType = outputType.id();
  VELOX_CHECK(
      inType == cudf::type_id::DECIMAL64 || inType == cudf::type_id::DECIMAL128,
      "Unsupported input type for decimal divide");
  if (inType == cudf::type_id::DECIMAL64) {
    VELOX_CHECK(
        outType == cudf::type_id::DECIMAL64 ||
            outType == cudf::type_id::DECIMAL128,
        "Unexpected output type for decimal divide");
  } else {
    VELOX_CHECK(
        outType == cudf::type_id::DECIMAL128,
        "Unexpected output type for decimal divide");
  }

  const auto rhsValue = getDecimalScalarValue(rhs, stream);

  // Zero divisor: the column-divisor overloads null out rows whose divisor is
  // zero (scatterNullsAtZeroDivisor). With a scalar divisor that applies to
  // every row, so return an all-null column and skip the kernel.
  if (rhsValue == 0) {
    return makeAllNullDecimalColumn(outputType, lhs.size(), stream, mr);
  }

  // The result inherits the null mask of the dividend column.
  auto nullMask = cudf::copy_bitmask(lhs, stream, mr);
  auto out = cudf::make_fixed_width_column(
      outputType,
      lhs.size(),
      std::move(nullMask),
      lhs.null_count(),
      stream,
      mr);

  const __int128_t rescaleFactor = DecimalUtil::kPowersOfTen[aRescale];
  VELOX_USER_CHECK(
      detail::decimalDivideColumnScalar(
          inType,
          outType,
          lhs,
          rhsValue,
          out->mutable_view(),
          rescaleFactor,
          stream),
      "Decimal overflow");

  return out;
}

/// Scalar / column decimal division.
///
/// @param lhs Dividend scalar; read through getDecimalScalarValue.
/// @param rhs Divisor column (DECIMAL64 or DECIMAL128).
/// @param outputType Result type. DECIMAL64 input may produce DECIMAL64 or
/// DECIMAL128; DECIMAL128 input must produce DECIMAL128.
/// @param aRescale Index into DecimalUtil::kPowersOfTen selecting the factor
/// forwarded to the kernel; must be in [0, LongDecimalType::kMaxPrecision].
/// @param stream CUDA stream used for allocations and the kernel call.
/// @param mr Device memory resource for the returned column.
/// @return A column of \p outputType with rhs.size() rows. Every row is null
/// when \p lhs is NULL; otherwise the mask of \p rhs is copied and further
/// nulls are scattered at rows where the divisor is zero.
///
/// Raises a user error "Decimal overflow" when \p aRescale exceeds the bound
/// or when the kernel wrapper returns false.
std::unique_ptr<cudf::column> decimalDivide(
    const cudf::scalar& lhs,
    const cudf::column_view& rhs,
    cudf::data_type outputType,
    int32_t aRescale,
    rmm::cuda_stream_view stream,
    rmm::device_async_resource_ref mr) {
  VELOX_CHECK_GE(
      aRescale, 0, "Decimal divide requires non-negative rescale factor");
  // aRescale indexes DecimalUtil::kPowersOfTen below; the bound is the same
  // one Presto divide init uses.
  VELOX_USER_CHECK_LE(
      aRescale, LongDecimalType::kMaxPrecision, "Decimal overflow");

  // A NULL dividend makes every row of the result null.
  if (!lhs.is_valid(stream)) {
    return makeAllNullDecimalColumn(outputType, rhs.size(), stream, mr);
  }

  // The result starts out with the divisor column's null mask.
  auto divisorNulls = cudf::copy_bitmask(rhs, stream, mr);
  auto out = cudf::make_fixed_width_column(
      outputType,
      rhs.size(),
      std::move(divisorNulls),
      rhs.null_count(),
      stream,
      mr);

  auto lhsValue = getDecimalScalarValue(lhs, stream);

  const auto inType = rhs.type().id();
  const auto outType = outputType.id();
  VELOX_CHECK(
      inType == cudf::type_id::DECIMAL64 || inType == cudf::type_id::DECIMAL128,
      "Unsupported input type for decimal divide");
  if (inType == cudf::type_id::DECIMAL128) {
    // Long decimal input can only produce a long decimal result.
    VELOX_CHECK(
        outType == cudf::type_id::DECIMAL128,
        "Unexpected output type for decimal divide");
  } else {
    // Short decimal input may stay short or widen.
    VELOX_CHECK(
        outType == cudf::type_id::DECIMAL64 ||
            outType == cudf::type_id::DECIMAL128,
        "Unexpected output type for decimal divide");
  }

  const __int128_t rescaleFactor = DecimalUtil::kPowersOfTen[aRescale];
  VELOX_USER_CHECK(
      detail::decimalDivideScalarColumn(
          inType,
          outType,
          lhsValue,
          rhs,
          out->mutable_view(),
          rescaleFactor,
          stream),
      "Decimal overflow");

  // Rows with a zero divisor become null.
  return scatterNullsAtZeroDivisor(std::move(out), rhs, stream, mr);
}

} // namespace facebook::velox::cudf_velox
Loading
Loading