facebookincubator · simoneves · Jun 11, 2026 · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026
@@ -489,6 +489,9 @@ endif()
 
 if(VELOX_ENABLE_WAVE OR VELOX_ENABLE_CUDF)
   enable_language(CUDA)
+  # Use same C++ standard throughout
+  set(CMAKE_CUDA_STANDARD ${CMAKE_CXX_STANDARD})
+  set(CMAKE_CUDA_STANDARD_REQUIRED ${CMAKE_CXX_STANDARD_REQUIRED})
   # Determine CUDA_ARCHITECTURES automatically.
   cmake_policy(SET CMP0104 NEW)
   if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)

@@ -18,7 +18,7 @@ add_library(
   AstExpression.cpp
   CommonFunctions.cpp
   DecimalExpressionKernels.cpp
-  DecimalExpressionKernels.cu
+  DecimalExpressionKernelsGpu.cu
   ExpressionEvaluator.cpp
   JitExpression.cpp
   PrestoFunctions.cpp
@@ -36,5 +36,3 @@ target_link_libraries(
 )
 
 target_compile_options(velox_cudf_expression PRIVATE -Wno-missing-field-initializers)
-
-set_target_properties(velox_cudf_expression PROPERTIES CUDA_STANDARD 20 CUDA_STANDARD_REQUIRED ON)
@@ -15,14 +15,51 @@
  */
 #include "velox/experimental/cudf/expression/AstPrinter.h"
 #include "velox/experimental/cudf/expression/DecimalExpressionKernels.h"
+#include "velox/experimental/cudf/expression/DecimalExpressionKernelsGpu.h"
 
 #include "velox/common/base/Exceptions.h"
+#include "velox/type/DecimalUtil.h"
+#include "velox/type/Type.h"
 
 #include <cudf/binaryop.hpp>
+#include <cudf/column/column_factories.hpp>
 #include <cudf/copying.hpp>
+#include <cudf/fixed_point/fixed_point.hpp>
+#include <cudf/null_mask.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
+#include <cudf/table/table_view.hpp>
 
 namespace facebook::velox::cudf_velox {
+namespace {
+
+/// Returns the value stored in decimal scalar \p s, as reported by
+/// fixed_point_scalar::value(\p stream), widened to 128 bits.
+///
+/// \p s must be a DECIMAL64 or DECIMAL128 scalar. Any other type fails a
+/// VELOX_CHECK rather than being reinterpreted through an invalid downcast.
+/// The validity (null-ness) of \p s is not examined here; callers check
+/// is_valid() before calling.
+__int128_t getDecimalScalarValue(
+    const cudf::scalar& s,
+    rmm::cuda_stream_view stream) {
+  const auto typeId = s.type().id();
+  if (typeId == cudf::type_id::DECIMAL64) {
+    auto const& dec =
+        static_cast<cudf::fixed_point_scalar<numeric::decimal64> const&>(s);
+    return static_cast<__int128_t>(static_cast<int64_t>(dec.value(stream)));
+  }
+  // Guard the downcast below: casting a scalar of any other type to a
+  // DECIMAL128 fixed-point scalar would be undefined behaviour. Plain
+  // VELOX_CHECK (not _EQ) keeps cudf::type_id out of fmt, which has no
+  // formatter for that enum.
+  VELOX_CHECK(
+      typeId == cudf::type_id::DECIMAL128,
+      "Unsupported scalar type for decimal divide");
+  auto const& dec =
+      static_cast<cudf::fixed_point_scalar<numeric::decimal128> const&>(s);
+  return static_cast<__int128_t>(dec.value(stream));
+}
+
+/// Builds a column of type \p outputType with \p size rows, every one of them
+/// null (used e.g. when a scalar operand is NULL). A \p size of zero yields
+/// an empty column of the requested type instead.
+std::unique_ptr<cudf::column> makeAllNullDecimalColumn(
+    cudf::data_type outputType,
+    cudf::size_type size,
+    rmm::cuda_stream_view stream,
+    rmm::device_async_resource_ref mr) {
+  if (size != 0) {
+    return cudf::make_fixed_width_column(
+        outputType, size, cudf::mask_state::ALL_NULL, stream, mr);
+  }
+  // No rows requested: hand back an empty column of the same type.
+  return cudf::make_empty_column(outputType);
+}
+
+} // namespace
 
 // Scatters null values to positions where the divisor is zero.
 // Returns a new column with nulls at zero-divisor positions.
@@ -68,4 +105,179 @@ std::unique_ptr<cudf::column> scatterNullsAtZeroDivisor(
       *nullScalar, *result, divisorIsZero->view(), stream, mr);
 }
 
+/// Decimal division of two columns (\p lhs / \p rhs), producing a new column
+/// of type \p outputType.
+///
+/// Preconditions checked here:
+///  - \p lhs and \p rhs have the same number of rows and the same type id;
+///  - the input type is DECIMAL64 or DECIMAL128;
+///  - the output type is DECIMAL64 or DECIMAL128 for DECIMAL64 inputs, and
+///    DECIMAL128 for DECIMAL128 inputs;
+///  - 0 <= \p aRescale <= LongDecimalType::kMaxPrecision (a larger value is
+///    reported as a user error "Decimal overflow").
+///
+/// The output null mask starts as the combination of both input null masks.
+/// The per-row arithmetic is delegated to detail::decimalDivideColumnColumn,
+/// which receives DecimalUtil::kPowersOfTen[aRescale] as the rescale factor;
+/// a false return from it is raised as a user error "Decimal overflow".
+/// Finally, rows whose divisor is zero are nulled out via
+/// scatterNullsAtZeroDivisor.
+///
+/// Device work is issued on \p stream and allocations use \p mr.
+std::unique_ptr<cudf::column> decimalDivide(
+    const cudf::column_view& lhs,
+    const cudf::column_view& rhs,
+    cudf::data_type outputType,
+    int32_t aRescale,
+    rmm::cuda_stream_view stream,
+    rmm::device_async_resource_ref mr) {
+  VELOX_CHECK(lhs.size() == rhs.size(), "Decimal divide requires equal sizes");
+  // Use VELOX_CHECK (not _EQ) so failed checks do not pass cudf::type_id into
+  // fmt, which has no formatter for that enum.
+  VELOX_CHECK(
+      lhs.type().id() == rhs.type().id(),
+      "Decimal divide requires matching input types");
+  VELOX_CHECK_GE(
+      aRescale, 0, "Decimal divide requires non-negative rescale factor");
+  // Rescale indexes DecimalUtil::kPowersOfTen; same bound as Presto divide
+  // init.
+  VELOX_USER_CHECK_LE(
+      aRescale, LongDecimalType::kMaxPrecision, "Decimal overflow");
+
+  // Both inputs share one type id (checked above), so lhs alone determines
+  // the input type used for validation and kernel dispatch.
+  const auto inType = lhs.type().id();
+  const auto outType = outputType.id();
+  VELOX_CHECK(
+      inType == cudf::type_id::DECIMAL64 || inType == cudf::type_id::DECIMAL128,
+      "Unsupported input type for decimal divide");
+  // DECIMAL64 inputs may produce either width; DECIMAL128 inputs must produce
+  // DECIMAL128.
+  if (inType == cudf::type_id::DECIMAL64) {
+    VELOX_CHECK(
+        outType == cudf::type_id::DECIMAL64 ||
+            outType == cudf::type_id::DECIMAL128,
+        "Unexpected output type for decimal divide");
+  } else {
+    VELOX_CHECK(
+        outType == cudf::type_id::DECIMAL128,
+        "Unexpected output type for decimal divide");
+  }
+
+  // Combine input null masks (lhs and rhs nulls).
+  auto [nullMask, nullCount] =
+      cudf::bitmask_and(cudf::table_view({lhs, rhs}), stream, mr);
+
+  // Create output column with input null mask and perform division.
+  auto out = cudf::make_fixed_width_column(
+      outputType, lhs.size(), std::move(nullMask), nullCount, stream, mr);
+
+  // aRescale was bounds-checked above, so this table lookup is in range.
+  const __int128_t rescaleFactor = DecimalUtil::kPowersOfTen[aRescale];
+  // The kernel writes into `out` and signals failure by returning false,
+  // which is surfaced to the user as a decimal overflow.
+  VELOX_USER_CHECK(
+      detail::decimalDivideColumnColumn(
+          inType,
+          outType,
+          lhs,
+          rhs,
+          out->mutable_view(),
+          rescaleFactor,
+          stream),
+      "Decimal overflow");
+
+  // Scatter nulls where divisor is zero.
+  return scatterNullsAtZeroDivisor(std::move(out), rhs, stream, mr);
+}
+
+/// Decimal division of a column by a scalar (\p lhs / \p rhs), producing a
+/// new column of type \p outputType with one row per row of \p lhs.
+///
+/// Result nulls:
+///  - a NULL \p rhs yields an all-null column;
+///  - a zero \p rhs yields an all-null column, matching the overloads that
+///    take a column divisor, which null out every zero-divisor row;
+///  - otherwise the output carries a copy of the null mask of \p lhs.
+///
+/// Preconditions checked here: 0 <= \p aRescale <=
+/// LongDecimalType::kMaxPrecision (a larger value is a user error "Decimal
+/// overflow"); \p lhs is DECIMAL64 or DECIMAL128; the output type is
+/// DECIMAL64 or DECIMAL128 for a DECIMAL64 input and DECIMAL128 for a
+/// DECIMAL128 input.
+///
+/// The per-row arithmetic is delegated to detail::decimalDivideColumnScalar,
+/// which receives DecimalUtil::kPowersOfTen[aRescale] as the rescale factor;
+/// a false return from it is raised as a user error "Decimal overflow".
+std::unique_ptr<cudf::column> decimalDivide(
+    const cudf::column_view& lhs,
+    const cudf::scalar& rhs,
+    cudf::data_type outputType,
+    int32_t aRescale,
+    rmm::cuda_stream_view stream,
+    rmm::device_async_resource_ref mr) {
+  VELOX_CHECK_GE(
+      aRescale, 0, "Decimal divide requires non-negative rescale factor");
+  // Rescale indexes DecimalUtil::kPowersOfTen; same bound as Presto divide
+  // init.
+  VELOX_USER_CHECK_LE(
+      aRescale, LongDecimalType::kMaxPrecision, "Decimal overflow");
+
+  // A NULL divisor makes every output row NULL.
+  if (!rhs.is_valid(stream)) {
+    return makeAllNullDecimalColumn(outputType, lhs.size(), stream, mr);
+  }
+
+  // Validate the types before copying the mask, allocating the output, or
+  // reading the scalar, so invalid input fails without doing device work.
+  const auto inType = lhs.type().id();
+  const auto outType = outputType.id();
+  VELOX_CHECK(
+      inType == cudf::type_id::DECIMAL64 || inType == cudf::type_id::DECIMAL128,
+      "Unsupported input type for decimal divide");
+  if (inType == cudf::type_id::DECIMAL64) {
+    VELOX_CHECK(
+        outType == cudf::type_id::DECIMAL64 ||
+            outType == cudf::type_id::DECIMAL128,
+        "Unexpected output type for decimal divide");
+  } else {
+    VELOX_CHECK(
+        outType == cudf::type_id::DECIMAL128,
+        "Unexpected output type for decimal divide");
+  }
+
+  const auto rhsValue = getDecimalScalarValue(rhs, stream);
+  // The overloads with a column divisor turn every zero-divisor row into a
+  // null. With a scalar divisor that applies to all rows at once, so skip the
+  // kernel entirely instead of handing it a zero divisor.
+  if (rhsValue == 0) {
+    return makeAllNullDecimalColumn(outputType, lhs.size(), stream, mr);
+  }
+
+  // The output carries a copy of the lhs null mask.
+  auto nullMask = cudf::copy_bitmask(lhs, stream, mr);
+  auto nullCount = lhs.null_count();
+  auto out = cudf::make_fixed_width_column(
+      outputType, lhs.size(), std::move(nullMask), nullCount, stream, mr);
+
+  // The kernel writes into `out` and signals failure by returning false,
+  // which is surfaced to the user as a decimal overflow.
+  VELOX_USER_CHECK(
+      detail::decimalDivideColumnScalar(
+          inType,
+          outType,
+          lhs,
+          rhsValue,
+          out->mutable_view(),
+          DecimalUtil::kPowersOfTen[aRescale],
+          stream),
+      "Decimal overflow");
+
+  return out;
+}
+
+/// Decimal division of a scalar by a column (\p lhs / \p rhs), producing a
+/// new column of type \p outputType with one row per row of \p rhs.
+///
+/// Result nulls:
+///  - a NULL \p lhs yields an all-null column;
+///  - otherwise the output starts from a copy of the null mask of \p rhs, and
+///    rows whose divisor is zero are additionally nulled out via
+///    scatterNullsAtZeroDivisor.
+///
+/// Preconditions checked here: 0 <= \p aRescale <=
+/// LongDecimalType::kMaxPrecision (a larger value is a user error "Decimal
+/// overflow"); \p rhs is DECIMAL64 or DECIMAL128; the output type is
+/// DECIMAL64 or DECIMAL128 for a DECIMAL64 input and DECIMAL128 for a
+/// DECIMAL128 input.
+///
+/// The per-row arithmetic is delegated to detail::decimalDivideScalarColumn,
+/// which receives DecimalUtil::kPowersOfTen[aRescale] as the rescale factor;
+/// a false return from it is raised as a user error "Decimal overflow".
+std::unique_ptr<cudf::column> decimalDivide(
+    const cudf::scalar& lhs,
+    const cudf::column_view& rhs,
+    cudf::data_type outputType,
+    int32_t aRescale,
+    rmm::cuda_stream_view stream,
+    rmm::device_async_resource_ref mr) {
+  VELOX_CHECK_GE(
+      aRescale, 0, "Decimal divide requires non-negative rescale factor");
+  // Rescale indexes DecimalUtil::kPowersOfTen; same bound as Presto divide
+  // init.
+  VELOX_USER_CHECK_LE(
+      aRescale, LongDecimalType::kMaxPrecision, "Decimal overflow");
+
+  // A NULL dividend makes every output row NULL.
+  if (!lhs.is_valid(stream)) {
+    return makeAllNullDecimalColumn(outputType, rhs.size(), stream, mr);
+  }
+
+  // Validate the types before copying the mask, allocating the output, or
+  // reading the scalar, as the column/column overload does, so invalid input
+  // fails without doing device work.
+  const auto inType = rhs.type().id();
+  const auto outType = outputType.id();
+  VELOX_CHECK(
+      inType == cudf::type_id::DECIMAL64 || inType == cudf::type_id::DECIMAL128,
+      "Unsupported input type for decimal divide");
+  if (inType == cudf::type_id::DECIMAL64) {
+    VELOX_CHECK(
+        outType == cudf::type_id::DECIMAL64 ||
+            outType == cudf::type_id::DECIMAL128,
+        "Unexpected output type for decimal divide");
+  } else {
+    VELOX_CHECK(
+        outType == cudf::type_id::DECIMAL128,
+        "Unexpected output type for decimal divide");
+  }
+
+  // Copy rhs null mask.
+  auto nullMask = cudf::copy_bitmask(rhs, stream, mr);
+  auto nullCount = rhs.null_count();
+
+  // Create output column and perform division.
+  auto out = cudf::make_fixed_width_column(
+      outputType, rhs.size(), std::move(nullMask), nullCount, stream, mr);
+
+  const auto lhsValue = getDecimalScalarValue(lhs, stream);
+
+  // aRescale was bounds-checked above, so this table lookup is in range.
+  const __int128_t rescaleFactor = DecimalUtil::kPowersOfTen[aRescale];
+  // The kernel writes into `out` and signals failure by returning false,
+  // which is surfaced to the user as a decimal overflow.
+  VELOX_USER_CHECK(
+      detail::decimalDivideScalarColumn(
+          inType,
+          outType,
+          lhsValue,
+          rhs,
+          out->mutable_view(),
+          rescaleFactor,
+          stream),
+      "Decimal overflow");
+
+  // Scatter nulls where divisor is zero.
+  return scatterNullsAtZeroDivisor(std::move(out), rhs, stream, mr);
+}
+
 } // namespace facebook::velox::cudf_velox