diff --git a/velox/CMakeLists.txt b/velox/CMakeLists.txt index f15492a2e11..efc79bcd14d 100644 --- a/velox/CMakeLists.txt +++ b/velox/CMakeLists.txt @@ -71,6 +71,7 @@ if(VELOX_ENABLE_WAVE OR VELOX_ENABLE_CUDF) add_subdirectory(experimental/gpu) if(${VELOX_ENABLE_CUDF}) add_subdirectory(experimental/cudf) + add_subdirectory(cudf-binop-scalar-repro) endif() if(${VELOX_ENABLE_WAVE}) add_subdirectory(experimental/wave) diff --git a/velox/cudf-binop-scalar-repro/CMakeLists.txt b/velox/cudf-binop-scalar-repro/CMakeLists.txt new file mode 100644 index 00000000000..100645dfda5 --- /dev/null +++ b/velox/cudf-binop-scalar-repro/CMakeLists.txt @@ -0,0 +1,63 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# cuDF requires CMake >= 3.30.4; Velox currently allows 3.28+ at the root. +cmake_minimum_required(VERSION 3.30.4) + +# Standalone: cmake -S . -B build -DCMAKE_PREFIX_PATH=/opt/velox-build/release +# In-tree: cmake --build --target cudf_nullmax_scalar_repro +if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) + project(cudf_binop_scalar_repro LANGUAGES CXX CUDA) + + set(CMAKE_CXX_STANDARD 20) + set(CMAKE_CXX_STANDARD_REQUIRED ON) + cmake_policy(SET CMP0104 NEW) + + if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + message( + FATAL_ERROR + "Standalone configure requires -DCMAKE_CUDA_ARCHITECTURES=... (e.g. 80)" + ) + endif() + + find_package(CUDAToolkit REQUIRED) + find_package(cudf REQUIRED) +endif() + +if(NOT TARGET cudf::cudf) + message( + FATAL_ERROR + "cudf::cudf not found. Build from the Velox tree with VELOX_ENABLE_CUDF=ON, " + "or pass -DCMAKE_PREFIX_PATH= for a standalone configure." + ) +endif() + +add_executable(cudf_nullmax_scalar_repro main.cpp) + +set_target_properties( + cudf_nullmax_scalar_repro + PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + CUDA_STANDARD 20 + CUDA_STANDARD_REQUIRED ON +) + +target_link_libraries( + cudf_nullmax_scalar_repro + PRIVATE cudf::cudf CUDA::cudart +) + +target_compile_options( + cudf_nullmax_scalar_repro + PRIVATE -Wno-missing-field-initializers +) diff --git a/velox/cudf-binop-scalar-repro/PLAN-CUDF-UPSTREAM-FIX.md b/velox/cudf-binop-scalar-repro/PLAN-CUDF-UPSTREAM-FIX.md new file mode 100644 index 00000000000..535bee4402a --- /dev/null +++ b/velox/cudf-binop-scalar-repro/PLAN-CUDF-UPSTREAM-FIX.md @@ -0,0 +1,224 @@ +# cuDF upstream fix: scalar validity vs column bitmask in compiled binops + +**Status (2026-06):** Root cause identified; minimal repro in Velox tree; Velox workaround implemented; upstream patch not yet filed. + +## Summary + +There is a **real cuDF bug** in the compiled binary-operation path when one operand is a **scalar** and the operator is **null-aware**. Scalar validity is stored as a single device `bool`; the scalar-as-column wrapper exposes it as a column null **bitmask**, and kernels call `bit_is_set`, which reads 32 bits from a 1-byte allocation. + +This is **not** a Velox scalar construction bug. cuDF’s own column-vs-scalar tests for `ADD`/`DIV`/`MOD`/comparisons pass because those kernels **do not** call `is_valid()` on operands inside the device loop. + +Velox ships a workaround (scalar broadcast) — see `PLAN-VELOX-WORKAROUND.md`. + +--- + +## Root cause + +### 1. Scalar validity representation + +`cudf::scalar` stores validity as: + +```cpp +// cpp/include/cudf/scalar/scalar.hpp +cudf::detail::device_scalar _is_valid; +``` + +`validity_data()` returns `bool*` (1 byte on device). + +### 2. Scalar wrapped as column_view + +`scalar_as_column_view` in `cpp/src/binaryop/compiled/binary_ops.cu`: + +```cpp +auto col_v = column_view(s.type(), + 1, + h_scalar_type_view.data(), + reinterpret_cast(s.validity_data()), + !s.is_valid(stream)); +``` + +The `reinterpret_cast +Access at ... is out of bounds +and is inside the nearest allocation at ... of size 1 bytes +``` + +Matches: 4-byte `bit_is_set` read from 1-byte scalar validity buffer. + +### 5. Entry point sets scalar flag correctly + +Vector-scalar path in `binary_ops.cu` is correct **except** for the validity representation: + +```cpp +auto [rhsv, aux] = scalar_to_column_view(rhs, stream); +operator_dispatcher(out, lhs, rhsv, false, true, op, stream); // is_rhs_scalar = true +``` + +The bug is **not** a missing `is_rhs_scalar` flag; it is **invalid null-mask plumbing**. + +--- + +## Affected vs unaffected ops + +| Operator class | Reads operand validity in kernel? | Column-vs-scalar fixed_point | +|----------------|-----------------------------------|------------------------------| +| `NULL_MAX`, `NULL_MIN` | Yes | **Broken** | +| `NULL_EQUALS`, `NULL_NOT_EQUALS` | Yes | **Broken** | +| `NULL_LOGICAL_AND`, `NULL_LOGICAL_OR` | Yes | **Broken** (if used) | +| `ADD`, `SUB`, `MUL`, `MOD`, `DIV`, … | No | OK (cuDF tests, 1000 rows) | +| `EQUAL`, `NOT_EQUAL`, `LESS`, … | No (non-null-aware `ops::Equal`) | OK | +| `GREATER_EQUAL`, `LESS_EQUAL`, … | No | OK | + +### Test gap in cuDF + +`cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp` has: + +- Column/column `NULL_MAX` / `NULL_MIN` +- Column/scalar `ADD`, `DIV`, `MOD`, comparisons (multi-row) + +It does **not** appear to have column/scalar `NULL_MAX` / `NULL_MIN` on fixed_point with **N > 1** rows — the exact Velox `greatest(a, literal, b)` pattern. + +--- + +## Proposed cuDF fixes (pick one) + +### Option A — Fix `scalar_as_column_view` (recommended) + +Build a **proper 1-element null mask** instead of casting `bool*`: + +```cpp +// Pseudocode +if (s.is_valid(stream)) { + // null_mask = nullptr, null_count = 0 → nullable() == false, is_valid() short-circuits true +} else { + // allocate bitmask with bit 0 clear, null_count = 1 +} +``` + +For **invalid** scalars, must satisfy `column_view` invariants (`null_count > 0` requires non-null `null_mask`). + +Simplest robust approach: **`make_column_from_scalar(s, 1, stream, mr)`** and use the returned column’s view (same strategy as the Velox workaround). Slight allocation cost, correct semantics. + +### Option B — Branch in `ops_wrapper` on `is_*_scalar` + +When `is_rhs_scalar`, read validity via `scalar_device_view` / single `bool` load instead of `column_device_view::is_valid()`. + +Requires passing scalar device views into the kernel or a side channel — larger API change. + +### Option C — Host-only validity for scalars + +For null-aware ops with one scalar operand, compute output null mask on host (similar to non-null-aware ops today) and use a kernel variant that skips validity reads. + +More invasive; duplicates logic. + +**Recommendation:** Option A in `scalar_as_column_view` — localized, matches existing `make_column_from_scalar` behavior, fixes all null-aware ops at once. + +--- + +## Minimal reproduction (Velox tree) + +Location: + +```text +velox/cudf-binop-scalar-repro/ +├── main.cpp +├── CMakeLists.txt +├── README.md +├── PLAN-VELOX-WORKAROUND.md +└── PLAN-CUDF-UPSTREAM-FIX.md (this file) +``` + +### Build in-tree + +```bash +cmake --build /opt/velox-build/release --target cudf_nullmax_scalar_repro -j +``` + +### Build standalone + +```bash +cd velox/cudf-binop-scalar-repro +cmake -S . -B build \ + -DCMAKE_PREFIX_PATH=/opt/velox-build/release \ + -DCMAKE_CUDA_ARCHITECTURES=80 +cmake --build build -j +``` + +### Run sanitizer + +```bash +compute-sanitizer --tool memcheck ./build/cudf_nullmax_scalar_repro +compute-sanitizer --tool memcheck ./build/cudf_nullmax_scalar_repro --only null_max +``` + +### Expected results + +| `--only` | Op | Sanitizer | +|----------|-----|-----------| +| `null_max` | `NULL_MAX` | **OOB** (bug) | +| `null_min` | `NULL_MIN` | **OOB** | +| `null_equals` | `NULL_EQUALS` | **OOB** | +| `equal` | `EQUAL` | Clean (control) | +| `add` | `ADD` | Clean (control) | + +--- + +## Suggested upstream issue / PR outline + +**Title:** `Compiled binops: OOB read when null-aware ops use column-vs-scalar fixed_point` + +**Body:** + +1. Link to repro (`velox/cudf-binop-scalar-repro/`) or inline gtest added to `binop-compiled-fixed_point-test.cpp`. +2. Explain `bool*` vs `bitmask_type*` in `scalar_as_column_view`. +3. Stack: `apply_binary_op` → `ops_wrapper` → `is_valid(0)` → `bit_is_set`. +4. Proposed fix: Option A above. +5. Add test: + +```cpp +// FixedPointCompiledTest, new case +auto col = fp_wrapper({40, 30, 20}, scale_type{-2}); +auto scalar = cudf::make_fixed_point_scalar(500, scale_type{-2}); +auto result = cudf::binary_operation(col, *scalar, cudf::binary_operator::NULL_MAX, type); +// + compute-sanitizer in CI if available +``` + +**Velox cuDF pin:** `VELOX_cudf_COMMIT d09d10d` (26.06) — verify on current `rapidsai/cudf` main as well. + +**Local cuDF checkout for inspection:** `/home/seves/work/rapidsai/cudf` + +--- + +## Verification after upstream fix + +1. cuDF: new gtest + sanitizer clean on repro cases. +2. Velox: drop scalar broadcast workaround where no longer needed; keep `castDecimalScalar`. +3. Velox: `compute-sanitizer` on `decimalGreatestLeastMixed` and full `velox_cudf_decimal_expression_test`. + +--- + +## Related Velox work + +See `PLAN-VELOX-WORKAROUND.md` for the implemented Velox-side workaround: + +- `CudfBinaryOpUtils.{h,cpp}` — broadcast helpers +- `ExpressionEvaluator.cpp` — `BinaryFunction`, `BetweenFunction`, `GreatestLeastFunction` +- `DecimalExpressionKernels.cpp` — `scatterNullsAtZeroDivisor` diff --git a/velox/cudf-binop-scalar-repro/PLAN-VELOX-WORKAROUND.md b/velox/cudf-binop-scalar-repro/PLAN-VELOX-WORKAROUND.md new file mode 100644 index 00000000000..f67854a1ba5 --- /dev/null +++ b/velox/cudf-binop-scalar-repro/PLAN-VELOX-WORKAROUND.md @@ -0,0 +1,162 @@ +# Velox workaround: cuDF column-vs-scalar fixed_point binary ops + +**Status (2026-06):** Implemented. Stash popped; all changes below are in the working tree. + +## Problem summary + +cuDF’s compiled binary-op path wraps scalars as 1-row `column_view` objects but reinterprets the scalar’s **1-byte device `bool` validity** as a **column bitmask**. Null-aware kernels (`NULL_MAX`, `NULL_MIN`, `NULL_EQUALS`, …) call `column_device_view::is_valid()`, which uses `bit_is_set` and reads a **4-byte word** — causing OOB reads under compute-sanitizer and undefined behavior at runtime. + +Velox originally hit this via `greatest` / `least` with decimal literals (`NULL_MAX` / `NULL_MIN` column-vs-scalar). Stream corruption from those OOB reads could also cause downstream divide tests to fail. + +**Root cause is in cuDF** — see `PLAN-CUDF-UPSTREAM-FIX.md` and `velox/cudf-binop-scalar-repro/`. + +--- + +## Implemented workaround + +Broadcast the scalar to a full column via `cudf::make_column_from_scalar`, then call column-vs-column `cudf::binary_operation`. This avoids the broken scalar-as-column validity path entirely. + +### New files + +| File | Role | +|------|------| +| `velox/experimental/cudf/expression/CudfBinaryOpUtils.h` | Shared API | +| `velox/experimental/cudf/expression/CudfBinaryOpUtils.cpp` | Implementation | + +**API** (`facebook::velox::cudf_velox`): + +- `castDecimalScalar(src, targetType, stream, mr)` — decimal scale/precision alignment (moved from `ExpressionEvaluator.cpp`) +- `binaryOpColumnWithBroadcastScalar(col, scalar, op, outType, stream, mr)` +- `binaryOpScalarWithBroadcastColumn(scalar, col, op, outType, stream, mr)` + +### Build integration + +`CudfBinaryOpUtils.cpp` added to `velox/experimental/cudf/expression/CMakeLists.txt` in the `velox_cudf_expression` library. + +### Call sites updated + +#### 1. `BinaryFunction::eval` (`ExpressionEvaluator.cpp`) + +For **all** `fixed_point` column-vs-scalar paths (except divide, which uses custom kernels): + +| Op category | Path | +|-------------|------| +| Comparisons (`LESS`, `GREATER`, `EQUAL`, …) | Scale-align via `castDecimalScalar` + broadcast | +| `ADD`, `SUB`, `MOD` | Cast column to output type if needed + broadcast | +| `MUL` | DECIMAL128 promotion + broadcast | +| Other fixed_point ops | Broadcast fallback | + +**Unchanged:** + +- Column-vs-column → direct `cudf::binary_operation` +- `DIV` → `decimalDivide` custom kernel (column/column or column/scalar overload) +- Non-decimal types → direct `cudf::binary_operation` +- Divide-by-zero on scalar divisor → `decimalScalarIsZero` + throw before kernel + +#### 2. `BetweenFunction::eval` (`ExpressionEvaluator.cpp`) + +When `minLiteral_` or `maxLiteral_` is present and the value column is `fixed_point`: + +- `GREATER_EQUAL` / `LESS_EQUAL` via `binaryOpColumnWithBroadcastScalar` +- Final `LOGICAL_AND` remains column-vs-column + +#### 3. `GreatestLeastFunction::eval` (`ExpressionEvaluator.cpp`) + +- Column-only reduction loop: column-vs-column (unchanged) +- Final folded literal step: cast lhs/scalar to `type_` if needed, then `binaryOpColumnWithBroadcastScalar` with `NULL_MAX` / `NULL_MIN` + +This is the **primary** sanitizer trigger (`decimalGreatestLeastMixed`). + +#### 4. `scatterNullsAtZeroDivisor` (`DecimalExpressionKernels.cpp`) + +Zero-check uses broadcast for `EQUAL` column-vs-scalar: + +```cpp +auto divisorIsZero = binaryOpColumnWithBroadcastScalar( + divisor, *zeroScalar, cudf::binary_operator::EQUAL, ...); +``` + +`EQUAL` is not null-aware in the cuDF kernel and likely did not need this for correctness, but broadcast avoids any risk and sidesteps stream corruption when null-aware ops ran earlier in the same expression. + +### Repro target + +`velox/cudf-binop-scalar-repro/` — minimal cuDF repro + sanitizer harness. Wired in `velox/CMakeLists.txt` when `VELOX_ENABLE_CUDF=ON`. + +--- + +## Design note: breadth vs narrow fix + +Analysis showed only **null-aware** cuDF ops (`NULL_MAX`, `NULL_MIN`, `NULL_EQUALS`, …) actually read scalar validity in the kernel. `ADD`/`SUB`/`MUL`/`MOD`/comparisons/`EQUAL` are safe on the native column-vs-scalar path. + +The current implementation broadcasts **all** decimal column-vs-scalar binops for simplicity and defensive coverage. A future refinement could: + +1. Restrict broadcast to null-aware ops only (`GreatestLeastFunction` is mandatory). +2. Revert `BinaryFunction` arithmetic/comparison paths to native cuDF once upstream is fixed. +3. Replace `scatterNullsAtZeroDivisor` broadcast with a tiny custom CUDA kernel (`out_null[i] |= (divisor[i] == 0)`). + +--- + +## Verification + +### cuDF repro (confirms upstream bug) + +```bash +cmake --build /opt/velox-build/release --target cudf_nullmax_scalar_repro -j + +compute-sanitizer --tool memcheck \ + /opt/velox-build/release/velox/cudf-binop-scalar-repro/cudf_nullmax_scalar_repro \ + --only null_max +``` + +Expect OOB on `null_max` / `null_min` / `null_equals`; clean on `equal` / `add`. + +### Velox expression tests (confirms workaround) + +```bash +velox_cudf_decimal_expression_test --gtest_filter='CudfDecimalTest.decimalGreatestLeast*' + +compute-sanitizer --tool memcheck \ + velox_cudf_decimal_expression_test \ + --gtest_filter='CudfDecimalTest.decimalGreatestLeastMixed' +``` + +Additional tests that were failing before the workaround: + +```bash +velox_cudf_decimal_expression_test --gtest_filter='\ +CudfDecimalTest.decimalBinaryNullPropagation:\ +CudfDecimalTest.decimalDivide*:\ +CudfDecimalTest.decimalArithmeticWithScalar*:\ +CudfDecimalBinaryTest.cpuGpuMatch/div_*' +``` + +--- + +## Long-term cleanup (after cuDF fix) + +1. Drop broadcast wrappers for ops that cuDF fixes in `scalar_as_column_view`. +2. Keep `castDecimalScalar` — still needed for Velox decimal scale alignment. +3. Re-run sanitizer on repro + full `velox_cudf_decimal_expression_test`. +4. Optionally narrow broadcast scope per [Design note](#design-note-breadth-vs-narrow-fix). + +--- + +## File inventory (current tree) + +| Artifact | Status | +|----------|--------| +| `CudfBinaryOpUtils.h` | Present | +| `CudfBinaryOpUtils.cpp` | Present, in CMakeLists | +| `ExpressionEvaluator.cpp` | Uses broadcast helpers; no duplicate `castDecimalScalar` | +| `DecimalExpressionKernels.cpp` | `scatterNullsAtZeroDivisor` uses broadcast | +| `cudf-binop-scalar-repro/` | Present, in `velox/CMakeLists.txt` | + +**Velox cuDF pin:** `VELOX_cudf_COMMIT d09d10d` (26.06) — see `CMake/resolve_dependency_modules/cudf.cmake`. + +--- + +## References + +- Sanitizer log: `vcdet.log` (repo root) — `NullMax`, `fixed_point`, 4-byte read, 1-byte allocation +- cuDF bug site: `rapidsai/cudf/cpp/src/binaryop/compiled/binary_ops.cu` (`scalar_as_column_view`) +- Upstream fix plan: `PLAN-CUDF-UPSTREAM-FIX.md` diff --git a/velox/cudf-binop-scalar-repro/README.md b/velox/cudf-binop-scalar-repro/README.md new file mode 100644 index 00000000000..0cb2e2172ed --- /dev/null +++ b/velox/cudf-binop-scalar-repro/README.md @@ -0,0 +1,97 @@ +# cuDF column-vs-scalar fixed_point binary op repro + +Minimal standalone program that triggers the cuDF bug in +`scalar_as_column_view` (`cpp/src/binaryop/compiled/binary_ops.cu`): scalar +validity is a single device `bool`, but null-aware compiled kernels treat it as +a column bitmask and `bit_is_set` reads 4 bytes from a 1-byte allocation. + +This matches the failure seen in Velox `greatest`/`least` with decimal literals +(`NULL_MAX` / `NULL_MIN` column-vs-scalar). + +See also: + +- `PLAN-CUDF-UPSTREAM-FIX.md` — root cause and upstream fix proposal +- `PLAN-VELOX-WORKAROUND.md` — Velox broadcast workaround (implemented) + +## Build + +### In-tree (recommended) + +From an existing Velox release build with `VELOX_ENABLE_CUDF=ON` (reconfigure +if this directory was added after the last `cmake` run): + +```bash +cmake --build /opt/velox-build/release --target cudf_nullmax_scalar_repro -j +``` + +Binary path: + +```text +/opt/velox-build/release/velox/cudf-binop-scalar-repro/cudf_nullmax_scalar_repro +``` + +### Standalone (optional) + +Configure from this directory; point `CMAKE_PREFIX_PATH` at a Velox build tree +that already fetched cuDF: + +```bash +cd velox/cudf-binop-scalar-repro +cmake -S . -B build \ + -DCMAKE_PREFIX_PATH=/opt/velox-build/release \ + -DCMAKE_CUDA_ARCHITECTURES=80 +cmake --build build -j +./build/cudf_nullmax_scalar_repro +``` + +## Run + +```bash +/opt/velox-build/release/velox/cudf-binop-scalar-repro/cudf_nullmax_scalar_repro +``` + +Run a single case: + +```bash +.../cudf_nullmax_scalar_repro --only null_max +``` + +Available `--only` values: `null_max`, `null_min`, `null_equals`, `equal`, `add`. + +## compute-sanitizer + +Null-aware cases should report an invalid global read; controls should be clean: + +```bash +compute-sanitizer --tool memcheck \ + /opt/velox-build/release/velox/cudf-binop-scalar-repro/cudf_nullmax_scalar_repro + +compute-sanitizer --tool memcheck \ + /opt/velox-build/release/velox/cudf-binop-scalar-repro/cudf_nullmax_scalar_repro \ + --only null_max +``` + +Expected sanitizer signature (abridged): + +```text +Invalid __global__ read of size 4 bytes +... NullMax ... fixed_point ... +Access ... is out of bounds +and is inside the nearest allocation ... of size 1 bytes +``` + +## Cases + +| Case | Op | Sanitizer | +|------|-----|-----------| +| `null_max` | `NULL_MAX` column vs scalar | OOB (bug) | +| `null_min` | `NULL_MIN` column vs scalar | OOB (bug) | +| `null_equals` | `NULL_EQUALS` column vs scalar | OOB (bug) | +| `equal` | `EQUAL` column vs scalar | Clean (control) | +| `add` | `ADD` column vs scalar | Clean (control) | + +## Upstream fix direction + +In cuDF, avoid casting `scalar::validity_data()` (`bool*`) to `bitmask_type*` +when building the temporary `column_view` for compiled binops; materialize a +real 1-bit null mask or branch on `is_*_scalar` in `ops_wrapper`. diff --git a/velox/cudf-binop-scalar-repro/cursor_contents_of_vcdet_log6.md b/velox/cudf-binop-scalar-repro/cursor_contents_of_vcdet_log6.md new file mode 100644 index 00000000000..14c0270b79a --- /dev/null +++ b/velox/cudf-binop-scalar-repro/cursor_contents_of_vcdet_log6.md @@ -0,0 +1,1232 @@ +# Contents of vcdet.log +_Exported on 6/2/2026 at 14:13:26 PDT from Cursor (3.5.38)_ + +--- + +**User** + +Give me your opinion on the contents of file vcdet.log in this workspace + +--- + +**Cursor** + +Reading `vcdet.log` to understand what it contains. + + +## What `vcdet.log` is + +This is a **NVIDIA Compute Sanitizer** run of `velox_cudf_decimal_expression_test` (60 tests across 2 suites). The filename fits “**V**elox **C**u**D** **E**xpression **T**est” under sanitizer. It is useful diagnostic output, not something you would normally commit (it is untracked in git). + +## High-level verdict + +**This log documents a real GPU memory bug, not a flaky test or sanitizer noise.** The run did not finish cleanly: it crashed mid-suite with **15 sanitizer errors** and `ERROR SUMMARY: 15 errors`. + +| Phase | Outcome | +|--------|---------| +| Tests 1–32 (`CudfDecimalTest`) | All reported `[ OK ]` | +| Test 33: `decimalGreatestLeastMixed` | OOB device read → kernel failure → abort | +| Remaining ~27 tests | Never ran | + +So the sanitizer did its job: it caught something that a normal gtest run might surface only as a vague `cudaErrorLaunchFailure`. + +## The smoking gun + +The first failure happens right when `decimalGreatestLeastMixed` starts (after `decimalGreatestLeastAllColumns` passed): + +``` +Invalid __global__ read of size 4 bytes +... +cudf::binops::compiled::ops::NullMax +numeric::fixed_point // decimal64 +Access at 0x327e000000 is out of bounds +and is inside the nearest allocation at 0x327e000000 of size 1 bytes +``` + +Host stack (trimmed): + +1. `GreatestLeastFunction::eval` in `velox_cudf_decimal_expression_test` +2. `cudf::binary_operation(column, scalar, NULL_MAX)` in libcudf +3. `CudfFilterProject::doGetOutput` → Velox `Driver` + +That lines up with the test, which mixes **columns** `a`, `b` with **literal decimals** folded into a scalar: + +```1437:1458:velox/experimental/cudf/tests/DecimalExpressionTest.cpp +TEST_F(CudfDecimalTest, decimalGreatestLeastMixed) { + ... + .project( + {"greatest(a, CAST('5.00' AS DECIMAL(10, 2)), b) AS g", + "least(a, CAST('0.00' AS DECIMAL(10, 2)), b) AS l"}) +``` + +`GreatestLeastFunction` folds constants on the host, then applies them via `cudf::binary_operation(lhs, *foldedScalar_, op_, ...)`: + +```1033:1039:velox/experimental/cudf/expression/ExpressionEvaluator.cpp + if (foldedScalar_) { + cudf::column_view lhs = + result ? result->view() : asView(inputColumns[order_[0]]); + result = + cudf::binary_operation(lhs, *foldedScalar_, op_, type_, stream, mr); + } +``` + +**Pattern:** column-only `greatest`/`least` passes; column + folded scalar fails. That strongly points at the **(decimal column, decimal scalar, NULL_MAX/NULL_MIN)** path, not generic decimal arithmetic (many earlier tests already passed). + +The **“nearest allocation … size 1 bytes”** detail is especially telling: the kernel is treating something like a full column (4-byte reads, multiple threads) while the backing allocation looks like a **1-byte** buffer—classic scalar vs column metadata mismatch, or a bad validity/null mask size. + +## Cascade (secondary, not root cause) + +After the OOB read you get a predictable chain: + +- `cudaErrorLaunchFailure` on `cudaStreamSynchronize`, `cudaFreeAsync`, `cudaGetLastError` +- Velox: `Operator::getOutput failed for CudfFilterProject` +- `terminate()` → `rmm::cuda_error` during `CudfFromVelox::doClose()` stream sync +- SIGABRT, process did not terminate successfully + +Those later errors are **symptoms** of the bad kernel launch, not separate bugs. + +## Where to look (opinion on blame) + +I would investigate in this order: + +1. **Velox `GreatestLeastFunction`** — how `foldedScalar_` is built for `DECIMAL` and whether `binary_operation(column, scalar)` is called with the right `type_` and column views after partial reduction. +2. **cuDF** — compiled `NullMax`/`NullMin` for `fixed_point` with a scalar RHS (possible upstream bug; worth a minimal libcudf repro). +3. **Not the test assertion logic** — the crash happens during GPU execution of the cuDF path, before `assertEqualVectors` would matter. + +`decimalGreatestLeastWithNulls` (next in the file) likely exercises similar NULL_MAX semantics; it may fail the same way once this is fixed. + +## Practical takeaways + +- **Treat this as blocking** for GPU decimal `greatest`/`least` with literals until fixed; sanitizer is right to flag it. +- **Do not commit** `vcdet.log` (large, environment-specific, no long-term value). +- **Repro command** (conceptually): run `decimalGreatestLeastMixed` under `compute-sanitizer --tool memcheck` on `velox_cudf_decimal_expression_test`. +- One disabled test in the same run (`DISABLED_decimalCompareDifferentScales`) is unrelated; the failure is specifically on the mixed column+constant greatest/least path. + +If you want, I can trace `createCudfScalar` / decimal scalar layout in Agent mode and narrow whether the bug is on the Velox side, the cuDF side, or both. + +--- + +**User** + +Go ahead. Be aware that the build environment is in a container, so I will have to build and run manually for you. + +--- + +**Cursor** + +Tracing the decimal scalar path in `GreatestLeastFunction` and related code to pinpoint the bug. + + +Implementing a fix: cuDF's compiled `NULL_MAX`/`NULL_MIN` column-scalar path OOB-reads for decimals. Broadcasting the scalar to a column (as elsewhere in this codebase) avoids that path. + + +## Root cause + +The crash is in cuDF’s **compiled `NULL_MAX`/`NULL_MIN` column-vs-scalar** path for `fixed_point` (decimal). The sanitizer showed a 4-byte read against a **1-byte allocation** — consistent with the kernel treating a scalar validity buffer like a column null mask across multiple rows. + +Velox’s `GreatestLeastFunction` was calling: + +```cpp +cudf::binary_operation(lhs, *foldedScalar_, op_, type_, stream, mr); +``` + +That path works for plain arithmetic (e.g. `decimalArithmeticWithScalarRight`), but not for `NULL_MAX`/`NULL_MIN` with decimals. Column-only `greatest(a, b, c)` passed; `greatest(a, CAST('5.00' ...), b)` failed because it hits the column+scalar path. + +Scalar creation in `makeScalarFromValue` / `createCudfScalar` looks correct (scale sign, stream sync). The bug is not in how Velox builds the scalar. + +## Fix applied + +In `ExpressionEvaluator.cpp`, added `binaryOpColumnWithBroadcastScalar` which: + +1. Aligns decimal types via `castDecimalScalar` when scales/types differ (same pattern as `BinaryFunction`) +2. Broadcasts the scalar with `make_column_from_scalar(..., lhs.size(), ...)` +3. Calls `binary_operation` column-vs-column + +This matches patterns already used in `StringPatternPredicateFunction`, `CoalesceFunction`, etc. + +## How to verify in your container + +Rebuild the decimal expression test target, then run the failing test under compute-sanitizer: + +```bash +# rebuild (adjust paths to your container build dir) +cmake --build /opt/velox-build/release --target velox_cudf_decimal_expression_test -j + +# quick repro +/opt/velox-build/release/velox/experimental/cudf/tests/velox_cudf_decimal_expression_test \ + --gtest_filter='CudfDecimalTest.decimalGreatestLeastMixed' + +# full sanitizer run (what produced vcdet.log) +compute-sanitizer --tool memcheck \ + /opt/velox-build/release/velox/experimental/cudf/tests/velox_cudf_decimal_expression_test \ + --gtest_filter='CudfDecimalTest.decimalGreatestLeast*' +``` + +Expected after the fix: +- `decimalGreatestLeastMixed` completes with `[ OK ]` +- No `Invalid __global__ read` / `cudaErrorLaunchFailure` +- `decimalGreatestLeastWithNulls` should also run cleanly (column-only, but good regression coverage) + +If you want, paste the post-fix sanitizer output and I can confirm it’s clean. + +--- + +**User** + +Examine the rest of the code in velox/experimental/cudf/expression and velox/experimental/cudf/tests to see if there are any other instances of the incorrect code + +--- + +**Cursor** + +Searching for column-vs-scalar `binary_operation` calls and similar patterns across the cudf expression and test code. + + +--- + +**User** + +I suspect there is a more general problem with decimal DIVISION involving scalars. Several other tests are failing with incorrect results perhaps due to a failed kernel launch, as follows: +CudfDecimalTest.decimalBinaryNullPropagation +CudfDecimalTest.decimalDivideRounds +CudfDecimalTest.decimalDivideByZero +CudfDecimalTest.decimalDivideDifferentScales +CudfDecimalTest.decimalArithmeticWithScalarRight +CudfDecimalTest.decimalArithmeticWithScalarLeft +DecimalBinaryOps/CudfDecimalBinaryTest.cpuGpuMatch/div_same_scale +DecimalBinaryOps/CudfDecimalBinaryTest.cpuGpuMatch/div_diff_scale +DecimalBinaryOps/CudfDecimalBinaryTest.cpuGpuMatch/div_64to128 +DecimalBinaryOps/CudfDecimalBinaryTest.cpuGpuMatch/div_128 +DecimalBinaryOps/CudfDecimalBinaryTest.cpuGpuMatch/div_128_diff + +--- + +**Cursor** + +Tracing decimal division paths for column-vs-scalar cases and comparing them to the failing tests. + + +Your suspicion points in the right direction, but the failing tests split into **two different mechanisms**. Most of them are **not** scalar-division kernel failures. + +## What the failing tests actually exercise + +| Test | Divide form | Post-divide path | +|------|-------------|------------------| +| `decimalBinaryNullPropagation` | `a / b` (column/column) | `scatterNullsAtZeroDivisor` | +| `decimalDivideRounds` | column/column | `scatterNullsAtZeroDivisor` | +| `decimalDivideByZero` | column/column | **`scatterNullsAtZeroDivisor` (required for correctness)** | +| `decimalDivideDifferentScales` | column/column | `scatterNullsAtZeroDivisor` | +| `div_same_scale`, `div_diff_scale`, `div_64to128`, `div_128`, `div_128_diff` | column/column | `scatterNullsAtZeroDivisor` | +| `decimalArithmeticWithScalarRight` | `a / CAST('2.00' ...)` (column/scalar) | custom kernel only, **no** scatter | +| `decimalArithmeticWithScalarLeft` | `CAST(...) / a` (scalar/column) | custom kernel + `scatterNullsAtZeroDivisor` | + +So **8 of 11** failures are column/column divide. Only 2 are truly scalar-involved; scalar-right is a distinct code path. + +--- + +## Primary bug: `scatterNullsAtZeroDivisor` uses column-vs-scalar `EQUAL` + +Every column/column divide ends here: + +```149:150:velox/experimental/cudf/expression/DecimalExpressionKernels.cpp + // Scatter nulls where divisor is zero. + return scatterNullsAtZeroDivisor(std::move(out), rhs, stream, mr); +``` + +Inside `scatterNullsAtZeroDivisor`: + +```92:99:velox/experimental/cudf/expression/DecimalExpressionKernels.cpp + auto divisorIsZero = cudf::binary_operation( + divisor, + *zeroScalar, + cudf::binary_operator::EQUAL, + cudf::data_type{cudf::type_id::BOOL8}, + stream, + mr); +``` + +That is the **same class of cuDF compiled column-vs-scalar `fixed_point` bug** as `NULL_MAX` in `greatest`/`least`: the lhs is a multi-row column, the rhs is a 1-row scalar, and cuDF’s compiled kernel can OOB-read the scalar validity buffer. + +This runs even when there are **no zero divisors** (`decimalDivideRounds`, `decimalBinaryNullPropagation`), so you can get: +- corrupted `div` results, +- a poisoned CUDA stream affecting other projected columns in the same batch (`sum`, `eq`, etc.), +- or a hard sanitizer abort. + +That explains why `decimalBinaryNullPropagation` fails even though it never divides by zero. + +--- + +## Secondary path: scalar-right divide (custom kernel, different issue) + +Column/scalar-right goes through Velox’s custom kernel, **not** cuDF `binary_operation`: + +```153:193:velox/experimental/cudf/expression/DecimalExpressionKernels.cpp +std::unique_ptr decimalDivide( + const cudf::column_view& lhs, + const cudf::scalar& rhs, + ... +) { + ... + detail::launchDecimalDivideColumnRhsScalar(...); + return out; // no scatterNullsAtZeroDivisor +} +``` + +Called from `BinaryFunction`: + +```584:593:velox/experimental/cudf/expression/ExpressionEvaluator.cpp + if (op_ == cudf::binary_operator::DIV && cudf::is_fixed_point(type_)) { + if (decimalScalarIsZero(*right_, stream)) { + VELOX_USER_FAIL("Division by zero"); + } + ... + return decimalDivide(lhsView, *right_, type_, aRescale, stream, mr); + } +``` + +If `decimalArithmeticWithScalarRight` fails while ADD/SUB/MUL/MOD in the same test pass, the bug is likely in **`launchDecimalDivideColumnRhsScalar`** or **`aRescale`/scale handling**, not the cuDF column-scalar bug. + +If the **whole** test fails, it may still be stream corruption from another op—or from an earlier test if the suite doesn’t reset GPU state cleanly. + +Scalar-left divide (`decimalArithmeticWithScalarLeft`) uses the scalar/column overload, which **does** call `scatterNullsAtZeroDivisor` on the rhs column—so it hits the same `EQUAL` column-scalar bug. + +--- + +## Other column-vs-scalar decimal `binary_operation` calls (not yet fixed) + +`binaryOpColumnWithBroadcastScalar` exists only for `GreatestLeastFunction`. **`BinaryFunction` still calls cuDF column-vs-scalar directly** for decimal comparisons and arithmetic: + +```612:617:velox/experimental/cudf/expression/ExpressionEvaluator.cpp + if (right_->type() != targetType) { + auto rhsScalar = castDecimalScalar(*right_, targetType, stream, mr); + return cudf::binary_operation( + lhsView, *rhsScalar, op_, type_, stream, mr); + } + return cudf::binary_operation(lhsView, *right_, op_, type_, stream, mr); +``` + +Same pattern for ADD/SUB/MOD/MUL column-scalar paths (lines 628–658, 661–662, etc.). + +`BetweenFunction` also uses column-vs-scalar comparisons with literals: + +```941:965:velox/experimental/cudf/expression/ExpressionEvaluator.cpp + geResultColumn = cudf::binary_operation( + asView(inputColumns[0]), + *minLiteral_, + cudf::binary_operator::GREATER_EQUAL, + ... + leResultColumn = cudf::binary_operation( + asView(inputColumns[0]), + *maxLiteral_, + cudf::binary_operator::LESS_EQUAL, + ... +``` + +These are latent sanitizer/correctness risks under the same cuDF limitation, even if not in your current failure list. + +There is also unused dead code with the same pattern: + +```109:115:velox/experimental/cudf/expression/ExpressionEvaluator.cpp + auto equals = cudf::binary_operation( + col, + *zero, + cudf::binary_operator::EQUAL, + ... +``` + +(`hasDecimalZero` is never called.) + +--- + +## Recommended fix strategy (conceptual) + +1. **Fix `scatterNullsAtZeroDivisor` first** — broadcast the zero scalar to a column before `EQUAL`, or avoid cuDF entirely (host check for scalar divisor; custom GPU kernel/bitmap for column divisor). This should unblock most of your listed failures. + +2. **Route all decimal column-vs-scalar cuDF ops through `binaryOpColumnWithBroadcastScalar`** (or equivalent) in `BinaryFunction` and `BetweenFunction`, not just `greatest`/`least`. + +3. **Re-evaluate scalar-right divide separately** if `div_r` still fails after (1): inspect `aRescale` and `launchDecimalDivideColumnRhsScalar`; that path intentionally skips zero scattering because scalar zero throws upfront. + +4. **Optional robustness**: have divide kernels skip writing rows where the combined null mask is unset, so garbage in null slots cannot leak if a downstream step ignores masks. + +--- + +## How to confirm in your container + +Run these in isolation under compute-sanitizer: + +```bash +# Should hit scatterNullsAtZeroDivisor (column-scalar EQUAL) +--gtest_filter='CudfDecimalTest.decimalDivideByZero' + +# Scalar-right only; should NOT hit scatterNullsAtZeroDivisor +--gtest_filter='CudfDecimalTest.decimalArithmeticWithScalarRight' + +# Scalar-left; hits scatterNullsAtZeroDivisor on rhs column +--gtest_filter='CudfDecimalTest.decimalArithmeticWithScalarLeft' +``` + +If sanitizer fires in `scatterNullsAtZeroDivisor` → `binary_operation` → `EQUAL` for the first and third, but not for scalar-right divide, that confirms the split above. + +--- + +**Bottom line:** the failures are less “decimal division involving scalars” in general, and more **“decimal division always post-processes with a column-vs-scalar cuDF op that is broken for `fixed_point`.”** Scalar-right divide is a separate, smaller surface. Fixing `scatterNullsAtZeroDivisor` plus systematically broadcasting scalars for decimal `binary_operation` should address the bulk of what you’re seeing. + +I’m in Ask mode, so I can’t apply those changes here—switch to Agent mode if you want me to implement them. + +--- + +**User** + +Please go ahead and implement the fix. Please undo or generalize the earlier fix for GreatestLeastFunction if appropriate. + +--- + +**Cursor** + +Generalizing the scalar-broadcast helper and applying it across decimal binary ops and `scatterNullsAtZeroDivisor`. Reviewing the codebase to plan the changes. + + +Creating shared binary-op utilities and applying them across the codebase. + + +Implemented a generalized fix for cuDF’s broken decimal column-vs-scalar `binary_operation` path. + +## What changed + +### New shared utilities: `CudfBinaryOpUtils.h` / `.cpp` +- **`castDecimalScalar`** — moved out of `ExpressionEvaluator.cpp` +- **`binaryOpColumnWithBroadcastScalar`** — broadcasts scalar → column, then column/column op +- **`binaryOpScalarWithBroadcastColumn`** — symmetric scalar-on-left path + +### `scatterNullsAtZeroDivisor` (`DecimalExpressionKernels.cpp`) +Zero-divisor detection now uses broadcast `EQUAL` instead of column-vs-scalar `EQUAL`. This was the main cause of divide test failures. + +### `BinaryFunction` (`ExpressionEvaluator.cpp`) +All decimal column-vs-scalar paths (comparisons, ADD/SUB/MOD/MUL, fallbacks) use the broadcast helpers. Scalar-left paths use `binaryOpScalarWithBroadcastColumn`. + +### `BetweenFunction` +Decimal `BETWEEN` with literal bounds uses broadcast for `>=` / `<=`. + +### `GreatestLeastFunction` +Uses the shared helper plus explicit decimal type alignment before broadcast (replacing the earlier inline-only fix). + +### Cleanup +Removed unused `hasDecimalZero` and the duplicate helper definitions from `ExpressionEvaluator.cpp`. + +--- + +## Rebuild & verify (in your container) + +```bash +cmake --build /opt/velox-build/release --target velox_cudf_decimal_expression_test -j + +/opt/velox-build/release/velox/experimental/cudf/tests/velox_cudf_decimal_expression_test \ + --gtest_filter='CudfDecimalTest.decimalDivide*:CudfDecimalTest.decimalBinaryNullPropagation:CudfDecimalTest.decimalArithmeticWithScalar*:CudfDecimalTest.decimalGreatestLeast*:DecimalBinaryOps/CudfDecimalBinaryTest.cpuGpuMatch/div*' + +# Optional sanitizer spot-check +compute-sanitizer --tool memcheck \ + /opt/velox-build/release/velox/experimental/cudf/tests/velox_cudf_decimal_expression_test \ + --gtest_filter='CudfDecimalTest.decimalDivideByZero:CudfDecimalTest.decimalGreatestLeastMixed' +``` + +--- + +**User** + +Do you believe there is an underlying bug in cudf? I would rather not having to convert every scalar to a column as that is very inefficient + +--- + +**Cursor** + +Yes — the sanitizer evidence points to a cuDF bug, not a Velox scalar-construction issue. Checking how cuDF is used in this repo and whether safer alternatives exist. + + +Yes — I believe there is an underlying **cuDF bug**, not a Velox scalar-construction bug. The broadcast workaround is a reasonable short-term mitigation, but it is broader and more expensive than it needs to be. + +## Why this looks like cuDF + +The sanitizer signature from `vcdet.log` is very specific: + +- Kernel: `cudf::binops::compiled::ops::NullMax` on `numeric::fixed_point` (decimal64) +- API path: `cudf::binary_operation(column, scalar, …)` +- Error: **4-byte read** against a **1-byte allocation** at the scalar’s address + +That pattern strongly suggests the compiled kernel is iterating over column rows but indexing into a **scalar validity/value buffer** as if it were a column null mask or column data. That is not something Velox can cause by building the scalar wrong — especially since `makeScalarFromValue` already syncs the stream after scalar allocation. + +The same failure mode fits `scatterNullsAtZeroDivisor`, which used: + +```cpp +cudf::binary_operation(divisor, *zeroScalar, EQUAL, BOOL) +``` + +That explains divide failures even for **column/column** divide: the divide kernel itself is Velox’s custom code; the post-pass zero check hits cuDF’s broken column-vs-scalar path. + +Velox pins **cuDF 26.06** (`CMake/resolve_dependency_modules/cudf.cmake`), so this is worth reporting upstream with a minimal repro against that version. + +## Is *every* column-vs-scalar decimal op broken? + +Probably **not all of them**. The evidence points at cuDF’s **compiled** binop path for `fixed_point`, and likely specific ops such as: + +- `NULL_MAX` / `NULL_MIN` +- `EQUAL` (and possibly other comparisons) + +Arithmetic ops (`ADD`, `SUB`, `MUL`, `MOD`) may use different compiled specializations that happen to work. That matters because the current Velox fix broadcasts for **all** decimal column-vs-scalar ops in `BinaryFunction`, which is likely overkill. + +Your custom divide path already avoids cuDF for the actual division: + +```153:193:velox/experimental/cudf/expression/DecimalExpressionKernels.cpp +// column/scalar divide uses launchDecimalDivideColumnRhsScalar, not cudf::binary_operation +``` + +So “decimal division involving scalars” failures were mostly from **`scatterNullsAtZeroDivisor`**, not from the divide kernel itself. + +## Why broadcast is inefficient + +`make_column_from_scalar` + column/column op costs: + +- **O(n) extra device memory** per op +- An extra **fill/broadcast kernel** +- Higher bandwidth and worse cache behavior vs a true scalar kernel + +For batch sizes of millions of rows and expressions with many literal folds (`greatest(a, 5.00, b)`, `BETWEEN`, zero checks after divide), that adds up. + +## Better strategies (in order of preference) + +**1. Fix upstream in cuDF (best)** +Minimal repro: + +```cpp +// column of DECIMAL(10,2), n rows +// scalar constant +cudf::binary_operation(col, scalar, NULL_MAX); // or EQUAL +``` + +Run under compute-sanitizer. Once fixed, Velox can drop most broadcasts. + +**2. Targeted workarounds only for known-broken ops** +Keep broadcast (or alternatives below) for `NULL_MAX`, `NULL_MIN`, and `EQUAL` on decimals. Leave `ADD`/`SUB`/`MUL`/`MOD` on the native column-scalar path if tests/sanitizer confirm they’re safe. + +**3. Custom kernels where you already have infrastructure** +You already do this for decimal divide in `DecimalExpressionKernelsGpu.cu`. Same pattern for: + +- `scatterNullsAtZeroDivisor`: a small kernel `out_null_mask[i] |= (divisor[i] == 0)` — no cuDF binop, no broadcast +- `greatest`/`least` with literals: fold constants on host, then one custom null-aware max/min kernel + +**4. Host-side fast paths when the scalar is truly scalar** +Examples: + +- **Scalar divisor zero**: `decimalScalarIsZero` on host → throw or all-null column (you already do this for `column / scalar`) +- **Single literal in `greatest`/`least`**: compare on host, no GPU op needed for that step + +**5. cuDF AST expressions instead of compiled binops** +Velox already has an AST path (`AstExpressionUtils.h`, `JitExpression`). Whether AST avoids the compiled column-scalar bug depends on cuDF’s AST evaluator — worth checking, but not guaranteed. + +## Practical recommendation + +| Approach | Verdict | +|----------|---------| +| Broadcast everywhere (current fix) | Safe, but likely **over-scoped and inefficient** | +| Upstream cuDF fix | **Correct long-term fix** | +| Narrow broadcast to broken ops only | Good **medium-term** compromise | +| Custom kernels / host shortcuts for hot paths | Best **Velox-side** performance if upstream is slow | + +I would **file a cuDF issue** with the sanitizer stack trace and a 10-line repro, and in Velox **narrow the workaround** to the ops that are actually broken (`EQUAL`, `NULL_MAX`, `NULL_MIN`), replacing `scatterNullsAtZeroDivisor`’s GPU `EQUAL` with a tiny custom kernel so divide doesn’t pay the broadcast tax at all. + +If you want, I can outline the exact minimal cuDF repro and a narrower Velox patch plan (in Agent mode I could implement the targeted version). + +--- + +**User** + +I have provided the cudf source code at /home/seves/work/rapidsai/cudf. Please examine the related code paths to see if you can find the bug. + +--- + +**Cursor** + +Searching the cuDF source for the compiled column-vs-scalar fixed_point binary op path. + + +Yes — there is a real cuDF bug, and it explains the sanitizer signature exactly. It is **not** a Velox scalar-construction issue. The broadcast workaround avoids it, but it is broader than necessary. + +## Root cause + +In `scalar_as_column_view`, cuDF wraps a scalar as a 1-row `column_view` for the compiled binary-op kernels: + +```45:49:/home/seves/work/rapidsai/cudf/cpp/src/binaryop/compiled/binary_ops.cu + auto col_v = column_view(s.type(), + 1, + h_scalar_type_view.data(), + reinterpret_cast(s.validity_data()), + !s.is_valid(stream)); +``` + +The problem: **scalar validity is a single device `bool`, not a column bitmask.** + +From `cudf::scalar`: + +```90:90:/home/seves/work/rapidsai/cudf/cpp/include/cudf/scalar/scalar.hpp + cudf::detail::device_scalar _is_valid; ///< Device bool signifying validity +``` + +But `column_view` treats `_null_mask` as a **`bitmask_type*`**, and validity checks use `bit_is_set`, which reads a full 32-bit word: + +```102:106:/home/seves/work/rapidsai/cudf/cpp/include/cudf/utilities/bit.hpp +CUDF_HOST_DEVICE inline bool bit_is_set(bitmask_type const* bitmask, size_type bit_index) +{ + assert(nullptr != bitmask); + return bitmask[word_index(bit_index)] & (bitmask_type{1} << intra_word_index(bit_index)); +} +``` + +So when a null-aware kernel calls `rhs.is_valid(0)` on a scalar-as-column, it does a **4-byte read from a 1-byte `bool` allocation** — exactly what compute-sanitizer reported (`size 1 bytes`, `read of size 4 bytes`). + +## Where it blows up in the kernel + +Null-aware ops (`NULL_MAX`, `NULL_MIN`, `NULL_EQUALS`, etc.) explicitly query operand validity in `ops_wrapper`: + +```112:124:/home/seves/work/rapidsai/cudf/cpp/src/binaryop/compiled/binary_ops.cuh + if constexpr (cuda::std::is_same_v or + ... + cuda::std::is_same_v or + cuda::std::is_same_v) { + bool output_valid = false; + auto result = BinaryOperator{}.template operator()( + x, + y, + lhs.is_valid(is_lhs_scalar ? 0 : i), + rhs.is_valid(is_rhs_scalar ? 0 : i), + output_valid); +``` + +The vector-scalar entry point correctly sets `is_rhs_scalar = true`: + +```373:380:/home/seves/work/rapidsai/cudf/cpp/src/binaryop/compiled/binary_ops.cu +void binary_operation(mutable_column_view& out, + column_view const& lhs, + scalar const& rhs, + binary_operator op, + rmm::cuda_stream_view stream) +{ + auto [rhsv, aux] = scalar_to_column_view(rhs, stream); + operator_dispatcher(out, lhs, rhsv, false, true, op, stream); +} +``` + +Index selection is correct (`0` for scalar), but **`is_valid(0)` still uses the bitmask path** on memory that is only a `bool`. + +## Why some ops work and others do not + +| Op class | Reads scalar validity in kernel? | Column-scalar fixed_point | +|----------|----------------------------------|---------------------------| +| `ADD`, `SUB`, `MUL`, `DIV`, `MOD`, `LESS`, `GREATER`, `EQUAL`, … | No — only values; output null mask computed on host | Works (cuDF has 1000-row tests) | +| `NULL_MAX`, `NULL_MIN`, `NULL_EQUALS`, `NULL_LOGICAL_*` | Yes — calls `is_valid()` per row | **Broken OOB read** | + +That matches your experience: +- `greatest`/`least` → `NULL_MAX`/`NULL_MIN` → hits the bug +- `decimalArithmeticWithScalarRight` with `+`, `-`, `*`, `/`, `%` → should use non-null-aware paths → cuDF tests pass +- `scatterNullsAtZeroDivisor` uses `EQUAL` → **non-null-aware** `ops::Equal` → likely does **not** hit this specific validity bug (divide failures may be a separate issue, or stream corruption from an earlier null-aware op in the same run) + +## Test gap in cuDF + +cuDF has: +- column/column `NULL_MAX` for fixed_point (`FixedPointBinaryOpNullMaxSimple`) +- column/scalar `ADD`/`DIV`/`MOD` with 1000 rows +- column/scalar comparisons + +It does **not** appear to have column/scalar `NULL_MAX`/`NULL_MIN` on fixed_point with multi-row columns — which is exactly the Velox `greatest(a, literal, b)` case. + +## Proper cuDF fix (no broadcast needed) + +Any of these would work: + +1. **Fix `scalar_as_column_view`** — build a real 1-bit null mask (e.g. via `make_column_from_scalar(s, 1, ...)`) instead of casting `bool*` → `bitmask_type*`. + +2. **Fix validity lookup when `is_*_scalar`** — in `ops_wrapper`, if `is_rhs_scalar`, read scalar validity as a `bool` (or pass a `scalar_device_view`) instead of `column_device_view::is_valid()`. + +3. **Host-side for valid scalars** — pass `null_mask = nullptr`, `null_count = 0` so `nullable()` is false and `is_valid()` short-circuits to true (only handles non-null scalars; null scalars still need a real mask or special casing). + +Option 1 is the smallest localized fix and matches what your Velox workaround already does implicitly. + +## Implication for Velox + +You do **not** need to broadcast scalars for all decimal column-scalar ops. A narrower strategy: + +- **Keep broadcast (or cuDF fix)** only for null-aware ops: `NULL_MAX`, `NULL_MIN`, and possibly `NULL_EQUALS` / `NULL_LOGICAL_*` if you ever use them with scalars. +- **Revert broadcast** for `ADD`/`SUB`/`MUL`/`MOD`/comparisons — cuDF’s native column-scalar path is fine and tested at scale. +- For **divide zero-check** (`scatterNullsAtZeroDivisor`), `EQUAL` is probably OK on the kernel side; a tiny custom “divisor == 0” kernel would avoid cuDF entirely and be cheaper than broadcast. + +## Minimal upstream repro to file + +```cpp +// col: DECIMAL64, N rows (N > 1) +// scalar: same type, valid constant +auto out_type = cudf::binary_operation_fixed_point_output_type( + cudf::binary_operator::NULL_MAX, col.type(), scalar->type()); +auto result = cudf::binary_operation(col, *scalar, cudf::binary_operator::NULL_MAX, out_type); +// Run under compute-sanitizer --tool memcheck +``` + +**Bottom line:** this is a cuDF bug in `scalar_as_column_view` conflating scalar `bool` validity with column bitmask validity, triggered only by null-aware compiled binops. Filing against RAPIDS/cudf with the above repro is warranted; Velox can use a much narrower workaround until upstream is fixed. + +--- + +**User** + +Please write a full implementation of the minimal reproduction, that I can build in the same container. Please create this in a new sub-directory under /home/seves/work/velox/velox (you choose the name). + +--- + +**Cursor** + +Creating a minimal cuDF repro buildable in the Velox container. Checking how the project links against cuDF. + + +Creating the repro directory and integrating it into the Velox CUDA build. + + +Created **`velox/cudf-binop-scalar-repro/`** — a minimal cuDF repro integrated into the Velox CUDA build. + +## Contents + +| File | Purpose | +|------|---------| +| `main.cpp` | Runs 5 column-vs-scalar `DECIMAL64` cases (`NULL_MAX`, `NULL_MIN`, `NULL_EQUALS`, plus `EQUAL`/`ADD` controls) | +| `CMakeLists.txt` | Builds `cudf_nullmax_scalar_repro` linked against `cudf::cudf` | +| `README.md` | Build, run, and sanitizer instructions | + +`velox/CMakeLists.txt` now adds this subdirectory when `VELOX_ENABLE_CUDF=ON`. + +## Build & run (in your container) + +Reconfigure if CMake already ran before this directory existed: + +```bash +cmake --build /opt/velox-build/release --target cudf_nullmax_scalar_repro -j +``` + +```bash +/opt/velox-build/release/velox/cudf-binop-scalar-repro/cudf_nullmax_scalar_repro +``` + +Single case: + +```bash +.../cudf_nullmax_scalar_repro --only null_max +``` + +Under compute-sanitizer: + +```bash +compute-sanitizer --tool memcheck \ + /opt/velox-build/release/velox/cudf-binop-scalar-repro/cudf_nullmax_scalar_repro +``` + +**Expected:** `null_max`, `null_min`, and `null_equals` hit the 4-byte read from a 1-byte scalar validity allocation; `equal` and `add` stay clean. That matches the `scalar_as_column_view` / `bit_is_set` analysis and is suitable to attach to a RAPIDS/cudf issue. + +--- + +**User** + +CMake Error in CMakeLists.txt: + No cmake_minimum_required command is present. A line of code such as + + cmake_minimum_required(VERSION 4.3) + + +--- + +**Cursor** + +Checking the repro's CMakeLists and how Velox configures CUDA/cuDF so we can fix the CMake error. + + +Adding `cmake_minimum_required` and standalone-build support — likely caused by running `cmake` directly in the repro directory. + + +Fixed. The repro’s `CMakeLists.txt` was missing `cmake_minimum_required`, which CMake 4.x requires when you configure that directory directly. + +It now has: +- `cmake_minimum_required(VERSION 3.30.4)` (matches cuDF) +- **In-tree** build unchanged: `cmake --build /opt/velox-build/release --target cudf_nullmax_scalar_repro -j` +- **Standalone** configure from `velox/cudf-binop-scalar-repro/`: + +```bash +cmake -S . -B build \ + -DCMAKE_PREFIX_PATH=/opt/velox-build/release \ + -DCMAKE_CUDA_ARCHITECTURES=80 +cmake --build build -j +``` + +Use your GPU’s SM version instead of `80` if needed. The README documents both paths. + +--- + +**User** + +Please save out two separate plan files from the above session. One for the full proposed changes on the Velox side to avoid the likely cudf bug. The other for your analysis and proposal for fixing the cudf bug itself, including the repro. + +--- + +**Cursor** + +Reviewing the current Velox workaround and repro so the two plan files match what's in the tree. + + +--- + +**User** + +Note that I temporarily stashed some of the Velox source code changes. + +--- + +**Cursor** + +Checking what's still in the tree versus stashed, then writing both plan files. + + +Writing both plan files, noting the stashed state and what remains in the tree. + + +Both plan files are in `velox/cudf-binop-scalar-repro/`: + +1. **[PLAN-VELOX-WORKAROUND.md](velox/cudf-binop-scalar-repro/PLAN-VELOX-WORKAROUND.md)** — narrow Velox workaround (null-aware ops only), call sites, checklist, and what to re-apply from stash vs skip +2. **[PLAN-CUDF-UPSTREAM-FIX.md](velox/cudf-binop-scalar-repro/PLAN-CUDF-UPSTREAM-FIX.md)** — cuDF root cause, fix options, repro/sanitizer steps, and upstream issue template + +**Current tree vs stash** + +| Item | Status | +|------|--------| +| `CudfBinaryOpUtils.{h,cpp}` | Still on disk | +| CMake / `ExpressionEvaluator` integration | Stashed — `GreatestLeastFunction` still uses direct `binary_operation`; local duplicate `castDecimalScalar` remains | +| `cudf-binop-scalar-repro/` | Still present (repro + both plans) | + +When you pop the stash, the Velox plan recommends applying only **Tier 1** (null-aware ops + `GreatestLeastFunction`) rather than broadcasting all decimal column-scalar paths. + +Want me to re-apply the narrow workaround next, or help review what’s in the stash first? + +--- + +**User** + +Please re-do the plan save. I have popped the stash, so the code should now all be as you left it. + +--- + +**Cursor** + +Verifying the codebase after the stash pop, then updating both plan files to match. + + +Updating both plan files to match the restored code and recreating the repro sources (only build artifacts remained). + + +Both plan files are updated to match the restored code after the stash pop. The repro sources were missing (only CMake cache/build artifacts remained), so those were recreated as well. + +**Plans** (`velox/cudf-binop-scalar-repro/`): + +| File | Contents | +|------|----------| +| `PLAN-VELOX-WORKAROUND.md` | **Implemented** workaround: `CudfBinaryOpUtils`, all call sites (`BinaryFunction`, `BetweenFunction`, `GreatestLeastFunction`, `scatterNullsAtZeroDivisor`), verification commands, optional future narrowing | +| `PLAN-CUDF-UPSTREAM-FIX.md` | cuDF root cause, fix options, repro/sanitizer steps, upstream issue template | + +**Also restored** (referenced by the plans and `velox/CMakeLists.txt`): + +- `main.cpp`, `CMakeLists.txt`, `README.md` + +Key change from the previous draft: plans no longer say “stashed / partial” — they document the **full implemented state**, including the broader broadcast strategy (all decimal column-scalar binops, not just null-aware ops), with a note that a narrower fix is possible after cuDF upstream lands. + +--- + +**User** + +The repro code you added does not build in-tree: + + + +--- + +**Cursor** + +Investigating the in-tree build failure for the repro target. + + +--- + +**User** + +[1/3] Building CXX object velox/cudf-binop-scalar-repro/CMakeFiles/cudf_nullmax_scalar_repro.dir/main.cpp.o +FAILED: [code=1] velox/cudf-binop-scalar-repro/CMakeFiles/cudf_nullmax_scalar_repro.dir/main.cpp.o +/usr/bin/ccache /opt/rh/gcc-toolset-14/root/usr/bin/g++ -DBS_THREAD_POOL_ENABLE_PAUSE=1 -DCCCL_DISABLE_PDL -DCUB_DISABLE_NAMESPACE_MAGIC -DCUB_IGNORE_NAMESPACE_MAGIC_ERROR -DFOLLY_HAS_COROUTINES=1 -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CUDA -DTHRUST_DISABLE_ABI_NAMESPACE -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP -DTHRUST_IGNORE_ABI_NAMESPACE_ERROR -DVELOX_ENABLE_ABFS -DVELOX_ENABLE_GCS -DVELOX_ENABLE_HDFS -DVELOX_ENABLE_PARQUET -DVELOX_ENABLE_S3 -I/workspace/velox/. -I/workspace/velox/velox/external/xxhash -I/opt/velox-build/release/_deps/dlpack-src/include -I/opt/velox-build/release/_deps/jitify-src -I/opt/velox-build/release/_deps/cudf-src/cpp/include -I/opt/velox-build/release/_deps/cudf-build/include -I/opt/velox-build/release/_deps/cccl-src/lib/cmake/thrust/../../../thrust -I/opt/velox-build/release/_deps/cccl-src/lib/cmake/libcudacxx/../../../libcudacxx/include -I/opt/velox-build/release/_deps/cccl-src/lib/cmake/cub/../../../cub -I/opt/velox-build/release/_deps/rapids_logger-src/include -I/opt/velox-build/release/_deps/rmm-src/cpp/include -I/opt/velox-build/release/_deps/rmm-build/include -I/opt/velox-build/release/_deps/nvtx3-src/c/include -I/opt/velox-build/release/_deps/bs_thread_pool-src/include -isystem /workspace/velox/velox -isystem /workspace/velox/velox/external -isystem /usr/local/cuda-12.9/targets/x86_64-linux/include -mavx2 -mfma -mavx -mf16c -mlzcnt -mbmi2 -D USE_VELOX_COMMON_BASE -D HAS_UNCAUGHT_EXCEPTIONS -Wall -Wextra -Wno-unused -Wno-unused-parameter -Wno-sign-compare -Wno-ignored-qualifiers -Wno-implicit-fallthrough -Wno-class-memaccess -Wno-comment -Wno-int-in-bool-context -Wno-redundant-move -Wno-array-bounds -Wno-maybe-uninitialized -Wno-unused-result -Wno-format-overflow -Wno-strict-aliasing -Wno-error=template-id-cdtor -Wno-overloaded-virtual -Wno-error=tautological-compare -Werror -O3 -DNDEBUG -std=gnu++20 -fPIE -fdiagnostics-color=always -fcoroutines -Wno-missing-field-initializers -MD -MT velox/cudf-binop-scalar-repro/CMakeFiles/cudf_nullmax_scalar_repro.dir/main.cpp.o -MF velox/cudf-binop-scalar-repro/CMakeFiles/cudf_nullmax_scalar_repro.dir/main.cpp.o.d -fmodules-ts -fmodule-mapper=velox/cudf-binop-scalar-repro/CMakeFiles/cudf_nullmax_scalar_repro.dir/main.cpp.o.modmap -MD -fdeps-format=p1689r5 -x c++ -o velox/cudf-binop-scalar-repro/CMakeFiles/cudf_nullmax_scalar_repro.dir/main.cpp.o -c /workspace/velox/velox/cudf-binop-scalar-repro/main.cpp +/workspace/velox/velox/cudf-binop-scalar-repro/main.cpp: In function 'void {anonymous}::run_case(const char*, cudf::binary_operator, bool)': +/workspace/velox/velox/cudf-binop-scalar-repro/main.cpp:107:29: error: no matching function for call to 'binary_operation(cudf::column_view, cudf::scalar&, cudf::binary_operator&, cudf::data_type&, rmm::cuda_stream_view, rmm::device_async_resource_ref (&)())' + 107 | cudf::binary_operation(col->view(), *scalar, op, out_type, stream(), mr); + | ~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +In file included from /workspace/velox/velox/cudf-binop-scalar-repro/main.cpp:21: +/opt/velox-build/release/_deps/cudf-src/cpp/include/cudf/binaryop.hpp:155:25: note: candidate: 'std::unique_ptr cudf::binary_operation(const scalar&, const column_view&, binary_operator, data_type, rmm::cuda_stream_view, rmm::device_async_resource_ref)' + 155 | std::unique_ptr binary_operation( + | ^~~~~~~~~~~~~~~~ +/opt/velox-build/release/_deps/cudf-src/cpp/include/cudf/binaryop.hpp:156:17: note: no known conversion for argument 1 from 'cudf::column_view' to 'const cudf::scalar&' + 156 | scalar const& lhs, + | ~~~~~~~~~~~~~~^~~ +/opt/velox-build/release/_deps/cudf-src/cpp/include/cudf/binaryop.hpp:186:25: note: candidate: 'std::unique_ptr cudf::binary_operation(const column_view&, const scalar&, binary_operator, data_type, rmm::cuda_stream_view, rmm::device_async_resource_ref)' + 186 | std::unique_ptr binary_operation( + | ^~~~~~~~~~~~~~~~ +/opt/velox-build/release/_deps/cudf-src/cpp/include/cudf/binaryop.hpp:192:34: note: no known conversion for argument 6 from 'rmm::device_async_resource_ref()' {aka 'cuda::mr::__4::__version_bump_ver4_::resource_ref()'} to 'rmm::device_async_resource_ref' {aka 'cuda::mr::__4::__version_bump_ver4_::resource_ref'} + 192 | rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +/opt/velox-build/release/_deps/cudf-src/cpp/include/cudf/binaryop.hpp:216:25: note: candidate: 'std::unique_ptr cudf::binary_operation(const column_view&, const column_view&, binary_operator, data_type, rmm::cuda_stream_view, rmm::device_async_resource_ref)' + 216 | std::unique_ptr binary_operation( + | ^~~~~~~~~~~~~~~~ +/opt/velox-build/release/_deps/cudf-src/cpp/include/cudf/binaryop.hpp:218:22: note: no known conversion for argument 2 from 'cudf::scalar' to 'const cudf::column_view&' + 218 | column_view const& rhs, + | ~~~~~~~~~~~~~~~~~~~^~~ +/opt/velox-build/release/_deps/cudf-src/cpp/include/cudf/binaryop.hpp:247:25: note: candidate: 'std::unique_ptr cudf::binary_operation(const column_view&, const column_view&, const std::string&, data_type, rmm::cuda_stream_view, rmm::device_async_resource_ref)' + 247 | std::unique_ptr binary_operation( + | ^~~~~~~~~~~~~~~~ +/opt/velox-build/release/_deps/cudf-src/cpp/include/cudf/binaryop.hpp:249:22: note: no known conversion for argument 2 from 'cudf::scalar' to 'const cudf::column_view&' + 249 | column_view const& rhs, + | ~~~~~~~~~~~~~~~~~~~^~~ +In file included from /opt/velox-build/release/_deps/cccl-src/lib/cmake/libcudacxx/../../../libcudacxx/include/cuda/__memory_resource/resource.h:28, + from /opt/velox-build/release/_deps/cccl-src/lib/cmake/libcudacxx/../../../libcudacxx/include/cuda/__memory_resource/any_resource.h:28, + from /opt/velox-build/release/_deps/cccl-src/lib/cmake/libcudacxx/../../../libcudacxx/include/cuda/__memory_pool/memory_pool_base.h:28, + from /opt/velox-build/release/_deps/cccl-src/lib/cmake/libcudacxx/../../../libcudacxx/include/cuda/__memory_pool/device_memory_pool.h:26, + from /opt/velox-build/release/_deps/cccl-src/lib/cmake/libcudacxx/../../../libcudacxx/include/cuda/memory_resource:29, + from /opt/velox-build/release/_deps/rmm-src/cpp/include/rmm/mr/cuda_memory_resource.hpp:11, + from /opt/velox-build/release/_deps/rmm-src/cpp/include/rmm/mr/per_device_resource.hpp:11, + from /opt/velox-build/release/_deps/rmm-src/cpp/include/rmm/device_buffer.hpp:12, + from /opt/velox-build/release/_deps/cudf-src/cpp/include/cudf/utilities/span.hpp:11, + from /opt/velox-build/release/_deps/cudf-src/cpp/include/cudf/column/column_view.hpp:10, + from /opt/velox-build/release/_deps/cudf-src/cpp/include/cudf/column/column.hpp:7, + from /opt/velox-build/release/_deps/cudf-src/cpp/include/cudf/binaryop.hpp:8: +/opt/velox-build/release/_deps/cccl-src/lib/cmake/libcudacxx/../../../libcudacxx/include/cuda/__utility/__basic_any/semiregular.h: In instantiation of 'cuda::std::__4::enable_if_t, bool> cuda::__4::__equal_fn(const _Tp&, cuda::std::__4::__type_info_ref, const void*) [with _Tp = cuda::mr::__4::__version_bump_ver4_::resource_ref(); cuda::std::__4::enable_if_t, bool> = bool; cuda::std::__4::__type_info_ref = const std::type_info&]': +/opt/velox-build/release/_deps/cccl-src/lib/cmake/libcudacxx/../../../libcudacxx/include/cuda/__utility/__basic_any/semiregular.h:247:9: required by substitution of 'template requires equality_comparable<_Tp> using cuda::__4::iequality_comparable_base::overrides = cuda::__4::__overrides_for<_Tp, (& __equal_fn<_Tp>)> [with _Tp = cuda::mr::__4::__version_bump_ver4_::resource_ref()]' + 247 | using overrides _CCCL_NODEBUG_ALIAS = __overrides_for<_Tp, &::cuda::__equal_fn<_Tp>>; + | ^~~~~~~~~ +/opt/velox-build/release/_deps/cccl-src/lib/cmake/libcudacxx/../../../libcudacxx/include/cuda/__utility/__basic_any/overrides.h:33:7: required by substitution of 'template using cuda::__4::__overrides_for_t = typename _Interface::overrides<_Tp> [with _Interface = cuda::__4::__iequality_comparable<>; _Tp = cuda::mr::__4::__version_bump_ver4_::resource_ref()]' + 33 | using __overrides_for_t _CCCL_NODEBUG_ALIAS = typename _Interface::template overrides<_Tp>; + | ^~~~~~~~~~~~~~~~~ +/opt/velox-build/release/_deps/cccl-src/lib/cmake/libcudacxx/../../../libcudacxx/include/cuda/std/__type_traits/is_valid_expansion.h:29:61: required by substitution of 'template class _Templ, class ... _Args, class> cuda::std::__4::true_type cuda::std::__4::__sfinae_test_impl(int) [with _Templ = cuda::__4::__overrides_for_t; _Args = {cuda::__4::__iequality_comparable<>, cuda::mr::__4::__version_bump_ver4_::resource_ref()}; = cuda::__4::__overrides_list(), cuda::std::__4::integral_constant (&)(), const std::type_info&, const void*), cuda::__4::__equal_fn()> > >]' + 29 | template