-
Notifications
You must be signed in to change notification settings - Fork 4.1k
GH-47769: [C++] SVE dynamic dispatch #49756
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
f2be539
1d18257
61fa3dc
9b7e49c
9c4ec4b
26ed7eb
3a3cccb
2f69dcd
f031c68
8191258
9032864
b04f250
9669fc9
80e0ab2
b1cb14b
cbf526f
defc062
1ebc10c
7ecbd23
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -33,16 +33,29 @@ struct UnpackDynamicFunction { | |
|
|
||
| static constexpr auto implementations() { | ||
| return std::array{ | ||
| // x86 implementations | ||
| #if defined(ARROW_HAVE_SSE4_2) | ||
| Implementation{DispatchLevel::NONE, &bpacking::unpack_sse4_2<Uint>}, | ||
| #else | ||
| Implementation{DispatchLevel::NONE, &bpacking::unpack_scalar<Uint>}, | ||
| #endif | ||
| #if defined(ARROW_HAVE_RUNTIME_AVX2) | ||
| # if defined(ARROW_HAVE_RUNTIME_AVX2) | ||
| Implementation{DispatchLevel::AVX2, &bpacking::unpack_avx2<Uint>}, | ||
| #endif | ||
| #if defined(ARROW_HAVE_RUNTIME_AVX512) | ||
| # endif | ||
| # if defined(ARROW_HAVE_RUNTIME_AVX512) | ||
| Implementation{DispatchLevel::AVX512, &bpacking::unpack_avx512<Uint>}, | ||
| # endif | ||
|
|
||
| // ARM implementations | ||
| #elif defined(ARROW_HAVE_NEON) | ||
| Implementation{DispatchLevel::NONE, &bpacking::unpack_neon<Uint>}, | ||
| # if defined(ARROW_HAVE_RUNTIME_SVE128) | ||
| Implementation{DispatchLevel::SVE128, &bpacking::unpack_sve128<Uint>}, | ||
| # endif | ||
| # if defined(ARROW_HAVE_RUNTIME_SVE256) | ||
|
AntoinePrv marked this conversation as resolved.
|
||
| Implementation{DispatchLevel::SVE256, &bpacking::unpack_sve256<Uint>}, | ||
| # endif | ||
|
|
||
| // Other implementations | ||
| #else | ||
| Implementation{DispatchLevel::NONE, &bpacking::unpack_scalar<Uint>}, | ||
| #endif | ||
| }; | ||
| } | ||
|
|
@@ -52,12 +65,14 @@ struct UnpackDynamicFunction { | |
|
|
||
| template <typename Uint> | ||
| void unpack(const uint8_t* in, Uint* out, const UnpackOptions& opts) { | ||
| #if defined(ARROW_HAVE_NEON) | ||
| return bpacking::unpack_neon(in, out, opts); | ||
| #else | ||
| static DynamicDispatch<UnpackDynamicFunction<Uint> > dispatch; | ||
| return dispatch.func(in, out, opts); | ||
| #endif | ||
| auto constexpr kImplementations = UnpackDynamicFunction<Uint>::implementations(); | ||
| if constexpr (kImplementations.size() == 1) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this condition actually useful? I guess it's a shortcut, but it's not obvious that it applies to common cases (x86 or ARM with default SIMD options). At worst, this could be added generically to the dispatch helper itself (the inline identifier was lost in extraction; presumably `DynamicDispatch`).
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It is worth it to avoid additional [inline text lost in extraction].
Actually, this is done in GH-49840, so either way works here (we'd need to adapt whichever PR is not merged first).
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
That PR might prove difficult to adapt for all the lousy compilers we have to support, so I'd rather focus on this one first :) |
||
| constexpr auto func = kImplementations.front().second; | ||
| func(in, out, opts); | ||
| } else { | ||
| static DynamicDispatch<UnpackDynamicFunction<Uint> > dispatch; | ||
| return dispatch.func(in, out, opts); | ||
| } | ||
| } | ||
|
|
||
| template void unpack<bool>(const uint8_t*, bool*, const UnpackOptions&); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,7 +26,7 @@ | |
| #include "arrow/util/bpacking_scalar_internal.h" | ||
| #include "arrow/util/bpacking_simd_internal.h" | ||
|
|
||
| #if defined(ARROW_HAVE_RUNTIME_AVX2) | ||
| #if defined(ARROW_HAVE_RUNTIME_AVX2) || defined(ARROW_HAVE_RUNTIME_SVE128) | ||
| # include "arrow/util/cpu_info.h" | ||
| #endif | ||
|
|
||
|
|
@@ -107,10 +107,10 @@ void BM_Unpack(benchmark::State& state, bool aligned, UnpackFunc<Int> unpack, bo | |
| // will not emit runs larger than 512 (though other implementations might), so we biased | ||
| // the benchmarks towards a rather small scale. | ||
| static const auto kNumValuesRange = benchmark::CreateRange(32, 512, 2); | ||
| constexpr std::initializer_list<int64_t> kBitWidths8 = {1, 2, 8}; | ||
| constexpr std::initializer_list<int64_t> kBitWidths16 = {1, 2, 8, 13}; | ||
| constexpr std::initializer_list<int64_t> kBitWidths32 = {1, 2, 8, 20}; | ||
| constexpr std::initializer_list<int64_t> kBitWidths64 = {1, 2, 8, 20, 47}; | ||
| constexpr auto kBitWidths8 = std::initializer_list<int64_t>{1, 2, 8}; | ||
| constexpr auto kBitWidths16 = std::initializer_list<int64_t>{1, 2, 8, 13}; | ||
| constexpr auto kBitWidths32 = std::initializer_list<int64_t>{1, 2, 8, 20}; | ||
| constexpr auto kBitWidths64 = std::initializer_list<int64_t>{1, 2, 8, 20, 47}; | ||
|
|
||
| static const std::vector<std::vector<int64_t>> kBitWidthsNumValuesBool = { | ||
| {0, 1}, | ||
|
|
@@ -159,125 +159,69 @@ void BM_UnpackUint64(benchmark::State& state, bool aligned, UnpackFunc<uint64_t> | |
| return BM_Unpack<uint64_t>(state, aligned, unpack, skip, std::move(skip_msg)); | ||
| } | ||
|
|
||
| BENCHMARK_CAPTURE(BM_UnpackBool, ScalarUnaligned, false, &bpacking::unpack_scalar<bool>) | ||
| ->ArgsProduct(kBitWidthsNumValuesBool); | ||
| BENCHMARK_CAPTURE(BM_UnpackUint8, ScalarUnaligned, false, | ||
| &bpacking::unpack_scalar<uint8_t>) | ||
| ->ArgsProduct(kBitWidthsNumValues8); | ||
| BENCHMARK_CAPTURE(BM_UnpackUint16, ScalarUnaligned, false, | ||
| &bpacking::unpack_scalar<uint16_t>) | ||
| ->ArgsProduct(kBitWidthsNumValues16); | ||
| BENCHMARK_CAPTURE(BM_UnpackUint32, ScalarUnaligned, false, | ||
| &bpacking::unpack_scalar<uint32_t>) | ||
| ->ArgsProduct(kBitWidthsNumValues32); | ||
| BENCHMARK_CAPTURE(BM_UnpackUint64, ScalarUnaligned, false, | ||
| &bpacking::unpack_scalar<uint64_t>) | ||
| ->ArgsProduct(kBitWidthsNumValues64); | ||
| // Register BM_Unpack{Bool,Uint8,Uint16,Uint32,Uint64} benchmarks for a given | ||
| // UNPACK_FUNC templated on each of those types, with explicit skip args. | ||
| #define BENCHMARK_UNPACK_ALL_TYPES_SKIP(LABEL, ALIGNED, UNPACK_FUNC, SKIP, SKIP_MSG) \ | ||
| BENCHMARK_CAPTURE(BM_UnpackBool, LABEL, ALIGNED, &UNPACK_FUNC<bool>, SKIP, SKIP_MSG) \ | ||
| ->ArgsProduct(kBitWidthsNumValuesBool); \ | ||
| BENCHMARK_CAPTURE(BM_UnpackUint8, LABEL, ALIGNED, &UNPACK_FUNC<uint8_t>, SKIP, \ | ||
| SKIP_MSG) \ | ||
| ->ArgsProduct(kBitWidthsNumValues8); \ | ||
| BENCHMARK_CAPTURE(BM_UnpackUint16, LABEL, ALIGNED, &UNPACK_FUNC<uint16_t>, SKIP, \ | ||
| SKIP_MSG) \ | ||
| ->ArgsProduct(kBitWidthsNumValues16); \ | ||
| BENCHMARK_CAPTURE(BM_UnpackUint32, LABEL, ALIGNED, &UNPACK_FUNC<uint32_t>, SKIP, \ | ||
| SKIP_MSG) \ | ||
| ->ArgsProduct(kBitWidthsNumValues32); \ | ||
| BENCHMARK_CAPTURE(BM_UnpackUint64, LABEL, ALIGNED, &UNPACK_FUNC<uint64_t>, SKIP, \ | ||
| SKIP_MSG) \ | ||
| ->ArgsProduct(kBitWidthsNumValues64) | ||
|
|
||
| #define BENCHMARK_UNPACK_ALL_TYPES(LABEL, ALIGNED, UNPACK_FUNC) \ | ||
| BENCHMARK_UNPACK_ALL_TYPES_SKIP(LABEL, ALIGNED, UNPACK_FUNC, false, "") | ||
|
|
||
| #define BENCHMARK_UNPACK_ALL_TYPES_RUNTIME(LABEL, ALIGNED, UNPACK_FUNC, CPU_FEATURE, \ | ||
| SKIP_MSG) \ | ||
| BENCHMARK_UNPACK_ALL_TYPES_SKIP( \ | ||
| LABEL, ALIGNED, UNPACK_FUNC, \ | ||
| !CpuInfo::GetInstance()->IsSupported(CpuInfo::CPU_FEATURE), SKIP_MSG) | ||
|
|
||
| BENCHMARK_UNPACK_ALL_TYPES(ScalarUnaligned, false, bpacking::unpack_scalar); | ||
|
|
||
| #if defined(ARROW_HAVE_SSE4_2) | ||
| BENCHMARK_CAPTURE(BM_UnpackBool, Sse42Unaligned, false, &bpacking::unpack_sse4_2<bool>) | ||
| ->ArgsProduct(kBitWidthsNumValuesBool); | ||
| BENCHMARK_CAPTURE(BM_UnpackUint8, Sse42Unaligned, false, | ||
| &bpacking::unpack_sse4_2<uint8_t>) | ||
| ->ArgsProduct(kBitWidthsNumValues8); | ||
| BENCHMARK_CAPTURE(BM_UnpackUint16, Sse42Unaligned, false, | ||
| &bpacking::unpack_sse4_2<uint16_t>) | ||
| ->ArgsProduct(kBitWidthsNumValues16); | ||
| BENCHMARK_CAPTURE(BM_UnpackUint32, Sse42Unaligned, false, | ||
| &bpacking::unpack_sse4_2<uint32_t>) | ||
| ->ArgsProduct(kBitWidthsNumValues32); | ||
| BENCHMARK_CAPTURE(BM_UnpackUint64, Sse42Unaligned, false, | ||
| &bpacking::unpack_sse4_2<uint64_t>) | ||
| ->ArgsProduct(kBitWidthsNumValues64); | ||
| BENCHMARK_UNPACK_ALL_TYPES(Sse42Unaligned, false, bpacking::unpack_sse4_2); | ||
| #endif | ||
|
|
||
| #if defined(ARROW_HAVE_RUNTIME_AVX2) | ||
| BENCHMARK_CAPTURE(BM_UnpackBool, Avx2Unaligned, false, &bpacking::unpack_avx2<bool>, | ||
| !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2), | ||
| "Avx2 not available") | ||
| ->ArgsProduct(kBitWidthsNumValuesBool); | ||
| BENCHMARK_CAPTURE(BM_UnpackUint8, Avx2Unaligned, false, &bpacking::unpack_avx2<uint8_t>, | ||
| !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2), | ||
| "Avx2 not available") | ||
| ->ArgsProduct(kBitWidthsNumValues8); | ||
| BENCHMARK_CAPTURE(BM_UnpackUint16, Avx2Unaligned, false, &bpacking::unpack_avx2<uint16_t>, | ||
| !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2), | ||
| "Avx2 not available") | ||
| ->ArgsProduct(kBitWidthsNumValues16); | ||
| BENCHMARK_CAPTURE(BM_UnpackUint32, Avx2Unaligned, false, &bpacking::unpack_avx2<uint32_t>, | ||
| !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2), | ||
| "Avx2 not available") | ||
| ->ArgsProduct(kBitWidthsNumValues32); | ||
| BENCHMARK_CAPTURE(BM_UnpackUint64, Avx2Unaligned, false, &bpacking::unpack_avx2<uint64_t>, | ||
| !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2), | ||
| "Avx2 not available") | ||
| ->ArgsProduct(kBitWidthsNumValues64); | ||
| BENCHMARK_UNPACK_ALL_TYPES_RUNTIME(Avx2Unaligned, false, bpacking::unpack_avx2, AVX2, | ||
| "Avx2 not available"); | ||
| #endif | ||
|
|
||
| #if defined(ARROW_HAVE_RUNTIME_AVX512) | ||
| BENCHMARK_CAPTURE(BM_UnpackBool, Avx512Unaligned, false, &bpacking::unpack_avx512<bool>, | ||
| !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512), | ||
| "Avx512 not available") | ||
| ->ArgsProduct(kBitWidthsNumValuesBool); | ||
| BENCHMARK_CAPTURE(BM_UnpackUint8, Avx512Unaligned, false, | ||
| &bpacking::unpack_avx512<uint8_t>, | ||
| !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512), | ||
| "Avx512 not available") | ||
| ->ArgsProduct(kBitWidthsNumValues8); | ||
| BENCHMARK_CAPTURE(BM_UnpackUint16, Avx512Unaligned, false, | ||
| &bpacking::unpack_avx512<uint16_t>, | ||
| !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512), | ||
| "Avx512 not available") | ||
| ->ArgsProduct(kBitWidthsNumValues16); | ||
| BENCHMARK_CAPTURE(BM_UnpackUint32, Avx512Unaligned, false, | ||
| &bpacking::unpack_avx512<uint32_t>, | ||
| !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512), | ||
| "Avx512 not available") | ||
| ->ArgsProduct(kBitWidthsNumValues32); | ||
| BENCHMARK_CAPTURE(BM_UnpackUint64, Avx512Unaligned, false, | ||
| &bpacking::unpack_avx512<uint64_t>, | ||
| !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512), | ||
| "Avx512 not available") | ||
| ->ArgsProduct(kBitWidthsNumValues64); | ||
| BENCHMARK_UNPACK_ALL_TYPES_RUNTIME(Avx512Unaligned, false, bpacking::unpack_avx512, | ||
| AVX512, "Avx512 not available"); | ||
| #endif | ||
|
|
||
| #if defined(ARROW_HAVE_NEON) | ||
| BENCHMARK_CAPTURE(BM_UnpackBool, NeonUnaligned, false, &bpacking::unpack_neon<bool>) | ||
| ->ArgsProduct(kBitWidthsNumValuesBool); | ||
| BENCHMARK_CAPTURE(BM_UnpackUint8, NeonUnaligned, false, &bpacking::unpack_neon<uint8_t>) | ||
| ->ArgsProduct(kBitWidthsNumValues8); | ||
| BENCHMARK_CAPTURE(BM_UnpackUint16, NeonUnaligned, false, &bpacking::unpack_neon<uint16_t>) | ||
| ->ArgsProduct(kBitWidthsNumValues16); | ||
| BENCHMARK_CAPTURE(BM_UnpackUint32, NeonUnaligned, false, &bpacking::unpack_neon<uint32_t>) | ||
| ->ArgsProduct(kBitWidthsNumValues32); | ||
| BENCHMARK_CAPTURE(BM_UnpackUint64, NeonUnaligned, false, &bpacking::unpack_neon<uint64_t>) | ||
| ->ArgsProduct(kBitWidthsNumValues64); | ||
| BENCHMARK_UNPACK_ALL_TYPES(NeonUnaligned, false, bpacking::unpack_neon); | ||
| #endif | ||
|
|
||
| #if defined(ARROW_HAVE_RUNTIME_SVE128) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I wonder if there's an easy way to reduce the duplication we're doing for each runtime SIMD level? For example, if we could write something like:
```cpp
BENCHMARK_SIMD_UNPACK(Bool, bool, SVE128, Sve128, sve128);
```
and it would expand to:
```cpp
BENCHMARK_CAPTURE(BM_UnpackBool, Sve128Unaligned, false, &bpacking::unpack_sve128<bool>,
                  !CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE128),
                  "Sve128 not available")
    ->ArgsProduct(kBitWidthsNumValues<bool>);
```
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You mean with a macro?
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes! |
||
| BENCHMARK_UNPACK_ALL_TYPES_RUNTIME(Sve128Unaligned, false, bpacking::unpack_sve128, | ||
| SVE128, "Sve128 not available"); | ||
| #endif | ||
|
|
||
| BENCHMARK_CAPTURE(BM_UnpackBool, DynamicAligned, true, &unpack<bool>) | ||
| ->ArgsProduct(kBitWidthsNumValuesBool); | ||
| BENCHMARK_CAPTURE(BM_UnpackBool, DynamicUnaligned, false, &unpack<bool>) | ||
| ->ArgsProduct(kBitWidthsNumValuesBool); | ||
|
|
||
| BENCHMARK_CAPTURE(BM_UnpackUint8, DynamicAligned, true, &unpack<uint8_t>) | ||
| ->ArgsProduct(kBitWidthsNumValues8); | ||
| BENCHMARK_CAPTURE(BM_UnpackUint8, DynamicUnaligned, false, &unpack<uint8_t>) | ||
| ->ArgsProduct(kBitWidthsNumValues8); | ||
|
|
||
| BENCHMARK_CAPTURE(BM_UnpackUint16, DynamicAligned, true, &unpack<uint16_t>) | ||
| ->ArgsProduct(kBitWidthsNumValues16); | ||
| BENCHMARK_CAPTURE(BM_UnpackUint16, DynamicUnaligned, false, &unpack<uint16_t>) | ||
| ->ArgsProduct(kBitWidthsNumValues16); | ||
|
|
||
| BENCHMARK_CAPTURE(BM_UnpackUint32, DynamicAligned, true, &unpack<uint32_t>) | ||
| ->ArgsProduct(kBitWidthsNumValues32); | ||
| BENCHMARK_CAPTURE(BM_UnpackUint32, DynamicUnaligned, false, &unpack<uint32_t>) | ||
| ->ArgsProduct(kBitWidthsNumValues32); | ||
|
|
||
| BENCHMARK_CAPTURE(BM_UnpackUint64, DynamicAligned, true, &unpack<uint64_t>) | ||
| ->ArgsProduct(kBitWidthsNumValues64); | ||
| BENCHMARK_CAPTURE(BM_UnpackUint64, DynamicUnaligned, false, &unpack<uint64_t>) | ||
| ->ArgsProduct(kBitWidthsNumValues64); | ||
| #if defined(ARROW_HAVE_RUNTIME_SVE256) | ||
| BENCHMARK_UNPACK_ALL_TYPES_RUNTIME(Sve256Unaligned, false, bpacking::unpack_sve256, | ||
| SVE256, "Sve256 not available"); | ||
| #endif | ||
|
|
||
| BENCHMARK_UNPACK_ALL_TYPES(DynamicAligned, true, unpack); | ||
| BENCHMARK_UNPACK_ALL_TYPES(DynamicUnaligned, false, unpack); | ||
|
|
||
| #undef BENCHMARK_UNPACK_ALL_TYPES_RUNTIME | ||
| #undef BENCHMARK_UNPACK_ALL_TYPES | ||
| #undef BENCHMARK_UNPACK_ALL_TYPES_SKIP | ||
|
|
||
| } // namespace | ||
| } // namespace arrow::internal | ||
Uh oh!
There was an error while loading. Please reload this page.