Skip to content

Commit 40d40f8

Browse files
committed
Add bpacking Sve 256 dynamic dispatch
1 parent 2e209a4 commit 40d40f8

File tree

7 files changed

+138
-59
lines changed

7 files changed

+138
-59
lines changed

cpp/src/arrow/CMakeLists.txt

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -519,7 +519,7 @@ set(ARROW_UTIL_SRCS
519519
util/bitmap_ops.cc
520520
util/bpacking.cc
521521
util/bpacking_scalar.cc
522-
util/bpacking_simd_default.cc
522+
util/bpacking_simd_128.cc
523523
util/byte_size.cc
524524
util/byte_stream_split_internal.cc
525525
util/cancel.cc
@@ -564,9 +564,11 @@ set(ARROW_UTIL_SRCS
564564

565565
append_runtime_avx2_src(ARROW_UTIL_SRCS util/byte_stream_split_internal_avx2.cc)
566566

567-
append_runtime_avx2_src(ARROW_UTIL_SRCS util/bpacking_simd_avx2.cc)
567+
append_runtime_avx2_src(ARROW_UTIL_SRCS util/bpacking_simd_256.cc)
568568
append_runtime_avx512_src(ARROW_UTIL_SRCS util/bpacking_simd_avx512.cc)
569569

570+
append_runtime_sve256_src(ARROW_UTIL_SRCS util/bpacking_simd_256.cc)
571+
570572
if(ARROW_WITH_BROTLI)
571573
list(APPEND ARROW_UTIL_SRCS util/compression_brotli.cc)
572574
endif()

cpp/src/arrow/util/bpacking.cc

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -33,16 +33,26 @@ struct UnpackDynamicFunction {
3333

3434
static constexpr auto implementations() {
3535
return std::array{
36+
// x86 implementations
3637
#if defined(ARROW_HAVE_SSE4_2)
3738
Implementation{DispatchLevel::NONE, &bpacking::unpack_sse4_2<Uint>},
38-
#else
39-
Implementation{DispatchLevel::NONE, &bpacking::unpack_scalar<Uint>},
40-
#endif
41-
#if defined(ARROW_HAVE_RUNTIME_AVX2)
39+
# if defined(ARROW_HAVE_RUNTIME_AVX2)
4240
Implementation{DispatchLevel::AVX2, &bpacking::unpack_avx2<Uint>},
43-
#endif
44-
#if defined(ARROW_HAVE_RUNTIME_AVX512)
41+
# endif
42+
# if defined(ARROW_HAVE_RUNTIME_AVX512)
4543
Implementation{DispatchLevel::AVX512, &bpacking::unpack_avx512<Uint>},
44+
# endif
45+
46+
// ARM implementations
47+
#elif defined(ARROW_HAVE_NEON)
48+
Implementation{DispatchLevel::NONE, &bpacking::unpack_neon<Uint>},
49+
# if defined(ARROW_HAVE_RUNTIME_SVE256)
50+
Implementation{DispatchLevel::SVE256, &bpacking::unpack_sve256<Uint>},
51+
# endif
52+
53+
// Other implementations
54+
#else
55+
Implementation{DispatchLevel::NONE, &bpacking::unpack_scalar<Uint>},
4656
#endif
4757
};
4858
}
@@ -52,12 +62,14 @@ struct UnpackDynamicFunction {
5262

5363
template <typename Uint>
5464
void unpack(const uint8_t* in, Uint* out, const UnpackOptions& opts) {
55-
#if defined(ARROW_HAVE_NEON)
56-
return bpacking::unpack_neon(in, out, opts);
57-
#else
58-
static DynamicDispatch<UnpackDynamicFunction<Uint> > dispatch;
59-
return dispatch.func(in, out, opts);
60-
#endif
65+
auto constexpr kImplementations = UnpackDynamicFunction<Uint>::implementations();
66+
if constexpr (kImplementations.size() == 1) {
67+
constexpr auto func = kImplementations.front().second;
68+
func(in, out, opts);
69+
} else {
70+
static DynamicDispatch<UnpackDynamicFunction<Uint> > dispatch;
71+
return dispatch.func(in, out, opts);
72+
}
6173
}
6274

6375
template void unpack<bool>(const uint8_t*, bool*, const UnpackOptions&);

cpp/src/arrow/util/bpacking_benchmark.cc

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
#include "arrow/util/bpacking_scalar_internal.h"
2727
#include "arrow/util/bpacking_simd_internal.h"
2828

29-
#if defined(ARROW_HAVE_RUNTIME_AVX2)
29+
#if defined(ARROW_HAVE_RUNTIME_AVX2) || defined(ARROW_HAVE_RUNTIME_SVE256)
3030
# include "arrow/util/cpu_info.h"
3131
#endif
3232

@@ -254,6 +254,33 @@ BENCHMARK_CAPTURE(BM_UnpackUint64, NeonUnaligned, false, &bpacking::unpack_neon<
254254
->ArgsProduct(kBitWidthsNumValues64);
255255
#endif
256256

257+
#if defined(ARROW_HAVE_RUNTIME_SVE256)
258+
BENCHMARK_CAPTURE(BM_UnpackBool, Sve256Unaligned, false, &bpacking::unpack_sve256<bool>,
259+
!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE256),
260+
"Sve256 not available")
261+
->ArgsProduct(kBitWidthsNumValuesBool);
262+
BENCHMARK_CAPTURE(BM_UnpackUint8, Sve256Unaligned, false,
263+
&bpacking::unpack_sve256<uint8_t>,
264+
!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE256),
265+
"Sve256 not available")
266+
->ArgsProduct(kBitWidthsNumValues8);
267+
BENCHMARK_CAPTURE(BM_UnpackUint16, Sve256Unaligned, false,
268+
&bpacking::unpack_sve256<uint16_t>,
269+
!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE256),
270+
"Sve256 not available")
271+
->ArgsProduct(kBitWidthsNumValues16);
272+
BENCHMARK_CAPTURE(BM_UnpackUint32, Sve256Unaligned, false,
273+
&bpacking::unpack_sve256<uint32_t>,
274+
!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE256),
275+
"Sve256 not available")
276+
->ArgsProduct(kBitWidthsNumValues32);
277+
BENCHMARK_CAPTURE(BM_UnpackUint64, Sve256Unaligned, false,
278+
&bpacking::unpack_sve256<uint64_t>,
279+
!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE256),
280+
"Sve256 not available")
281+
->ArgsProduct(kBitWidthsNumValues64);
282+
#endif
283+
257284
BENCHMARK_CAPTURE(BM_UnpackBool, DynamicAligned, true, &unpack<bool>)
258285
->ArgsProduct(kBitWidthsNumValuesBool);
259286
BENCHMARK_CAPTURE(BM_UnpackBool, DynamicUnaligned, false, &unpack<bool>)
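cpp/src/arrow/util/bpacking_simd_default.cc → cpp/src/arrow/util/bpacking_simd_128.cc

File renamed without changes.

cpp/src/arrow/util/bpacking_simd_avx2.cc → cpp/src/arrow/util/bpacking_simd_256.cc
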
Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,25 +15,36 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
#include "arrow/util/bpacking_dispatch_internal.h"
19-
#include "arrow/util/bpacking_internal.h"
20-
#include "arrow/util/bpacking_simd_internal.h"
21-
#include "arrow/util/bpacking_simd_kernel_internal.h"
18+
#if defined(ARROW_HAVE_SVE256) || defined(ARROW_HAVE_RUNTIME_SVE256)
19+
# define UNPACK_PLATFORM unpack_sve256
20+
#elif defined(ARROW_HAVE_RUNTIME_AVX2)
21+
# define UNPACK_PLATFORM unpack_avx2
22+
#endif
23+
24+
#if defined(UNPACK_PLATFORM)
25+
26+
# include "arrow/util/bpacking_dispatch_internal.h"
27+
# include "arrow/util/bpacking_internal.h"
28+
# include "arrow/util/bpacking_simd_internal.h"
29+
# include "arrow/util/bpacking_simd_kernel_internal.h"
2230

2331
namespace arrow::internal::bpacking {
2432

2533
template <typename UnpackedUint, int kPackedBitSize>
2634
using Simd256Kernel = Kernel<UnpackedUint, kPackedBitSize, 256>;
2735

2836
template <typename Uint>
29-
void unpack_avx2(const uint8_t* in, Uint* out, const UnpackOptions& opts) {
37+
void UNPACK_PLATFORM(const uint8_t* in, Uint* out, const UnpackOptions& opts) {
3038
return unpack_jump<Simd256Kernel>(in, out, opts);
3139
}
3240

33-
template void unpack_avx2<bool>(const uint8_t*, bool*, const UnpackOptions&);
34-
template void unpack_avx2<uint8_t>(const uint8_t*, uint8_t*, const UnpackOptions&);
35-
template void unpack_avx2<uint16_t>(const uint8_t*, uint16_t*, const UnpackOptions&);
36-
template void unpack_avx2<uint32_t>(const uint8_t*, uint32_t*, const UnpackOptions&);
37-
template void unpack_avx2<uint64_t>(const uint8_t*, uint64_t*, const UnpackOptions&);
41+
template void UNPACK_PLATFORM<bool>(const uint8_t*, bool*, const UnpackOptions&);
42+
template void UNPACK_PLATFORM<uint8_t>(const uint8_t*, uint8_t*, const UnpackOptions&);
43+
template void UNPACK_PLATFORM<uint16_t>(const uint8_t*, uint16_t*, const UnpackOptions&);
44+
template void UNPACK_PLATFORM<uint32_t>(const uint8_t*, uint32_t*, const UnpackOptions&);
45+
template void UNPACK_PLATFORM<uint64_t>(const uint8_t*, uint64_t*, const UnpackOptions&);
3846

3947
} // namespace arrow::internal::bpacking
48+
49+
# undef UNPACK_PLATFORM
50+
#endif // UNPACK_PLATFORM

cpp/src/arrow/util/bpacking_simd_internal.h

Lines changed: 27 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -25,68 +25,62 @@
2525
namespace arrow::internal::bpacking {
2626

2727
#if defined(ARROW_HAVE_NEON)
28-
29-
template <typename Uint>
30-
ARROW_EXPORT void unpack_neon(const uint8_t* in, Uint* out, const UnpackOptions& opts);
31-
32-
extern template ARROW_TEMPLATE_EXPORT void unpack_neon<bool>( //
33-
const uint8_t* in, bool* out, const UnpackOptions& opts);
34-
35-
extern template ARROW_TEMPLATE_EXPORT void unpack_neon<uint8_t>(
36-
const uint8_t* in, uint8_t* out, const UnpackOptions& opts);
37-
38-
extern template ARROW_TEMPLATE_EXPORT void unpack_neon<uint16_t>(
39-
const uint8_t* in, uint16_t* out, const UnpackOptions& opts);
40-
41-
extern template ARROW_TEMPLATE_EXPORT void unpack_neon<uint32_t>(
42-
const uint8_t* in, uint32_t* out, const UnpackOptions& opts);
43-
44-
extern template ARROW_TEMPLATE_EXPORT void unpack_neon<uint64_t>(
45-
const uint8_t* in, uint64_t* out, const UnpackOptions& opts);
46-
28+
# define UNPACK_ARCH128 unpack_neon
4729
#elif defined(ARROW_HAVE_SSE4_2)
30+
# define UNPACK_ARCH128 unpack_sse4_2
31+
#endif
32+
33+
#if defined(UNPACK_ARCH128)
4834

4935
template <typename Uint>
50-
ARROW_EXPORT void unpack_sse4_2(const uint8_t* in, Uint* out, const UnpackOptions& opts);
36+
ARROW_EXPORT void UNPACK_ARCH128(const uint8_t* in, Uint* out, const UnpackOptions& opts);
5137

52-
extern template ARROW_TEMPLATE_EXPORT void unpack_sse4_2<bool>( //
38+
extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128<bool>( //
5339
const uint8_t* in, bool* out, const UnpackOptions& opts);
5440

55-
extern template ARROW_TEMPLATE_EXPORT void unpack_sse4_2<uint8_t>(
41+
extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128<uint8_t>(
5642
const uint8_t* in, uint8_t* out, const UnpackOptions& opts);
5743

58-
extern template ARROW_TEMPLATE_EXPORT void unpack_sse4_2<uint16_t>(
44+
extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128<uint16_t>(
5945
const uint8_t* in, uint16_t* out, const UnpackOptions& opts);
6046

61-
extern template ARROW_TEMPLATE_EXPORT void unpack_sse4_2<uint32_t>(
47+
extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128<uint32_t>(
6248
const uint8_t* in, uint32_t* out, const UnpackOptions& opts);
6349

64-
extern template ARROW_TEMPLATE_EXPORT void unpack_sse4_2<uint64_t>(
50+
extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128<uint64_t>(
6551
const uint8_t* in, uint64_t* out, const UnpackOptions& opts);
6652

53+
#endif // UNPACK_ARCH128
54+
#undef UNPACK_ARCH128
55+
56+
#if defined(ARROW_HAVE_SVE256) || defined(ARROW_HAVE_RUNTIME_SVE256)
57+
# define UNPACK_ARCH256 unpack_sve256
58+
#elif defined(ARROW_HAVE_AVX2) || defined(ARROW_HAVE_RUNTIME_AVX2)
59+
# define UNPACK_ARCH256 unpack_avx2
6760
#endif
6861

69-
#if defined(ARROW_HAVE_AVX2) || defined(ARROW_HAVE_RUNTIME_AVX2)
62+
#if defined(UNPACK_ARCH256)
7063

7164
template <typename Uint>
72-
ARROW_EXPORT void unpack_avx2(const uint8_t* in, Uint* out, const UnpackOptions& opts);
65+
ARROW_EXPORT void UNPACK_ARCH256(const uint8_t* in, Uint* out, const UnpackOptions& opts);
7366

74-
extern template ARROW_TEMPLATE_EXPORT void unpack_avx2<bool>( //
67+
extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH256<bool>( //
7568
const uint8_t* in, bool* out, const UnpackOptions& opts);
7669

77-
extern template ARROW_TEMPLATE_EXPORT void unpack_avx2<uint8_t>(
70+
extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH256<uint8_t>(
7871
const uint8_t* in, uint8_t* out, const UnpackOptions& opts);
7972

80-
extern template ARROW_TEMPLATE_EXPORT void unpack_avx2<uint16_t>(
73+
extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH256<uint16_t>(
8174
const uint8_t* in, uint16_t* out, const UnpackOptions& opts);
8275

83-
extern template ARROW_TEMPLATE_EXPORT void unpack_avx2<uint32_t>(
76+
extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH256<uint32_t>(
8477
const uint8_t* in, uint32_t* out, const UnpackOptions& opts);
8578

86-
extern template ARROW_TEMPLATE_EXPORT void unpack_avx2<uint64_t>(
79+
extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH256<uint64_t>(
8780
const uint8_t* in, uint64_t* out, const UnpackOptions& opts);
8881

89-
#endif
82+
#endif // UNPACK_ARCH256
83+
#undef UNPACK_ARCH256
9084

9185
#if defined(ARROW_HAVE_AVX512) || defined(ARROW_HAVE_RUNTIME_AVX512)
9286

cpp/src/arrow/util/bpacking_test.cc

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
#include "arrow/util/bpacking_scalar_internal.h"
2828
#include "arrow/util/bpacking_simd_internal.h"
2929

30-
#if defined(ARROW_HAVE_RUNTIME_AVX2)
30+
#if defined(ARROW_HAVE_RUNTIME_AVX2) || defined(ARROW_HAVE_RUNTIME_SVE256)
3131
# include "arrow/util/cpu_info.h"
3232
#endif
3333

@@ -349,6 +349,39 @@ TEST_P(TestUnpack, Unpack32Neon) { this->TestAll(&bpacking::unpack_neon<uint32_t
349349
TEST_P(TestUnpack, Unpack64Neon) { this->TestAll(&bpacking::unpack_neon<uint64_t>); }
350350
#endif
351351

352+
#if defined(ARROW_HAVE_RUNTIME_SVE256)
353+
TEST_P(TestUnpack, UnpackBoolSve256) {
354+
if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE256)) {
355+
GTEST_SKIP() << "Test requires SVE256";
356+
}
357+
this->TestAll(&bpacking::unpack_sve256<bool>);
358+
}
359+
TEST_P(TestUnpack, Unpack8Sve256) {
360+
if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE256)) {
361+
GTEST_SKIP() << "Test requires SVE256";
362+
}
363+
this->TestAll(&bpacking::unpack_sve256<uint8_t>);
364+
}
365+
TEST_P(TestUnpack, Unpack16Sve256) {
366+
if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE256)) {
367+
GTEST_SKIP() << "Test requires SVE256";
368+
}
369+
this->TestAll(&bpacking::unpack_sve256<uint16_t>);
370+
}
371+
TEST_P(TestUnpack, Unpack32Sve256) {
372+
if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE256)) {
373+
GTEST_SKIP() << "Test requires SVE256";
374+
}
375+
this->TestAll(&bpacking::unpack_sve256<uint32_t>);
376+
}
377+
TEST_P(TestUnpack, Unpack64Sve256) {
378+
if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE256)) {
379+
GTEST_SKIP() << "Test requires SVE256";
380+
}
381+
this->TestAll(&bpacking::unpack_sve256<uint64_t>);
382+
}
383+
#endif
384+
352385
TEST_P(TestUnpack, UnpackBool) { this->TestAll(&unpack<bool>); }
353386
TEST_P(TestUnpack, Unpack8) { this->TestAll(&unpack<uint8_t>); }
354387
TEST_P(TestUnpack, Unpack16) { this->TestAll(&unpack<uint16_t>); }

0 commit comments

Comments
 (0)