diff --git a/hwy/contrib/algo/minmax-inl.h b/hwy/contrib/algo/minmax-inl.h index 6020d16d2b..fc434cd24c 100644 --- a/hwy/contrib/algo/minmax-inl.h +++ b/hwy/contrib/algo/minmax-inl.h @@ -14,6 +14,8 @@ // limitations under the License. // Per-target include guard +#include +#include #if defined(HIGHWAY_HWY_CONTRIB_ALGO_MINMAX_INL_H_) == \ defined(HWY_TARGET_TOGGLE) // NOLINT #ifdef HIGHWAY_HWY_CONTRIB_ALGO_MINMAX_INL_H_ @@ -28,7 +30,8 @@ HWY_BEFORE_NAMESPACE(); namespace hwy { namespace HWY_NAMESPACE { -// Returns the minimum value in `in[0, count)` or PositiveInfOrHighestValue() if count == 0. +// Returns the minimum value in `in[0, count)` or +// PositiveInfOrHighestValue() if count == 0. template > T MinValue(D d, const T* HWY_RESTRICT in, size_t count) { const size_t N = Lanes(d); @@ -61,7 +64,8 @@ T MinValue(D d, const T* HWY_RESTRICT in, size_t count) { return ReduceMin(d, acc0); } -// Returns the maximum value in `in[0, count)` or NegativeInfOrLowestValue() if count == 0. +// Returns the maximum value in `in[0, count)` or +// NegativeInfOrLowestValue() if count == 0. template > T MaxValue(D d, const T* HWY_RESTRICT in, size_t count) { const size_t N = Lanes(d); @@ -94,6 +98,168 @@ T MaxValue(D d, const T* HWY_RESTRICT in, size_t count) { return ReduceMax(d, acc0); } +template , + class DF2 = CappedTag, class VF2 = Vec, + class VF = Vec, class HRed, class VRed> +static HWY_INLINE std::pair Reduce2X(DF df, VF x_0, VF x_1, HRed h_red, + VRed v_red) { + const DF2 df2; + if constexpr (MaxLanes(df) < 2) { + return {h_red(df, x_0), h_red(df, x_1)}; + } else { + constexpr size_t kMaxLanes = MaxLanes(df); + HWY_LANES_CONSTEXPR size_t kLanes = Lanes(df); + HWY_ALIGN T x_transposed[2 * kMaxLanes]; + StoreInterleaved2(x_0, x_1, df, x_transposed); + VF x01 = v_red(Load(df, x_transposed), Load(df, x_transposed + kLanes)); + Store(x01, df, x_transposed); + + VF2 result = Load(df2, x_transposed); + for (size_t i = 1; i < kLanes / 2; ++i) { + result = v_red(result, Load(df2, x_transposed + i * 2)); + } + HWY_ALIGN T temp[2]; + Store(result, df2, temp); + return {temp[0], temp[1]}; + } +} + +template , + class DF4 = CappedTag, class VF4 = Vec, + class VF = Vec, class HRed, class VRed> +static HWY_INLINE VF4 Reduce4X(DF df, VF x_0, VF x_1, VF x_2, VF x_3, + HRed h_red, VRed v_red) { + const DF4 df4; + if constexpr ((HWY_TARGET & (HWY_ALL_NEON | HWY_ALL_SVE | HWY_RVV)) != 0 || + MaxLanes(df) < 4) { + HWY_ALIGN T temp[4] = {h_red(df, x_0), h_red(df, x_1), h_red(df, x_2), + h_red(df, x_3)}; + return Load(df4, temp); + } else { + constexpr size_t kMaxLanes = MaxLanes(df); + HWY_LANES_CONSTEXPR size_t kLanes = Lanes(df); + HWY_ALIGN T x_transposed[4 * kMaxLanes]; + StoreInterleaved4(x_0, x_1, x_2, x_3, df, x_transposed); + VF x01 = v_red(Load(df, x_transposed), Load(df, x_transposed + kLanes)); + VF x23 = v_red(Load(df, x_transposed + 2 * kLanes), + Load(df, x_transposed + 3 * kLanes)); + VF x0123 = v_red(x01, x23); + Store(x0123, df, x_transposed); + + VF4 result = Load(df4, x_transposed); + for (size_t i = 1; i < kLanes / 4; ++i) { + result = v_red(result, Load(df4, x_transposed + i * 4)); + } + return result; + } +} + +template , + class DF8 = CappedTag, class VF8 = Vec, + class VF = Vec, class HRed, class VRed> +static HWY_INLINE VF8 Reduce8X(DF df, VF x_0, VF x_1, VF x_2, VF x_3, VF x_4, + VF x_5, VF x_6, VF x_7, HRed h_red, VRed v_red) { + const DF8 df8; + if constexpr ((HWY_TARGET & (HWY_ALL_NEON | HWY_ALL_SVE | HWY_RVV)) != 0 || + MaxLanes(df) < 8) { + HWY_ALIGN T temp[8] = {h_red(df, x_0), h_red(df, x_1), h_red(df, x_2), + h_red(df, x_3), h_red(df, x_4), h_red(df, x_5), + h_red(df, x_6), h_red(df, x_7)}; + return Load(df8, temp); + } else { + auto res0123 = Reduce4X(df, x_0, x_1, x_2, x_3, h_red, v_red); + auto res4567 = Reduce4X(df, x_4, x_5, x_6, x_7, h_red, v_red); + + return Combine(df8, res4567, res0123); + } +} + + + +template , class VF = Vec> +static HWY_INLINE std::pair Reduce2Min(DF df, VF x_0, VF x_1) { + return Reduce2X( + df, x_0, x_1, [](auto d, auto v) HWY_ATTR { return ReduceMin(d, v); }, + [](auto a, auto b) HWY_ATTR { return Min(a, b); }); +} + +template , class VF = Vec> +static HWY_INLINE std::pair Reduce2Max(DF df, VF x_0, VF x_1) { + return Reduce2X( + df, x_0, x_1, [](auto d, auto v) HWY_ATTR { return ReduceMax(d, v); }, + [](auto a, auto b) HWY_ATTR { return Max(a, b); }); +} + +template , class VF = Vec> +static HWY_INLINE std::pair Reduce2Sum(DF df, VF x_0, VF x_1) { + return Reduce2X( + df, x_0, x_1, [](auto d, auto v) HWY_ATTR { return ReduceSum(d, v); }, + [](auto a, auto b) HWY_ATTR { return Add(a, b); }); +} + +template , + class DF4 = CappedTag, class VF4 = Vec, + class VF = Vec> +static HWY_INLINE VF4 Reduce4Min(DF df, VF x_0, VF x_1, VF x_2, VF x_3) { + return Reduce4X( + df, x_0, x_1, x_2, x_3, + [](auto d, auto v) HWY_ATTR { return ReduceMin(d, v); }, + [](auto a, auto b) HWY_ATTR { return Min(a, b); }); +} + +template , + class DF4 = CappedTag, class VF4 = Vec, + class VF = Vec> +static HWY_INLINE VF4 Reduce4Max(DF df, VF x_0, VF x_1, VF x_2, VF x_3) { + return Reduce4X( + df, x_0, x_1, x_2, x_3, + [](auto d, auto v) HWY_ATTR { return ReduceMax(d, v); }, + [](auto a, auto b) HWY_ATTR { return Max(a, b); }); +} + +template , + class DF4 = CappedTag, class VF4 = Vec, + class VF = Vec> +static HWY_INLINE VF4 Reduce4Sum(DF df, VF x_0, VF x_1, VF x_2, VF x_3) { + return Reduce4X( + df, x_0, x_1, x_2, x_3, + [](auto d, auto v) HWY_ATTR { return ReduceSum(d, v); }, + [](auto a, auto b) HWY_ATTR { return Add(a, b); }); +} + +template , + class DF8 = CappedTag, class VF8 = Vec, + class VF = Vec> +static HWY_INLINE VF8 Reduce8Min(DF df, VF x_0, VF x_1, VF x_2, VF x_3, VF x_4, + VF x_5, VF x_6, VF x_7) { + return Reduce8X( + df, x_0, x_1, x_2, x_3, x_4, x_5, x_6, x_7, + [](auto d, auto v) HWY_ATTR { return ReduceMin(d, v); }, + [](auto a, auto b) HWY_ATTR { return Min(a, b); }); +} + +template , + class DF8 = CappedTag, class VF8 = Vec, + class VF = Vec> +static HWY_INLINE VF8 Reduce8Max(DF df, VF x_0, VF x_1, VF x_2, VF x_3, VF x_4, + VF x_5, VF x_6, VF x_7) { + return Reduce8X( + df, x_0, x_1, x_2, x_3, x_4, x_5, x_6, x_7, + [](auto d, auto v) HWY_ATTR { return ReduceMax(d, v); }, + [](auto a, auto b) HWY_ATTR { return Max(a, b); }); +} + +template , + class DF8 = CappedTag, class VF8 = Vec, + class VF = Vec> +static HWY_INLINE VF8 Reduce8Sum(DF df, VF x_0, VF x_1, VF x_2, VF x_3, VF x_4, + VF x_5, VF x_6, VF x_7) { + return Reduce8X( + df, x_0, x_1, x_2, x_3, x_4, x_5, x_6, x_7, + [](auto d, auto v) HWY_ATTR { return ReduceSum(d, v); }, + [](auto a, auto b) HWY_ATTR { return Add(a, b); }); +} + // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE } // namespace hwy diff --git a/hwy/contrib/algo/minmax_value_test.cc b/hwy/contrib/algo/minmax_value_test.cc index 3c796a8e8e..2f10e8ce42 100644 --- a/hwy/contrib/algo/minmax_value_test.cc +++ b/hwy/contrib/algo/minmax_value_test.cc @@ -146,6 +146,101 @@ void TestAllMaxValue() { ForAllTypes(ForPartialVectors>()); } +struct TestReduce { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) const { + const size_t N = Lanes(d); + AlignedFreeUniquePtr storage = AllocateAligned(8 * N); + HWY_ASSERT(storage); + T* in = storage.get(); + for (size_t i = 0; i < 8 * N; ++i) { + in[i] = static_cast((i % 13) + 1); + } + + auto x0 = Load(d, in + 0 * N); + auto x1 = Load(d, in + 1 * N); + auto x2 = Load(d, in + 2 * N); + auto x3 = Load(d, in + 3 * N); + auto x4 = Load(d, in + 4 * N); + auto x5 = Load(d, in + 5 * N); + auto x6 = Load(d, in + 6 * N); + auto x7 = Load(d, in + 7 * N); + + T min_s[8]; + T max_s[8]; + T sum_s[8]; + for (int v = 0; v < 8; ++v) { + min_s[v] = hwy::PositiveInfOrHighestValue(); + max_s[v] = hwy::NegativeInfOrLowestValue(); + sum_s[v] = 0; + for (size_t i = 0; i < N; ++i) { + T val = in[v * N + i]; + if (val < min_s[v]) min_s[v] = val; + if (val > max_s[v]) max_s[v] = val; + sum_s[v] = AddWithWraparound(sum_s[v], val); + } + } + + using D2 = CappedTag; + const D2 d2; + if (Lanes(d2) == 2) { + HWY_ALIGN T actual_min[2]; + HWY_ALIGN T actual_max[2]; + HWY_ALIGN T actual_sum[2]; + auto p_min = Reduce2Min(d, x0, x1); + actual_min[0] = p_min.first; + actual_min[1] = p_min.second; + auto p_max = Reduce2Max(d, x0, x1); + actual_max[0] = p_max.first; + actual_max[1] = p_max.second; + auto p_sum = Reduce2Sum(d, x0, x1); + actual_sum[0] = p_sum.first; + actual_sum[1] = p_sum.second; + for (int i = 0; i < 2; ++i) { + HWY_ASSERT_EQ(min_s[i], actual_min[i]); + HWY_ASSERT_EQ(max_s[i], actual_max[i]); + HWY_ASSERT_EQ(sum_s[i], actual_sum[i]); + } + } + + using D4 = CappedTag; + const D4 d4; + if (Lanes(d4) == 4) { + HWY_ALIGN T actual_min[4]; + HWY_ALIGN T actual_max[4]; + HWY_ALIGN T actual_sum[4]; + Store(Reduce4Min(d, x0, x1, x2, x3), d4, actual_min); + Store(Reduce4Max(d, x0, x1, x2, x3), d4, actual_max); + Store(Reduce4Sum(d, x0, x1, x2, x3), d4, actual_sum); + for (int i = 0; i < 4; ++i) { + HWY_ASSERT_EQ(min_s[i], actual_min[i]); + HWY_ASSERT_EQ(max_s[i], actual_max[i]); + HWY_ASSERT_EQ(sum_s[i], actual_sum[i]); + } + } + + using D8 = CappedTag; + const D8 d8; + if (Lanes(d8) == 8) { + HWY_ALIGN T actual_min[8]; + HWY_ALIGN T actual_max[8]; + HWY_ALIGN T actual_sum[8]; + Store(Reduce8Min(d, x0, x1, x2, x3, x4, x5, x6, x7), d8, actual_min); + Store(Reduce8Max(d, x0, x1, x2, x3, x4, x5, x6, x7), d8, actual_max); + Store(Reduce8Sum(d, x0, x1, x2, x3, x4, x5, x6, x7), d8, actual_sum); + for (int i = 0; i < 8; ++i) { + HWY_ASSERT_EQ(min_s[i], actual_min[i]); + HWY_ASSERT_EQ(max_s[i], actual_max[i]); + HWY_ASSERT_EQ(sum_s[i], actual_sum[i]); + } + } + } +}; + +void TestAllReduce() { + ForAllTypes(ForPartialVectors()); +} + } // namespace // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE @@ -158,6 +253,7 @@ namespace { HWY_BEFORE_TEST(MinMaxTest); HWY_EXPORT_AND_TEST_P(MinMaxTest, TestAllMinValue); HWY_EXPORT_AND_TEST_P(MinMaxTest, TestAllMaxValue); +HWY_EXPORT_AND_TEST_P(MinMaxTest, TestAllReduce); HWY_AFTER_TEST(); } // namespace } // namespace hwy