From 06a765cc72651997553500247079bb237e6fb2bf Mon Sep 17 00:00:00 2001 From: Highway Date: Wed, 10 Jun 2026 11:45:29 -0700 Subject: [PATCH] N/A PiperOrigin-RevId: 929984949 --- BUILD | 67 ++- hwy/cache_control.h | 38 +- hwy/contrib/pipeline/prefetch_args.h | 114 +++++ hwy/contrib/pipeline/prefetch_pipeline.h | 393 ++++++++++++++++++ hwy/contrib/pipeline/prefetch_pipeline_2d.h | 141 +++++++ .../pipeline/prefetch_pipeline_test.cc | 248 +++++++++++ hwy_tests.bzl | 7 + 7 files changed, 985 insertions(+), 23 deletions(-) create mode 100644 hwy/contrib/pipeline/prefetch_args.h create mode 100644 hwy/contrib/pipeline/prefetch_pipeline.h create mode 100644 hwy/contrib/pipeline/prefetch_pipeline_2d.h create mode 100644 hwy/contrib/pipeline/prefetch_pipeline_test.cc diff --git a/BUILD b/BUILD index 765546b987..c552a373f1 100644 --- a/BUILD +++ b/BUILD @@ -4,6 +4,7 @@ load("@rules_cc//cc:cc_test.bzl", "cc_test") load("@bazel_skylib//lib:selects.bzl", "selects") load("//:hwy_tests.bzl", "HWY_TESTS") load("@rules_license//rules:license.bzl", "license") +load("//tools/build_defs/testing:bzl_library.bzl", "bzl_library") package( default_applicable_licenses = [":license"], @@ -118,6 +119,27 @@ COPTS = select({ ], }) +HWY_TEST_COPTS = select({ + ":compiler_msvc": [], + "//conditions:default": [ + # gTest triggers this warning (which is enabled by the + # extra-semi in COPTS), so we need to disable it here, + # but it's still enabled for :hwy. + "-Wno-c++98-compat-extra-semi", + ], +}) + +# Common to all tests. 
+HWY_TEST_DEPS = [ + ":hwy_test_util", + ":hwy", + ":nanobenchmark", + ":timer", +] + select({ + ":compiler_msvc": [], + "//conditions:default": ["@com_google_googletest//:gtest_main"], +}) + DEFINES = select({ ":compiler_msvc": ["HWY_SHARED_DEFINE"], ":compiler_clangcl": ["HWY_SHARED_DEFINE"], @@ -606,6 +628,23 @@ cc_library( ], ) +# copybara:strip_begin(internal) +cc_library( + name = "prefetch_pipeline", + hdrs = [ + "hwy/contrib/pipeline/prefetch_args.h", + "hwy/contrib/pipeline/prefetch_pipeline.h", + "hwy/contrib/pipeline/prefetch_pipeline_2d.h", + ], + compatible_with = [], + copts = COPTS, + deps = [ + ":hwy", + ":timer", + ], +) +# copybara:strip_end + cc_test( name = "list_targets", size = "small", @@ -627,27 +666,6 @@ cc_test( ], ) -HWY_TEST_COPTS = select({ - ":compiler_msvc": [], - "//conditions:default": [ - # gTest triggers this warning (which is enabled by the - # extra-semi in COPTS), so we need to disable it here, - # but it's still enabled for :hwy. - "-Wno-c++98-compat-extra-semi", - ], -}) - -# Common to all tests. -HWY_TEST_DEPS = [ - ":hwy_test_util", - ":hwy", - ":nanobenchmark", - ":timer", -] + select({ - ":compiler_msvc": [], - "//conditions:default": ["@com_google_googletest//:gtest_main"], -}) - [ [ cc_test( @@ -713,3 +731,10 @@ test_suite( name = "hwy_ops_tests", tags = ["hwy_ops_test"], ) + +bzl_library( + name = "hwy_tests_bzl", + srcs = ["hwy_tests.bzl"], + parse_tests = False, + visibility = ["//visibility:private"], +) diff --git a/hwy/cache_control.h b/hwy/cache_control.h index 90743cd3f2..0df23e17cc 100644 --- a/hwy/cache_control.h +++ b/hwy/cache_control.h @@ -92,8 +92,10 @@ HWY_INLINE HWY_ATTR_CACHE void FlushStream() { #endif } -// Optionally begins loading the cache line containing "p" to reduce latency of -// subsequent actual loads. +// Optionally begins loading the cache line containing "p" into all cache +// levels, including L1, to reduce latency of subsequent actual loads. 
This +// corresponds to the T0 temporal locality hint on x86, which is ideal when data +// is about to be directly consumed. template HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) { (void)p; @@ -109,6 +111,38 @@ HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) { #endif // HWY_DISABLE_CACHE_CONTROL } +// Begins loading the cache line containing "p" into the L1 cache only, passing +// a Non-Temporal Access (NTA) hint. This minimizes pollution of outer memory +// caches (L2/L3) and is ideal for data accessed exactly once. +template +HWY_INLINE HWY_ATTR_CACHE void ShallowPrefetch(const T* p) { + (void)p; +#ifndef HWY_DISABLE_CACHE_CONTROL +#if HWY_ARCH_X86 && !(HWY_COMPILER_CLANGCL && !defined(__MMX__)) + _mm_prefetch(reinterpret_cast(p), _MM_HINT_NTA); +#elif HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL // includes clang + // Locality 0 tells the compiler the data has no temporal locality. + __builtin_prefetch(p, /*write=*/0, /*hint=*/0); +#endif +#endif // HWY_DISABLE_CACHE_CONTROL +} + +// Attempts to stage the cache line containing "p" into the L3/L2 outer caches +// without aggressively staging it immediately into the L1. This restricts L1 +// and LFB thrashing on architectures like Intel when hiding massive DRAM delay. +template +HWY_INLINE HWY_ATTR_CACHE void DeepPrefetch(const T* p) { + (void)p; +#ifndef HWY_DISABLE_CACHE_CONTROL +#if HWY_ARCH_X86 && !(HWY_COMPILER_CLANGCL && !defined(__MMX__)) + _mm_prefetch(reinterpret_cast(p), _MM_HINT_T2); +#elif HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL // includes clang + // Locality 1 requests a low degree of temporal locality (outer caches). + __builtin_prefetch(p, /*write=*/0, /*hint=*/1); +#endif +#endif // HWY_DISABLE_CACHE_CONTROL +} + // Invalidates and flushes the cache line containing "p", if possible.
HWY_INLINE HWY_ATTR_CACHE void FlushCacheline(const void* p) { #if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) diff --git a/hwy/contrib/pipeline/prefetch_args.h b/hwy/contrib/pipeline/prefetch_args.h new file mode 100644 index 0000000000..650f072c3f --- /dev/null +++ b/hwy/contrib/pipeline/prefetch_args.h @@ -0,0 +1,114 @@ +// Copyright 2026 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAY_HWY_CONTRIB_PIPELINE_PREFETCH_ARGS_H_ +#define HIGHWAY_HWY_CONTRIB_PIPELINE_PREFETCH_ARGS_H_ + +#include +#include + +namespace hwy { + +struct PrefetchArgs { + // The iteration distance (in loop iterations) to look ahead for the deep L3 + // prefetch. If 0, no deep prefetch will be issued. + size_t deep_lookahead = 32; + + // The iteration distance (in loop iterations) to look ahead for the shallow + // L1 prefetch. If 0, no shallow prefetch will be issued. + size_t shallow_lookahead = 4; + + // ------------------------------------------------------------------------- + // Telemetry Conduit + // ------------------------------------------------------------------------- + // A generic callback executed upon pipeline completion to report performance + // metrics (typically elapsed time). Its signature avoids std::function to + // maintain zero-overhead C-style linkage and ensure the struct remains + // trivially copyable without heap allocations. 
+ // user_data: Custom context pointer returned to the callback. + // elapsed_ticks: Raw cycle ticks taken to evaluate the pipeline loop. + void (*metric_collector_cb)(void* user_data, + uint64_t elapsed_ticks) = nullptr; + void* user_data = nullptr; + + // ------------------------------------------------------------------------- + // Safe Default Factories + // ------------------------------------------------------------------------- + // Tuning memory prefetching is notoriously difficult because lookahead bounds + // change dramatically depending on the spatial distribution of the workload. + + // Random Access / Scatter-Gather (e.g. Hash Table Probing, Graph Walks) + // + // Random array accesses constantly suffer TLB (Translation Lookaside Buffer) + // misses, resulting in massive Page Walk delays. To absorb these colossal + // ~300-cycle stalls natively inside the L3 queue, the deep lookahead must + // aggressively stretch out by large margins (e.g. 32-48 iterations). + static constexpr PrefetchArgs DefaultRandom() { +#if HWY_ARCH_ARM_A64 + return PrefetchArgs{.deep_lookahead = 64, .shallow_lookahead = 8}; +#else + return PrefetchArgs{.deep_lookahead = 32, .shallow_lookahead = 4}; +#endif + } + + // Sequential Scans / Linear Memory (e.g. Matrix Vector, Filter Scans) + // + // Linear accesses benefit intimately from native CPU stream-trackers (which + // already mask bulk DRAM latency). Here, a heavy L3 lookahead is + // counter-productive; it merely crowds the queue. Instead, we tighten the + // lookaheads down to safely bridge the narrower L3 -> L1 latency gap + // (~40 cycles) without overflowing LFBs during heavy SIMD evaluation. 
+ static constexpr PrefetchArgs DefaultSequential() { +#if HWY_ARCH_ARM_A64 + return PrefetchArgs{.deep_lookahead = 32, .shallow_lookahead = 4}; +#else + return PrefetchArgs{.deep_lookahead = 8, .shallow_lookahead = 2}; +#endif + } +}; + +// --------------------------------------------------------------------------- +// 2D-Tiled Prefetch Policy +// --------------------------------------------------------------------------- +// Wraps the base PrefetchArgs and adds the standard 2D tiling block sizes. +struct Prefetch2DArgs { + PrefetchArgs prefetch; + size_t outer_block = 128; + size_t inner_block = 256; + + // ------------------------------------------------------------------------- + // Safe Default Factories + // ------------------------------------------------------------------------- + + static constexpr Prefetch2DArgs DefaultRandom() { + Prefetch2DArgs args; + args.prefetch = PrefetchArgs::DefaultRandom(); + args.outer_block = 128; + args.inner_block = 256; + return args; + } + + static constexpr Prefetch2DArgs DefaultSequential() { + Prefetch2DArgs args; + args.prefetch = PrefetchArgs::DefaultSequential(); + args.outer_block = 256; + args.inner_block = 512; + return args; + } +}; + +} // namespace hwy + +#endif // HIGHWAY_HWY_CONTRIB_PIPELINE_PREFETCH_ARGS_H_ diff --git a/hwy/contrib/pipeline/prefetch_pipeline.h b/hwy/contrib/pipeline/prefetch_pipeline.h new file mode 100644 index 0000000000..b7342e4b97 --- /dev/null +++ b/hwy/contrib/pipeline/prefetch_pipeline.h @@ -0,0 +1,393 @@ +// Copyright 2026 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAY_HWY_CONTRIB_PIPELINE_PREFETCH_PIPELINE_H_ +#define HIGHWAY_HWY_CONTRIB_PIPELINE_PREFETCH_PIPELINE_H_ + +#include + +#include "hwy/base.h" +#include "hwy/cache_control.h" +#include "hwy/contrib/pipeline/prefetch_args.h" +#include "hwy/timer.h" + +namespace hwy { + +// --------------------------------------------------------------------------- +// PrefetchStrategy +// --------------------------------------------------------------------------- +// Enumerates the structural looping algorithm that the pipeline will compile +// down into. Used to decouple hardware limits from loop execution mechanics. +enum class PrefetchStrategy { + kNoPrefetch, + kDeepLookaheadOnly, + kShallowLookaheadOnly, + kMiniBatchDeep, + kMiniBatchShallow, + kDualTier +}; + +// --------------------------------------------------------------------------- +// PrefetchLimits +// --------------------------------------------------------------------------- +// Provides hardware limitation policies for the pipelining loop to optimize +// execution. A default policy is provided via `DefaultPrefetchLimits`, but +// users can construct their own via template specializations. +struct DefaultPrefetchLimits { + // The maximum number of explicit cachelines that should be prefetched per + // iteration (i.e., cachelines containing the data to be used by the upcoming + // compute task). 
+ static constexpr size_t kMaxCachelinesPerIter = 4; + + // ------------------------------------------------------------------------- + // Hardware Architecture Matrix + // ------------------------------------------------------------------------- + // Resolved automatically within the struct using HWY_ARCH_* + +#if HWY_ARCH_ARM || HWY_ARCH_ARM_A64 + // ARM platforms vary by their natures, we use 24 here to be conservative. + static constexpr size_t kNumMSHRs = 24; + +#elif HWY_ARCH_X86 + // On x86, we cannot differentiate Intel vs AMD reliably at compile time. + // Because hitting the edge of the Intel LFB pool (10-12) causes a hard CPU + // stall, we MUST gracefully pick the most restrictive denominator (12) to + // ensure safety. AMD Zen architectures uniquely "absorb" the excess prefetch + // instructions into their 124 MABs, suffering zero penalty from this wrapper. + static constexpr size_t kNumMSHRs = 12; + +#else + // Safe, generic bounds. + static constexpr size_t kNumMSHRs = 12; +#endif + + // The specific execution strategy the pipeline will adopt. + static constexpr PrefetchStrategy kStrategy = PrefetchStrategy::kDualTier; +}; + +// --------------------------------------------------------------------------- +// PrefetchCachelines +// --------------------------------------------------------------------------- +// A lightweight, stack-allocated, fixed-capacity container for collecting +// discrete memory addresses to be prefetched. +// By strictly accumulating memory pointers individually, it allows precise +// control over the Line Fill Buffer (LFB) utilization in the pipelining loop. +template +struct PrefetchCachelines { + // Array of explicit memory addresses to prefetch. + const void* ptrs[kMaxCachelinesPerIter]; + + // The number of valid pointers currently registered in the array. + size_t count = 0; + + // Registers a discrete memory address to be prefetched. + // The user should supply the base pointer of the data they intend to access. 
+ HWY_INLINE void Add(const void* ptr) { + HWY_DASSERT(count < kMaxCachelinesPerIter); + ptrs[count] = ptr; + ++count; + } +}; + +#if defined(__cpp_concepts) && __cpp_concepts >= 201907L +// --------------------------------------------------------------------------- +// Cachelines Provider Concept +// --------------------------------------------------------------------------- +// To use PrefetchPipelineLoop, the user must provide a callable that adheres +// to the following signature: +// +// template +// void operator()(size_t i, PrefetchCachelines& +// cachelines) const; +// +// Parameters: +// - i: The current sequence index evaluated by the loop. +// This is guaranteed to be in the range `[start, end)`. +// - cachelines: Output collection. Call `cachelines.Add(ptr)` to +// register cachelines. For invalid or conditional +// indices, simply do not add anything. +// +// Execution constraints: +// - Depending on the architecture, prefetching optimizations, or runtime +// auto-tuning limits (see Policy configurations below), there is no +// hardware or programmatic guarantee how many times this function will be +// called for a given `i` (including zero times if prefetches are stripped +// via `constexpr`). Therefore, this callable MUST be completely pure and +// strictly side-effect free! +// - Missing or zero-length entries are natively skipped by the pipeline +// without incurring branching overhead. 
+template +concept PrefetchPipelineCachelineProvider = + requires(const T& provider, size_t i, + PrefetchCachelines& cachelines) { + { provider(i, cachelines) }; + }; + +// --------------------------------------------------------------------------- +// Pipeline Task Concept +// --------------------------------------------------------------------------- +// To use PrefetchPipelineLoop, the user must provide a callable that adheres +// to the following signature: +// +// void operator()(size_t i) const; +// +// Parameters: +// - i: The current sequence index being evaluated by the pipeline. +// +// Execution constraints: +// - Guaranteed to be called exactly once for each index `i` in the range +// `[start, end)`. Furthermore, it is guaranteed to be invoked purely +// sequentially (i.e. `i`, `i+1`, `i+2`), preserving any cross-iteration +// dependencies or internal accumulator state. +template +concept PrefetchPipelineTask = requires(const T& task, size_t i) { + { task(i) }; +}; +#endif + +// PrefetchPipelineLoop +// --------------------------------------------------------------------------- +// Design Philosophy: +// While this helper aims to deeply accelerate predictable memory-bound loops, +// its core tenet is "do no harm in the worst case". By carefully staging +// memory into L3 before migrating it to L1, and rigorously capping the active +// footprint against hardware Line Fill Buffer (LFB) limits, it ensures that +// memory bandwidth is maximized without accidentally thrashing the cache or +// stalling the processor pipelines. +// +// Evaluates a pipelined loop over the range [start, end) with two stages of +// rolling prefetch lookahead: a deep prefetch (e.g. L3) and a shallow prefetch +// (e.g. L1). This correctly stages data transitions from main memory -> L3 -> +// L1 to maximize memory bandwidth and keep CPU Line Fill Buffers from stalling. +// +// Policy: A struct dictating cache limits and loop constants. Defaults to +// `DefaultPrefetchLimits`. 
+// CachelinesProvider: A callable type matching the signature documented +// in `PrefetchPipelineCachelineProvider` above. It +// resolves cacheline pointers for a given index `i`. +// TaskFn: A callable type matching the signature documented in +// `PrefetchPipelineTask` above. It represents the core evaluation +// logic for index `i`. +// args: A `PrefetchArgs` configuration that controls the deep (L3) and +// shallow (L1) lookahead pipeline distances. Passing a reasonable +// value is crucial for optimal hardware performance. Ideally, use an +// auto-tuned configuration or an explicit architecture preset. +template +#if defined(__cpp_concepts) && __cpp_concepts >= 201907L + requires PrefetchPipelineTask && + PrefetchPipelineCachelineProvider +#endif +HWY_INLINE void PrefetchPipelineLoop(size_t start, size_t end, + const CachelinesProvider& get_cachelines, + const TaskFn& task, + const PrefetchArgs& args) { + const uint64_t t0 = + args.metric_collector_cb != nullptr ? hwy::timer::Start() : 0; + // Gracefully degrade inverted configurations if AutoTune generates them. + size_t actual_shallow = args.shallow_lookahead; + size_t actual_deep = args.deep_lookahead; + HWY_DASSERT(actual_deep == 0 || actual_deep > actual_shallow); + if (actual_shallow >= actual_deep) { + actual_deep = 0; + } + + const size_t initial_shallow_prefetch_end = + HWY_MIN(start + actual_shallow, end); + const size_t initial_deep_prefetch_end = HWY_MIN(start + actual_deep, end); + + // Reusable cache injection loops: + auto execute_deep_prefetch = + [&](const PrefetchCachelines& cachelines) + HWY_ATTR_CACHE { + if (actual_deep == 0) return; + for (size_t r = 0; r < cachelines.count; ++r) { + DeepPrefetchFn(cachelines.ptrs[r]); + } + }; + auto execute_shallow_prefetch = + [&](const PrefetchCachelines& cachelines) + HWY_ATTR_CACHE { +#if HWY_IS_DEBUG_BUILD + // Safeguard: The active L1 hardware queue footprint should not + // exceed the physical Miss Status Holding Registers (MSHRs). + // (e.g. 
10-12 Line Fill Buffers on legacy Intel architectures). + // Exceeding this generates catastrophic silent memory stalls. + HWY_DASSERT(cachelines.count * actual_shallow <= Limits::kNumMSHRs); +#endif + for (size_t r = 0; r < cachelines.count; ++r) { + ShallowPrefetchFn(cachelines.ptrs[r]); + } + }; + + // Hoisted state variables to avoid tight-loop reallocation thrashing: + PrefetchCachelines cachelines; + + // ------------------------------------------------------------------------ + // Branchless loop execution via compile-time strategy + // ------------------------------------------------------------------------ + + // NOTE: Use a strict `if constexpr ... else if constexpr ... else` chain to + // discard not matched branches at compile time. + if constexpr (Limits::kStrategy == PrefetchStrategy::kNoPrefetch) { + for (size_t i = start; i < end; ++i) { + task(i); + } + } else if constexpr (Limits::kStrategy == + PrefetchStrategy::kShallowLookaheadOnly) { + const size_t limit_shallow = + (end > start + actual_shallow) ? end - actual_shallow : start; + // Startup prefetching + for (size_t i = start; i < initial_shallow_prefetch_end; ++i) { + cachelines.count = 0; + get_cachelines(i, cachelines); + execute_shallow_prefetch(cachelines); + } + // Main sliding loop + for (size_t i = start; i < limit_shallow; ++i) { + cachelines.count = 0; + get_cachelines(i + actual_shallow, cachelines); + execute_shallow_prefetch(cachelines); + task(i); + } + // Task drain + for (size_t i = limit_shallow; i < end; ++i) { + task(i); + } + } else if constexpr (Limits::kStrategy == + PrefetchStrategy::kDeepLookaheadOnly) { + const size_t limit_deep = + (end > start + actual_deep) ? 
end - actual_deep : start; + // Startup prefetching + for (size_t i = start; i < initial_deep_prefetch_end; ++i) { + cachelines.count = 0; + get_cachelines(i, cachelines); + execute_deep_prefetch(cachelines); + } + // Main sliding loop + for (size_t i = start; i < limit_deep; ++i) { + cachelines.count = 0; + get_cachelines(i + actual_deep, cachelines); + execute_deep_prefetch(cachelines); + task(i); + } + // Task drain + for (size_t i = limit_deep; i < end; ++i) { + task(i); + } + } else if constexpr (Limits::kStrategy == PrefetchStrategy::kMiniBatchDeep || + Limits::kStrategy == + PrefetchStrategy::kMiniBatchShallow) { + const size_t batch_size = + (Limits::kStrategy == PrefetchStrategy::kMiniBatchDeep) + ? actual_deep + : actual_shallow; + + for (size_t b = start; b < end; b += batch_size) { + const size_t b_end = HWY_MIN(b + batch_size, end); + for (size_t p = b; p < b_end; ++p) { + cachelines.count = 0; + get_cachelines(p, cachelines); + if constexpr (Limits::kStrategy == PrefetchStrategy::kMiniBatchDeep) { + execute_deep_prefetch(cachelines); + } else { + execute_shallow_prefetch(cachelines); + } + } + for (size_t p = b; p < b_end; ++p) { + task(p); + } + } + } else { + // Fallback: Default Staggered Pipeline (PrefetchStrategy::kDualTier) + // A meticulously unrolled Dual-Tier pipeline is functionally necessary here + // to stage data into L3 before pulling it into L1, preventing LFB stalls + // on Intel, while providing optimal micro-pipelining on Zen/Maple. 
+ + // Phase 1: Overlapping L1 (shallow) and L3 (deep) startup pipeline horizons + for (size_t i = start; i < initial_shallow_prefetch_end; ++i) { + cachelines.count = 0; + get_cachelines(i, cachelines); + execute_deep_prefetch(cachelines); + execute_shallow_prefetch(cachelines); + } + + // Phase 2: Outstanding L3 (deep) horizons which haven't entered L1 window + for (size_t i = initial_shallow_prefetch_end; i < initial_deep_prefetch_end; + ++i) { + cachelines.count = 0; + get_cachelines(i, cachelines); + execute_deep_prefetch(cachelines); + } + + // Phase 3: Main execution. + // Instead of a single loop with bounds-checking `if` statements, we split + // it into three branchless phases. This avoids branch mispredictions in the + // hot loop and prevents the user-provided `get_cachelines` callable from + // ever receiving an out-of-bounds index (requiring them to defensively + // handle it). + + const size_t limit_deep = (actual_deep == 0 || start + actual_deep >= end) + ? start + : end - actual_deep; + const size_t limit_shallow = + (actual_shallow == 0 || start + actual_shallow >= end) + ? start + : end - actual_shallow; + + // 3a: Both deep and shallow prefetches are within bounds. + for (size_t i = start; i < limit_deep; ++i) { + cachelines.count = 0; + get_cachelines(i + actual_deep, cachelines); + execute_deep_prefetch(cachelines); + + if (actual_shallow > 0) { + cachelines.count = 0; + get_cachelines(i + actual_shallow, cachelines); + execute_shallow_prefetch(cachelines); + } + + task(i); + } + + // 3b: Only shallow prefetch is within bounds. + for (size_t i = limit_deep; i < limit_shallow; ++i) { + cachelines.count = 0; + get_cachelines(i + actual_shallow, cachelines); + execute_shallow_prefetch(cachelines); + + task(i); + } + + // 3c: No prefetches are within bounds, just finish the tasks.
+ const size_t task_drain_start = HWY_MAX(limit_deep, limit_shallow); + for (size_t i = task_drain_start; i < end; ++i) { + task(i); + } + } + + if (HWY_UNLIKELY(args.metric_collector_cb != nullptr)) { + args.metric_collector_cb(args.user_data, hwy::timer::Stop() - t0); + } +} + +} // namespace hwy + +#endif // HIGHWAY_HWY_CONTRIB_PIPELINE_PREFETCH_PIPELINE_H_ diff --git a/hwy/contrib/pipeline/prefetch_pipeline_2d.h b/hwy/contrib/pipeline/prefetch_pipeline_2d.h new file mode 100644 index 0000000000..87b50053d8 --- /dev/null +++ b/hwy/contrib/pipeline/prefetch_pipeline_2d.h @@ -0,0 +1,141 @@ +// Copyright 2026 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef HIGHWAY_HWY_CONTRIB_PIPELINE_PREFETCH_PIPELINE_2D_H_ +#define HIGHWAY_HWY_CONTRIB_PIPELINE_PREFETCH_PIPELINE_2D_H_ + +#include + +#include "hwy/base.h" +#include "hwy/contrib/pipeline/prefetch_args.h" +#include "hwy/contrib/pipeline/prefetch_pipeline.h" +#include "hwy/ops/shared-inl.h" +#include "hwy/timer.h" + +namespace hwy { + +struct Default2DTiledPrefetchLimits : public DefaultPrefetchLimits {}; + +#if defined(__cpp_concepts) && __cpp_concepts >= 201907L +// --------------------------------------------------------------------------- +// 2D-Tiled Pipeline Callbacks Concept +// --------------------------------------------------------------------------- +// To use PrefetchPipeline2DTiledLoop, the user must provide a callbacks object +// that adheres to the following signatures: +// +// // Called before evaluating the inner loop phases for an outer block. +// // Useful for allocating or resetting local accumulators on the stack. +// void OnOuterBlockStart(size_t outer_idx, size_t outer_end); +// +// // Called before processing a new inner segment across all items in the +// // current outer block. Useful for broadcasting data into SIMD registers. +// void PrepareInnerBlock(size_t outer_idx, size_t outer_end, +// size_t inner_idx, size_t inner_end); +// +// // Evaluates the sequence index `outer_i` and adds memory pointers to the +// // `cachelines` collection to be prefetched for the given inner block. +// template +// void GetPrefetchCachelines( +// size_t outer_i, size_t inner_idx, size_t inner_end, +// PrefetchCachelines& cachelines); +// +// // The core loop body to execute for a given sequence index `outer_i` and +// // inner segment. +// void ComputeTask(size_t outer_i, size_t inner_idx, size_t inner_end); +// +// // Called after all inner segments have been processed for the outer block. +// // Useful for committing accumulated data. 
+// void OnOuterBlockFinish(size_t outer_idx, size_t outer_end); +template +concept PrefetchPipeline2DTiledCallbacks = + requires(T& cb, size_t outer_idx, size_t outer_end, size_t inner_idx, + size_t inner_end, PrefetchCachelines& cachelines) { + { cb.OnOuterBlockStart(outer_idx, outer_end) }; + { cb.PrepareInnerBlock(outer_idx, outer_end, inner_idx, inner_end) }; + { cb.GetPrefetchCachelines(outer_idx, inner_idx, inner_end, cachelines) }; + { cb.ComputeTask(outer_idx, inner_idx, inner_end) }; + { cb.OnOuterBlockFinish(outer_idx, outer_end) }; + }; +#endif + +// --------------------------------------------------------------------------- +// PrefetchPipeline2DTiledLoop +// --------------------------------------------------------------------------- +// A generic pipeline for executing 2D block-tiled computations with +// prefetching. This splits processing into localized blocks that prevent cache +// evictions and FPU instruction bottlenecks. +// +// Template Parameters: +// Policy: A struct extending `Default2DTiledPrefetchLimits` dictating cache +// limits, tiling dimensions, and loop constants. +// Callbacks: A type matching the signature documented in +// `PrefetchPipeline2DTiledCallbacks` above. Passed by reference to +// allow state mutation. +// args: A `Prefetch2DArgs` object dictating prefetch lookahead and tiling +// dimensions. Note the telemetry fields, if set, will cover the entire +// 2D loop once. +template +#if defined(__cpp_concepts) && __cpp_concepts >= 201907L + requires PrefetchPipeline2DTiledCallbacks +#endif +HWY_INLINE void PrefetchPipeline2DTiledLoop(size_t outer_size, + size_t inner_size, Callbacks& cb, + const Prefetch2DArgs& args) { + const uint64_t t0 = + args.prefetch.metric_collector_cb != nullptr ? hwy::timer::Start() : 0; + + // We explicitly disable the metric collection callback for the internal 1D + // pipeline loop to prevent nested metric tracing. We will trigger the + // callback manually once the entire 2D loop has completed. 
+ Prefetch2DArgs inner_args = args; + inner_args.prefetch.metric_collector_cb = nullptr; + + for (size_t outer_idx = 0; outer_idx < outer_size; + outer_idx += args.outer_block) { + const size_t outer_end = (outer_size < outer_idx + args.outer_block) + ? outer_size + : outer_idx + args.outer_block; + cb.OnOuterBlockStart(outer_idx, outer_end); + + for (size_t inner_idx = 0; inner_idx < inner_size; + inner_idx += args.inner_block) { + const size_t inner_end = (inner_size < inner_idx + args.inner_block) + ? inner_size + : inner_idx + args.inner_block; + + cb.PrepareInnerBlock(outer_idx, outer_end, inner_idx, inner_end); + + hwy::PrefetchPipelineLoop( + outer_idx, outer_end, + [&](size_t i, auto& cachelines) { + cb.GetPrefetchCachelines(i, inner_idx, inner_end, cachelines); + }, + [&](size_t i) { cb.ComputeTask(i, inner_idx, inner_end); }, + inner_args.prefetch); + } + + cb.OnOuterBlockFinish(outer_idx, outer_end); + } + + if (HWY_UNLIKELY(args.prefetch.metric_collector_cb != nullptr)) { + args.prefetch.metric_collector_cb(args.prefetch.user_data, + hwy::timer::Stop() - t0); + } +} + +} // namespace hwy + +#endif // HIGHWAY_HWY_CONTRIB_PIPELINE_PREFETCH_PIPELINE_2D_H_ diff --git a/hwy/contrib/pipeline/prefetch_pipeline_test.cc b/hwy/contrib/pipeline/prefetch_pipeline_test.cc new file mode 100644 index 0000000000..063963e726 --- /dev/null +++ b/hwy/contrib/pipeline/prefetch_pipeline_test.cc @@ -0,0 +1,248 @@ +#include "hwy/contrib/pipeline/prefetch_pipeline.h" + +#include +#include +#include +#include + +#include "hwy/contrib/pipeline/prefetch_args.h" +#include "hwy/tests/hwy_gtest.h" + +namespace hwy { +namespace HWY_NAMESPACE { +namespace { + +// A generic policy builder to parameterize the loop strategy for testing. 
+template +struct TestLimits : DefaultPrefetchLimits { + static constexpr PrefetchStrategy kStrategy = TargetStrategy; + static constexpr size_t kMaxCachelinesPerIter = 1; +}; + +// Global trace vector to track the exact chronological sequence of operations. +std::vector g_trace; + +void FakeDeepPrefetch(const void* ptr) { + // "PD" --> Deep Prefetch. + g_trace.push_back("PD(" + std::to_string(reinterpret_cast(ptr)) + + ")"); +} + +void FakeShallowPrefetch(const void* ptr) { + // "PS" --> Shallow Prefetch. + g_trace.push_back("PS(" + std::to_string(reinterpret_cast(ptr)) + + ")"); +} + +// A fake provider that pushes string events into the global trace. +struct FakeCachelinesProvider { + template + void operator()(size_t i, + PrefetchCachelines& cachelines) const { + // "S" --> Supply cachelines. + g_trace.push_back("S(" + std::to_string(i) + ")"); + // Encode `i` as a fake address; the fake prefetchers record it in g_trace. + cachelines.Add(reinterpret_cast(i)); + } +}; + +// A fake task to assert chronological execution order. +struct FakeTask { + void operator()(size_t i) const { + // "T" --> Task. + g_trace.push_back("T(" + std::to_string(i) + ")"); + } +}; + +class PrefetchPipelineTest : public ::testing::Test { + protected: + void SetUp() override { g_trace.clear(); } +}; + +template +void CallPipeline(size_t start, size_t end) { + PrefetchArgs args; + args.deep_lookahead = Deep; + args.shallow_lookahead = Shallow; + PrefetchPipelineLoop, FakeCachelinesProvider, + FakeTask, FakeDeepPrefetch, FakeShallowPrefetch>( + start, end, FakeCachelinesProvider(), FakeTask(), args); +} + +TEST_F(PrefetchPipelineTest, NoPrefetchStrategy) { + CallPipeline(0, 5); + + std::vector expected = {"T(0)", "T(1)", "T(2)", "T(3)", "T(4)"}; + EXPECT_EQ(g_trace, expected); +} + +TEST_F(PrefetchPipelineTest, DualTierStrategy) { + // Dual-Tier relies on a deeply staggered Phase 1, Phase 2, Phase 3 pipeline. + // We test on an array of length 6. Lookaheads: kShallow = 2, kDeep = 4.
+ CallPipeline(0, 6); + + std::vector expected = { + // Phase 1: Overlapping limits. L1/L3 horizons are primed (i = 0 to 1). + // Note: get_cachelines is called once per `i` and then distributed to + // deep/shallow. + "S(0)", "PD(0)", "PS(0)", "S(1)", "PD(1)", "PS(1)", + // Phase 2: Outstanding L3. L1 window is exhausted (i = 2 to 3). + "S(2)", "PD(2)", "S(3)", "PD(3)", + // Phase 3a: Main Sequence (i = 0 to 1). + "S(4)", "PD(4)", "S(2)", "PS(2)", "T(0)", "S(5)", "PD(5)", "S(3)", + "PS(3)", "T(1)", + // Phase 3b: Limit Deep (i = 2 to 3). Deep lookahead has reached array + // bounds. + "S(4)", "PS(4)", "T(2)", "S(5)", "PS(5)", "T(3)", + // Phase 3c: Drain (i = 4 to 5). No fetches remaining. + "T(4)", "T(5)"}; + EXPECT_EQ(g_trace, expected); +} + +TEST_F(PrefetchPipelineTest, DualTierStrategy_DifferentLookaheads) { + // Test with kShallow = 1, kDeep = 3 on an array of length 5. + CallPipeline(0, 5); + + std::vector expected = { + // Phase 1: Overlapping limits. L1/L3 horizons are primed (i = 0 to 0). + "S(0)", "PD(0)", "PS(0)", + // Phase 2: Outstanding L3. L1 window is exhausted (i = 1 to 2). + "S(1)", "PD(1)", "S(2)", "PD(2)", + // Phase 3a: Main Sequence (i = 0 to 1). + // i=0 triggers deep+3 (3) and shallow+1 (1) + "S(3)", "PD(3)", "S(1)", "PS(1)", "T(0)", + // i=1 triggers deep+3 (4) and shallow+1 (2) + "S(4)", "PD(4)", "S(2)", "PS(2)", "T(1)", + // Phase 3b: Limit Deep (i = 2 to 3). Deep lookahead ends. + // i=2 triggers shallow+1 (3) + "S(3)", "PS(3)", "T(2)", + // i=3 triggers shallow+1 (4) + "S(4)", "PS(4)", "T(3)", + // Phase 3c: Drain (i = 4 to 4). No fetches remaining. + "T(4)"}; + EXPECT_EQ(g_trace, expected); +} + +TEST_F(PrefetchPipelineTest, ShallowRollingLookaheadStrategy) { + // Tests the 1D rolling array. Only shallow lookahead is active (kShallow=2).
+ CallPipeline(0, 6); + + std::vector expected = {// Startup Phase (i = 0 to 1) for L1 + "S(0)", "PS(0)", "S(1)", "PS(1)", + // Main Sliding Loop (i = 0 to 3) + "S(2)", "PS(2)", "T(0)", "S(3)", "PS(3)", + "T(1)", "S(4)", "PS(4)", "T(2)", "S(5)", + "PS(5)", "T(3)", + // Drain (i = 4 to 5) + "T(4)", "T(5)"}; + EXPECT_EQ(g_trace, expected); +} + +TEST_F(PrefetchPipelineTest, DeepRollingLookaheadStrategy) { + // Tests the 1D rolling array using the Deep boundary (kDeep=4). + CallPipeline(0, 6); + + std::vector expected = { + // Startup Phase (i = 0 to 3) for L3 (kDeep = 4) + "S(0)", "PD(0)", "S(1)", "PD(1)", "S(2)", "PD(2)", "S(3)", "PD(3)", + // Main Sliding Loop (i = 0 to 1) + "S(4)", "PD(4)", "T(0)", "S(5)", "PD(5)", "T(1)", + // Drain (i = 2 to 5) + "T(2)", "T(3)", "T(4)", "T(5)"}; + EXPECT_EQ(g_trace, expected); +} + +TEST_F(PrefetchPipelineTest, MiniBatchShallowStrategy) { + // Tests the blocked/batch prefetch array. Shallow lookahead = 2 = Batch size. + CallPipeline(0, 5); + + std::vector expected = { + // Block 1 (i = 0 to 1) + "S(0)", "PS(0)", "S(1)", "PS(1)", "T(0)", "T(1)", + // Block 2 (i = 2 to 3) + "S(2)", "PS(2)", "S(3)", "PS(3)", "T(2)", "T(3)", + // Block 3 (remainder, i = 4 to 4) + "S(4)", "PS(4)", "T(4)"}; + EXPECT_EQ(g_trace, expected); +} + +TEST_F(PrefetchPipelineTest, MiniBatchDeepStrategy) { + // Tests the blocked/batch prefetch array. Deep lookahead = 4 = Batch size. + CallPipeline(0, 5); + + std::vector expected = {// Block 1 (i = 0 to 3) + "S(0)", "PD(0)", "S(1)", "PD(1)", "S(2)", + "PD(2)", "S(3)", "PD(3)", "T(0)", "T(1)", + "T(2)", "T(3)", + // Block 2 (remainder, i = 4 to 4) + "S(4)", "PD(4)", "T(4)"}; + EXPECT_EQ(g_trace, expected); +} + +TEST_F(PrefetchPipelineTest, ZeroShallowZeroDeepFallback) { + // Disabling both tiers should degrade entirely to NoPrefetch behavior.
+ CallPipeline(0, 5); + + std::vector expected = {"T(0)", "T(1)", "T(2)", "T(3)", "T(4)"}; + EXPECT_EQ(g_trace, expected); +} + +TEST_F(PrefetchPipelineTest, ZeroShallowFallback) { + // If shallow is 0, we degrade to pure DeepOnly logic (e.g. Rolling L3 Only). + CallPipeline(0, 4); + + std::vector expected = { + // Startup Deep + "S(0)", "PD(0)", "S(1)", "PD(1)", "S(2)", "PD(2)", + // Sliding Loop + "S(3)", "PD(3)", "T(0)", + // Drain + "T(1)", "T(2)", "T(3)"}; + EXPECT_EQ(g_trace, expected); +} + +TEST_F(PrefetchPipelineTest, ShallowGreaterOrEqualDeepFallback) { + // If shallow >= deep, the L3 tier is bypassed entirely, protecting LFBs. + // Tests that DualTier degrades to Rolling L1-only behavior in that case. + CallPipeline(0, 5); + + std::vector expected = { + // Startup Shallow (for 4 steps) + "S(0)", "PS(0)", "S(1)", "PS(1)", "S(2)", "PS(2)", "S(3)", "PS(3)", + // Sliding Loop + "S(4)", "PS(4)", "T(0)", + // Drain + "T(1)", "T(2)", "T(3)", "T(4)"}; + EXPECT_EQ(g_trace, expected); +} + +struct TestMetricContext { + bool called = false; + uint64_t ticks = 0; +}; + +void FakeMetricCollectorCb(void* user_data, uint64_t elapsed_ticks) { + auto* ctx = static_cast(user_data); + ctx->called = true; + ctx->ticks = elapsed_ticks; +} + +TEST_F(PrefetchPipelineTest, MetricCollectorCallback) { + TestMetricContext ctx; + PrefetchArgs args; + args.metric_collector_cb = FakeMetricCollectorCb; + args.user_data = &ctx; + + PrefetchPipelineLoop, + FakeCachelinesProvider, FakeTask, FakeDeepPrefetch, + FakeShallowPrefetch>(0, 5, FakeCachelinesProvider(), + FakeTask(), args); + + EXPECT_TRUE(ctx.called); + // Elapsed ticks may be small, so only the `called` flag is asserted above.
+} + +} // namespace +} // namespace HWY_NAMESPACE + +} // namespace hwy diff --git a/hwy_tests.bzl b/hwy_tests.bzl index 8a3eefb48d..1d3cd1122c 100644 --- a/hwy_tests.bzl +++ b/hwy_tests.bzl @@ -62,6 +62,13 @@ HWY_CONTRIB_TESTS = ( "math_hyper_test", [":math"], ), + # copybara:strip_begin(internal) + ( + "hwy/contrib/pipeline/", + "prefetch_pipeline_test", + [":prefetch_pipeline"], + ), + # copybara:strip_end ( "hwy/contrib/math/", "math_tan_test",