From 06a765cc72651997553500247079bb237e6fb2bf Mon Sep 17 00:00:00 2001
From: Highway <no-reply@google.com>
Date: Wed, 10 Jun 2026 11:45:29 -0700
Subject: [PATCH] N/A

PiperOrigin-RevId: 929984949
---
 BUILD                                         |  67 ++-
 hwy/cache_control.h                           |  38 +-
 hwy/contrib/pipeline/prefetch_args.h          | 114 +++++
 hwy/contrib/pipeline/prefetch_pipeline.h      | 393 ++++++++++++++++++
 hwy/contrib/pipeline/prefetch_pipeline_2d.h   | 141 +++++++
 .../pipeline/prefetch_pipeline_test.cc        | 248 +++++++++++
 hwy_tests.bzl                                 |   7 +
 7 files changed, 985 insertions(+), 23 deletions(-)
 create mode 100644 hwy/contrib/pipeline/prefetch_args.h
 create mode 100644 hwy/contrib/pipeline/prefetch_pipeline.h
 create mode 100644 hwy/contrib/pipeline/prefetch_pipeline_2d.h
 create mode 100644 hwy/contrib/pipeline/prefetch_pipeline_test.cc
diff --git a/BUILD b/BUILD
index 765546b987..c552a373f1 100644
--- a/BUILD
+++ b/BUILD
@@ -4,6 +4,7 @@ load("@rules_cc//cc:cc_test.bzl", "cc_test")
 load("@bazel_skylib//lib:selects.bzl", "selects")
 load("//:hwy_tests.bzl", "HWY_TESTS")
 load("@rules_license//rules:license.bzl", "license")
+load("//tools/build_defs/testing:bzl_library.bzl", "bzl_library")
 
 package(
     default_applicable_licenses = [":license"],
@@ -118,6 +119,27 @@ COPTS = select({
     ],
 })
 
+HWY_TEST_COPTS = select({
+    ":compiler_msvc": [],
+    "//conditions:default": [
+        # gTest triggers this warning (which is enabled by the
+        # extra-semi in COPTS), so we need to disable it here,
+        # but it's still enabled for :hwy.
+        "-Wno-c++98-compat-extra-semi",
+    ],
+})
+
+# Common to all tests.
+HWY_TEST_DEPS = [
+    ":hwy_test_util",
+    ":hwy",
+    ":nanobenchmark",
+    ":timer",
+] + select({
+    ":compiler_msvc": [],
+    "//conditions:default": ["@com_google_googletest//:gtest_main"],
+})
+
 DEFINES = select({
     ":compiler_msvc": ["HWY_SHARED_DEFINE"],
     ":compiler_clangcl": ["HWY_SHARED_DEFINE"],
@@ -606,6 +628,23 @@ cc_library(
     ],
 )
 
+# copybara:strip_begin(internal)
+cc_library(
+    name = "prefetch_pipeline",
+    hdrs = [
+        "hwy/contrib/pipeline/prefetch_args.h",
+        "hwy/contrib/pipeline/prefetch_pipeline.h",
+        "hwy/contrib/pipeline/prefetch_pipeline_2d.h",
+    ],
+    compatible_with = [],
+    copts = COPTS,
+    deps = [
+        ":hwy",
+        ":timer",
+    ],
+)
+# copybara:strip_end
+
 cc_test(
     name = "list_targets",
     size = "small",
@@ -627,27 +666,6 @@ cc_test(
     ],
 )
 
-HWY_TEST_COPTS = select({
-    ":compiler_msvc": [],
-    "//conditions:default": [
-        # gTest triggers this warning (which is enabled by the
-        # extra-semi in COPTS), so we need to disable it here,
-        # but it's still enabled for :hwy.
-        "-Wno-c++98-compat-extra-semi",
-    ],
-})
-
-# Common to all tests.
-HWY_TEST_DEPS = [
-    ":hwy_test_util",
-    ":hwy",
-    ":nanobenchmark",
-    ":timer",
-] + select({
-    ":compiler_msvc": [],
-    "//conditions:default": ["@com_google_googletest//:gtest_main"],
-})
-
 [
     [
         cc_test(
@@ -713,3 +731,10 @@ test_suite(
     name = "hwy_ops_tests",
     tags = ["hwy_ops_test"],
 )
+
+bzl_library(
+    name = "hwy_tests_bzl",
+    srcs = ["hwy_tests.bzl"],
+    parse_tests = False,
+    visibility = ["//visibility:private"],
+)
diff --git a/hwy/cache_control.h b/hwy/cache_control.h
index 90743cd3f2..0df23e17cc 100644
--- a/hwy/cache_control.h
+++ b/hwy/cache_control.h
@@ -92,8 +92,10 @@ HWY_INLINE HWY_ATTR_CACHE void FlushStream() {
 #endif
 }
 
-// Optionally begins loading the cache line containing "p" to reduce latency of
-// subsequent actual loads.
+// Optionally begins loading the cache line containing "p" into all cache
+// levels, including L1, to reduce latency of subsequent actual loads. This
+// corresponds to the T0 temporal locality hint on x86, which is ideal when data
+// is about to be directly consumed.
 template <typename T>
 HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) {
   (void)p;
@@ -109,6 +111,38 @@ HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) {
 #endif  //  HWY_DISABLE_CACHE_CONTROL
 }
 
+// Optionally begins loading the cache line containing "p" with a non-temporal
+// (NTA) hint, intended to limit pollution of the outer caches (L2/L3) for data
+// accessed exactly once. No-op if HWY_DISABLE_CACHE_CONTROL is defined.
+template <typename T>
+HWY_INLINE HWY_ATTR_CACHE void ShallowPrefetch(const T* p) {
+  (void)p;
+#ifndef HWY_DISABLE_CACHE_CONTROL
+#if HWY_ARCH_X86 && !(HWY_COMPILER_CLANGCL && !defined(__MMX__))
+  _mm_prefetch(reinterpret_cast<const char*>(p), _MM_HINT_NTA);
+#elif HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL  // includes clang
+  // locality=0: no temporal locality, the generic counterpart of the NTA hint.
+  __builtin_prefetch(p, /*write=*/0, /*hint=*/0);
+#endif
+#endif  //  HWY_DISABLE_CACHE_CONTROL
+}
+
+// Optionally begins staging the cache line containing "p" into the outer
+// caches (L3/L2) without immediately pulling it into L1. Intended to limit L1
+// and LFB thrashing on architectures like Intel when hiding long DRAM latency.
+template <typename T>
+HWY_INLINE HWY_ATTR_CACHE void DeepPrefetch(const T* p) {
+  (void)p;
+#ifndef HWY_DISABLE_CACHE_CONTROL
+#if HWY_ARCH_X86 && !(HWY_COMPILER_CLANGCL && !defined(__MMX__))
+  _mm_prefetch(reinterpret_cast<const char*>(p), _MM_HINT_T2);
+#elif HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL  // includes clang
+  // locality=1 requests a low degree of temporal locality (2 is "moderate").
+  __builtin_prefetch(p, /*write=*/0, /*hint=*/1);
+#endif
+#endif  //  HWY_DISABLE_CACHE_CONTROL
+}
+
 // Invalidates and flushes the cache line containing "p", if possible.
 HWY_INLINE HWY_ATTR_CACHE void FlushCacheline(const void* p) {
 #if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
diff --git a/hwy/contrib/pipeline/prefetch_args.h b/hwy/contrib/pipeline/prefetch_args.h
new file mode 100644
index 0000000000..650f072c3f
--- /dev/null
+++ b/hwy/contrib/pipeline/prefetch_args.h
@@ -0,0 +1,114 @@
+// Copyright 2026 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAY_HWY_CONTRIB_PIPELINE_PREFETCH_ARGS_H_
+#define HIGHWAY_HWY_CONTRIB_PIPELINE_PREFETCH_ARGS_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+namespace hwy {
+
+struct PrefetchArgs {
+  // The iteration distance (in loop iterations) to look ahead for the deep L3
+  // prefetch. If 0, no deep prefetch will be issued.
+  size_t deep_lookahead = 32;
+
+  // The iteration distance (in loop iterations) to look ahead for the shallow
+  // L1 prefetch. If 0, no shallow prefetch will be issued.
+  size_t shallow_lookahead = 4;
+
+  // -------------------------------------------------------------------------
+  // Telemetry Conduit
+  // -------------------------------------------------------------------------
+  // Optional callback invoked once when the pipeline loop completes, to report
+  // performance metrics. A plain function pointer plus context pointer is used
+  // instead of std::function so that the struct stays trivially copyable and
+  // never allocates. If nullptr (the default), no timing is performed.
+  //   user_data: Custom context pointer returned to the callback.
+  //   elapsed_ticks: Raw timer ticks taken to evaluate the pipeline loop.
+  void (*metric_collector_cb)(void* user_data,
+                              uint64_t elapsed_ticks) = nullptr;
+  void* user_data = nullptr;
+
+  // -------------------------------------------------------------------------
+  // Safe Default Factories
+  // -------------------------------------------------------------------------
+  // The best lookaheads depend strongly on the workload's access pattern.
+  // NOTE: relies on HWY_ARCH_ARM_A64, so include hwy/base.h before this file.
+
+  // Random Access / Scatter-Gather (e.g. Hash Table Probing, Graph Walks)
+  //
+  // Random accesses often miss the TLB and stall ~300 cycles on page walks,
+  // so the deep lookahead is stretched far ahead (e.g. 32-48 iterations).
+  // Positional aggregate initialization is used below because designated
+  // initializers require C++20.
+  static constexpr PrefetchArgs DefaultRandom() {
+#if HWY_ARCH_ARM_A64
+    return PrefetchArgs{/*deep_lookahead=*/64, /*shallow_lookahead=*/8};
+#else
+    return PrefetchArgs{/*deep_lookahead=*/32, /*shallow_lookahead=*/4};
+#endif
+  }
+
+  // Sequential Scans / Linear Memory (e.g. Matrix Vector, Filter Scans)
+  //
+  // Linear accesses already benefit from the CPU's hardware stream prefetchers
+  // (which mask bulk DRAM latency). Here, a heavy L3 lookahead is
+  // counter-productive; it merely crowds the queue. Instead, the lookaheads
+  // are tightened to bridge the narrower L3 -> L1 latency gap (~40 cycles)
+  // without overflowing LFBs during heavy SIMD evaluation.
+  static constexpr PrefetchArgs DefaultSequential() {
+#if HWY_ARCH_ARM_A64
+    return PrefetchArgs{/*deep_lookahead=*/32, /*shallow_lookahead=*/4};
+#else
+    return PrefetchArgs{/*deep_lookahead=*/8, /*shallow_lookahead=*/2};
+#endif
+  }
+};
+
+// ---------------------------------------------------------------------------
+// 2D-Tiled Prefetch Arguments
+// ---------------------------------------------------------------------------
+// Bundles the 1D `PrefetchArgs` with the tile sizes used by the 2D loop.
+struct Prefetch2DArgs {
+  PrefetchArgs prefetch;     // Lookaheads/telemetry for the per-tile 1D loop.
+  size_t outer_block = 128;  // Outer-index tile size (loop step) in the 2D loop.
+  size_t inner_block = 256;  // Inner-index tile size (loop step) in the 2D loop.
+
+  // -------------------------------------------------------------------------
+  // Safe Default Factories
+  // -------------------------------------------------------------------------
+
+  static constexpr Prefetch2DArgs DefaultRandom() {
+    Prefetch2DArgs args;
+    args.prefetch = PrefetchArgs::DefaultRandom();
+    args.outer_block = 128;
+    args.inner_block = 256;
+    return args;
+  }
+
+  static constexpr Prefetch2DArgs DefaultSequential() {
+    Prefetch2DArgs args;
+    args.prefetch = PrefetchArgs::DefaultSequential();
+    args.outer_block = 256;
+    args.inner_block = 512;
+    return args;
+  }
+};
+
+}  // namespace hwy
+
+#endif  // HIGHWAY_HWY_CONTRIB_PIPELINE_PREFETCH_ARGS_H_
diff --git a/hwy/contrib/pipeline/prefetch_pipeline.h b/hwy/contrib/pipeline/prefetch_pipeline.h
new file mode 100644
index 0000000000..b7342e4b97
--- /dev/null
+++ b/hwy/contrib/pipeline/prefetch_pipeline.h
@@ -0,0 +1,393 @@
+// Copyright 2026 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAY_HWY_CONTRIB_PIPELINE_PREFETCH_PIPELINE_H_
+#define HIGHWAY_HWY_CONTRIB_PIPELINE_PREFETCH_PIPELINE_H_
+
+#include <stddef.h>
+
+#include "hwy/base.h"
+#include "hwy/cache_control.h"
+#include "hwy/contrib/pipeline/prefetch_args.h"
+#include "hwy/timer.h"
+
+namespace hwy {
+
+// ---------------------------------------------------------------------------
+// PrefetchStrategy
+// ---------------------------------------------------------------------------
+// Enumerates the structural looping algorithm that the pipeline will compile
+// down into. Used to decouple hardware limits from loop execution mechanics.
+enum class PrefetchStrategy {
+  kNoPrefetch,            // Tasks only; the provider is never invoked.
+  kDeepLookaheadOnly,     // Rolling deep prefetch, `deep_lookahead` ahead.
+  kShallowLookaheadOnly,  // Rolling shallow prefetch (`shallow_lookahead`).
+  kMiniBatchDeep,         // Deep-prefetch `deep_lookahead` items, then run.
+  kMiniBatchShallow,      // Same, batches of `shallow_lookahead`, shallow.
+  kDualTier               // Deep and shallow rolling prefetches combined.
+};
+
+// ---------------------------------------------------------------------------
+// PrefetchLimits
+// ---------------------------------------------------------------------------
+// Compile-time hardware limits and loop strategy for the pipelining loop.
+// `DefaultPrefetchLimits` is the default; users can instead pass their own
+// struct (e.g. one derived from it) as the `Limits` template argument.
+struct DefaultPrefetchLimits {
+  // Capacity of `PrefetchCachelines`: the maximum number of cachelines the
+  // provider may register per iteration (i.e., those holding the data used by
+  // the upcoming compute task).
+  static constexpr size_t kMaxCachelinesPerIter = 4;
+
+  // -------------------------------------------------------------------------
+  // Hardware Architecture Matrix
+  // -------------------------------------------------------------------------
+  // kNumMSHRs is resolved from HWY_ARCH_*; the loop only DASSERTs against it.
+
+#if HWY_ARCH_ARM || HWY_ARCH_ARM_A64
+  // ARM cores vary widely; 24 is used here as a conservative value.
+  static constexpr size_t kNumMSHRs = 24;
+
+#elif HWY_ARCH_X86
+  // On x86, we cannot differentiate Intel vs AMD reliably at compile time.
+  // Because hitting the edge of the Intel LFB pool (10-12) causes a hard CPU
+  // stall, we MUST gracefully pick the most restrictive denominator (12) to
+  // ensure safety. AMD Zen architectures uniquely "absorb" the excess prefetch
+  // instructions into their 124 MABs, suffering zero penalty from this wrapper.
+  static constexpr size_t kNumMSHRs = 12;
+
+#else
+  // Conservative generic value for other architectures.
+  static constexpr size_t kNumMSHRs = 12;
+#endif
+
+  // The loop strategy `PrefetchPipelineLoop` selects at compile time.
+  static constexpr PrefetchStrategy kStrategy = PrefetchStrategy::kDualTier;
+};
+
+// ---------------------------------------------------------------------------
+// PrefetchCachelines
+// ---------------------------------------------------------------------------
+// A lightweight, stack-allocated, fixed-capacity container for collecting
+// discrete memory addresses to be prefetched.
+// By strictly accumulating memory pointers individually, it allows precise
+// control over the Line Fill Buffer (LFB) utilization in the pipelining loop.
+template <size_t kMaxCachelinesPerIter =
+              DefaultPrefetchLimits::kMaxCachelinesPerIter>
+struct PrefetchCachelines {
+  // Addresses to prefetch; only the first `count` entries are initialized.
+  const void* ptrs[kMaxCachelinesPerIter];
+
+  // The number of valid pointers currently registered in the array.
+  size_t count = 0;
+
+  // Registers `ptr` (an address inside the cache line to prefetch). Asserts in
+  // debug builds if full; otherwise drops `ptr` instead of writing past `ptrs`.
+  HWY_INLINE void Add(const void* ptr) {
+    HWY_DASSERT(count < kMaxCachelinesPerIter);
+    if (HWY_UNLIKELY(count >= kMaxCachelinesPerIter)) return;
+    ptrs[count++] = ptr;
+  }
+};
+
+#if defined(__cpp_concepts) && __cpp_concepts >= 201907L
+// ---------------------------------------------------------------------------
+// Cachelines Provider Concept
+// ---------------------------------------------------------------------------
+// To use PrefetchPipelineLoop, the user must provide a callable that adheres
+// to the following signature:
+//
+//   template <size_t kMaxCachelinesPerIter>
+//   void operator()(size_t i, PrefetchCachelines<kMaxCachelinesPerIter>&
+//   cachelines) const;
+//
+// Parameters:
+//  - i:                The current sequence index evaluated by the loop.
+//                      This is guaranteed to be in the range `[start, end)`.
+//  - cachelines:       Output collection. Call `cachelines.Add(ptr)` to
+//                      register cachelines. For invalid or conditional
+//                      indices, simply do not add anything.
+//
+// Execution constraints:
+//  - Depending on the architecture, prefetching optimizations, or runtime
+//    auto-tuning limits (see `Limits` and `PrefetchArgs`), there is no
+//    hardware or programmatic guarantee how many times this function will be
+//    called for a given `i` (including zero times if prefetches are stripped
+//    via `constexpr`). Therefore, this callable MUST be completely pure and
+//    strictly side-effect free!
+//  - Missing or zero-length entries are natively skipped by the pipeline
+//    without incurring branching overhead.
+template <typename T, size_t MaxCachelines>
+concept PrefetchPipelineCachelineProvider =
+    requires(const T& provider, size_t i,
+             PrefetchCachelines<MaxCachelines>& cachelines) {
+      { provider(i, cachelines) };  // Must be callable on a const provider.
+    };
+
+// ---------------------------------------------------------------------------
+// Pipeline Task Concept
+// ---------------------------------------------------------------------------
+// To use PrefetchPipelineLoop, the user must provide a callable that adheres
+// to the following signature:
+//
+//   void operator()(size_t i) const;
+//
+// Parameters:
+//  - i: The current sequence index being evaluated by the pipeline.
+//
+// Execution constraints:
+//  - Guaranteed to be called exactly once for each index `i` in the range
+//    `[start, end)`. Furthermore, it is guaranteed to be invoked purely
+//    sequentially (i.e. `i`, `i+1`, `i+2`), preserving any cross-iteration
+//    dependencies or internal accumulator state.
+template <typename T>
+concept PrefetchPipelineTask = requires(const T& task, size_t i) {
+  { task(i) };  // Callable on a const task; result unconstrained.
+};
+#endif
+
+// PrefetchPipelineLoop
+// ---------------------------------------------------------------------------
+// Design Philosophy:
+// While this helper aims to deeply accelerate predictable memory-bound loops,
+// its core tenet is "do no harm in the worst case". By carefully staging
+// memory into L3 before migrating it to L1, and rigorously capping the active
+// footprint against hardware Line Fill Buffer (LFB) limits, it ensures that
+// memory bandwidth is maximized without accidentally thrashing the cache or
+// stalling the processor pipelines.
+//
+// Evaluates a pipelined loop over the range [start, end) with two stages of
+// rolling prefetch lookahead: a deep prefetch (e.g. L3) and a shallow prefetch
+// (e.g. L1). This correctly stages data transitions from main memory -> L3 ->
+// L1 to maximize memory bandwidth and keep CPU Line Fill Buffers from stalling.
+//
+//   Limits: A struct dictating cache limits and loop constants. Defaults to
+//           `DefaultPrefetchLimits`.
+//   CachelinesProvider: A callable type matching the signature documented
+//                       in `PrefetchPipelineCachelineProvider` above. It
+//                       resolves cacheline pointers for a given index `i`.
+//   TaskFn: A callable type matching the signature documented in
+//           `PrefetchPipelineTask` above. It represents the core evaluation
+//           logic for index `i`.
+//   args: A `PrefetchArgs` configuration that controls the deep (L3) and
+//         shallow (L1) lookahead pipeline distances. Passing a reasonable
+//         value is crucial for optimal hardware performance. Ideally, use an
+//         auto-tuned configuration or an explicit architecture preset.
+template <typename Limits = DefaultPrefetchLimits, typename CachelinesProvider,
+          typename TaskFn,
+          // Allow overriding the prefetch functions for testing.
+          void (*DeepPrefetchFn)(const void*) = DeepPrefetch,
+          void (*ShallowPrefetchFn)(const void*) = ShallowPrefetch>
+#if defined(__cpp_concepts) && __cpp_concepts >= 201907L
+  requires PrefetchPipelineTask<TaskFn> &&
+           PrefetchPipelineCachelineProvider<CachelinesProvider,
+                                             Limits::kMaxCachelinesPerIter>
+#endif
+HWY_INLINE void PrefetchPipelineLoop(size_t start, size_t end,
+                                     const CachelinesProvider& get_cachelines,
+                                     const TaskFn& task,
+                                     const PrefetchArgs& args) {
+  const uint64_t t0 =
+      args.metric_collector_cb != nullptr ? hwy::timer::Start() : 0;
+  // Deep must exceed shallow: asserts in debug builds, else deep is disabled.
+  size_t actual_shallow = args.shallow_lookahead;
+  size_t actual_deep = args.deep_lookahead;
+  HWY_DASSERT(actual_deep == 0 || actual_deep > actual_shallow);
+  if (actual_shallow >= actual_deep) {
+    actual_deep = 0;
+  }
+
+  const size_t initial_shallow_prefetch_end =
+      HWY_MIN(start + actual_shallow, end);
+  const size_t initial_deep_prefetch_end = HWY_MIN(start + actual_deep, end);
+
+  // Reusable cache injection loops:
+  auto execute_deep_prefetch =
+      [&](const PrefetchCachelines<Limits::kMaxCachelinesPerIter>& cachelines)
+          HWY_ATTR_CACHE {
+            if (actual_deep == 0) return;
+            for (size_t r = 0; r < cachelines.count; ++r) {
+              DeepPrefetchFn(cachelines.ptrs[r]);
+            }
+          };
+  auto execute_shallow_prefetch =
+      [&](const PrefetchCachelines<Limits::kMaxCachelinesPerIter>& cachelines)
+          HWY_ATTR_CACHE {
+#if HWY_IS_DEBUG_BUILD
+            // Safeguard: The active L1 hardware queue footprint should not
+            // exceed the physical Miss Status Holding Registers (MSHRs).
+            // (e.g. 10-12 Line Fill Buffers on legacy Intel architectures).
+            // Exceeding this generates catastrophic silent memory stalls.
+            HWY_DASSERT(cachelines.count * actual_shallow <= Limits::kNumMSHRs);
+#endif
+            for (size_t r = 0; r < cachelines.count; ++r) {
+              ShallowPrefetchFn(cachelines.ptrs[r]);
+            }
+          };
+
+  // Hoisted state variables to avoid tight-loop reallocation thrashing:
+  PrefetchCachelines<Limits::kMaxCachelinesPerIter> cachelines;
+
+  // ------------------------------------------------------------------------
+  // Branchless loop execution via compile-time strategy
+  // ------------------------------------------------------------------------
+
+  // NOTE: Use a strict `if constexpr ... else if constexpr ... else` chain to
+  // discard not matched branches at compile time.
+  if constexpr (Limits::kStrategy == PrefetchStrategy::kNoPrefetch) {
+    for (size_t i = start; i < end; ++i) {
+      task(i);
+    }
+  } else if constexpr (Limits::kStrategy ==
+                       PrefetchStrategy::kShallowLookaheadOnly) {
+    const size_t limit_shallow =
+        (end > start + actual_shallow) ? end - actual_shallow : start;
+    // Startup prefetching
+    for (size_t i = start; i < initial_shallow_prefetch_end; ++i) {
+      cachelines.count = 0;
+      get_cachelines(i, cachelines);
+      execute_shallow_prefetch(cachelines);
+    }
+    // Main sliding loop
+    for (size_t i = start; i < limit_shallow; ++i) {
+      cachelines.count = 0;
+      get_cachelines(i + actual_shallow, cachelines);
+      execute_shallow_prefetch(cachelines);
+      task(i);
+    }
+    // Task drain
+    for (size_t i = limit_shallow; i < end; ++i) {
+      task(i);
+    }
+  } else if constexpr (Limits::kStrategy ==
+                       PrefetchStrategy::kDeepLookaheadOnly) {
+    const size_t limit_deep =
+        (end > start + actual_deep) ? end - actual_deep : start;
+    // Startup prefetching
+    for (size_t i = start; i < initial_deep_prefetch_end; ++i) {
+      cachelines.count = 0;
+      get_cachelines(i, cachelines);
+      execute_deep_prefetch(cachelines);
+    }
+    // Main sliding loop
+    for (size_t i = start; i < limit_deep; ++i) {
+      cachelines.count = 0;
+      get_cachelines(i + actual_deep, cachelines);
+      execute_deep_prefetch(cachelines);
+      task(i);
+    }
+    // Task drain
+    for (size_t i = limit_deep; i < end; ++i) {
+      task(i);
+    }
+  } else if constexpr (Limits::kStrategy == PrefetchStrategy::kMiniBatchDeep ||
+                       Limits::kStrategy ==
+                           PrefetchStrategy::kMiniBatchShallow) {
+    // Clamp to >= 1: a zero lookahead would otherwise never advance `b`.
+    const size_t batch_size = HWY_MAX(
+        size_t{1}, (Limits::kStrategy == PrefetchStrategy::kMiniBatchDeep)
+                       ? actual_deep
+                       : actual_shallow);
+    for (size_t b = start; b < end; b += batch_size) {
+      const size_t b_end = HWY_MIN(b + batch_size, end);
+      for (size_t p = b; p < b_end; ++p) {
+        cachelines.count = 0;
+        get_cachelines(p, cachelines);
+        if constexpr (Limits::kStrategy == PrefetchStrategy::kMiniBatchDeep) {
+          execute_deep_prefetch(cachelines);
+        } else {
+          execute_shallow_prefetch(cachelines);
+        }
+      }
+      for (size_t p = b; p < b_end; ++p) {
+        task(p);
+      }
+    }
+  } else {
+    // Fallback: Default Staggered Pipeline (PrefetchStrategy::kDualTier)
+    // A meticulously unrolled Dual-Tier pipeline is functionally necessary here
+    // to stage data into L3 before pulling it into L1, preventing LFB stalls
+    // on Intel, while providing optimal micro-pipelining on Zen/Maple.
+
+    // Phase 1: Overlapping L1 (shallow) and L3 (deep) startup pipeline horizons
+    for (size_t i = start; i < initial_shallow_prefetch_end; ++i) {
+      cachelines.count = 0;
+      get_cachelines(i, cachelines);
+      execute_deep_prefetch(cachelines);
+      execute_shallow_prefetch(cachelines);
+    }
+
+    // Phase 2: Outstanding L3 (deep) horizons which haven't entered L1 window
+    for (size_t i = initial_shallow_prefetch_end; i < initial_deep_prefetch_end;
+         ++i) {
+      cachelines.count = 0;
+      get_cachelines(i, cachelines);
+      execute_deep_prefetch(cachelines);
+    }
+
+    // Phase 3: Main execution.
+    // Instead of a single loop with bounds-checking `if` statements, we split
+    // it into three branchless phases. This avoids branch mispredictions in the
+    // hot loop and prevents the user provided `get_cachelines` callable from
+    // ever receiving an out-of-bounds index (requiring them to defensively
+    // handle it).
+
+    const size_t limit_deep = (actual_deep == 0 || start + actual_deep >= end)
+                                  ? start
+                                  : end - actual_deep;
+    const size_t limit_shallow =
+        (actual_shallow == 0 || start + actual_shallow >= end)
+            ? start
+            : end - actual_shallow;
+
+    // 3a: Both deep and shallow prefetches are within bounds.
+    for (size_t i = start; i < limit_deep; ++i) {
+      cachelines.count = 0;
+      get_cachelines(i + actual_deep, cachelines);
+      execute_deep_prefetch(cachelines);
+
+      if (actual_shallow > 0) {
+        cachelines.count = 0;
+        get_cachelines(i + actual_shallow, cachelines);
+        execute_shallow_prefetch(cachelines);
+      }
+
+      task(i);
+    }
+
+    // 3b: Only shallow prefetch is within bounds.
+    for (size_t i = limit_deep; i < limit_shallow; ++i) {
+      cachelines.count = 0;
+      get_cachelines(i + actual_shallow, cachelines);
+      execute_shallow_prefetch(cachelines);
+
+      task(i);
+    }
+
+    // 3c: No prefetches are within bounds, just finish the tasks.
+    const size_t task_drain_start = HWY_MAX(limit_deep, limit_shallow);
+    for (size_t i = task_drain_start; i < end; ++i) {
+      task(i);
+    }
+  }
+
+  if (HWY_UNLIKELY(args.metric_collector_cb != nullptr)) {
+    args.metric_collector_cb(args.user_data, hwy::timer::Stop() - t0);
+  }
+}
+
+}  // namespace hwy
+
+#endif  // HIGHWAY_HWY_CONTRIB_PIPELINE_PREFETCH_PIPELINE_H_
diff --git a/hwy/contrib/pipeline/prefetch_pipeline_2d.h b/hwy/contrib/pipeline/prefetch_pipeline_2d.h
new file mode 100644
index 0000000000..87b50053d8
--- /dev/null
+++ b/hwy/contrib/pipeline/prefetch_pipeline_2d.h
@@ -0,0 +1,141 @@
+// Copyright 2026 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAY_HWY_CONTRIB_PIPELINE_PREFETCH_PIPELINE_2D_H_
+#define HIGHWAY_HWY_CONTRIB_PIPELINE_PREFETCH_PIPELINE_2D_H_
+
+#include <stddef.h>
+
+#include "hwy/base.h"
+#include "hwy/contrib/pipeline/prefetch_args.h"
+#include "hwy/contrib/pipeline/prefetch_pipeline.h"
+#include "hwy/ops/shared-inl.h"
+#include "hwy/timer.h"
+
+namespace hwy {
+
+struct Default2DTiledPrefetchLimits : DefaultPrefetchLimits {};  // Same as 1D.
+
+#if defined(__cpp_concepts) && __cpp_concepts >= 201907L
+// ---------------------------------------------------------------------------
+// 2D-Tiled Pipeline Callbacks Concept
+// ---------------------------------------------------------------------------
+// To use PrefetchPipeline2DTiledLoop, the user must provide a callbacks object
+// that adheres to the following signatures:
+//
+//   // Called before evaluating the inner loop phases for an outer block.
+//   // Useful for allocating or resetting local accumulators on the stack.
+//   void OnOuterBlockStart(size_t outer_idx, size_t outer_end);
+//
+//   // Called before processing a new inner segment across all items in the
+//   // current outer block. Useful for broadcasting data into SIMD registers.
+//   void PrepareInnerBlock(size_t outer_idx, size_t outer_end,
+//                          size_t inner_idx, size_t inner_end);
+//
+//   // Adds to `cachelines` the addresses to prefetch for item `outer_i` of the
+//   // inner block. Passed to PrefetchPipelineLoop: must be side-effect free.
+//   template <size_t kMaxCachelinesPerIter>
+//   void GetPrefetchCachelines(
+//       size_t outer_i, size_t inner_idx, size_t inner_end,
+//       PrefetchCachelines<kMaxCachelinesPerIter>& cachelines);
+//
+//   // The core loop body to execute for a given sequence index `outer_i` and
+//   // inner segment.
+//   void ComputeTask(size_t outer_i, size_t inner_idx, size_t inner_end);
+//
+//   // Called after all inner segments have been processed for the outer block.
+//   // Useful for committing accumulated data.
+//   void OnOuterBlockFinish(size_t outer_idx, size_t outer_end);
+template <typename T, size_t MaxCachelines>
+concept PrefetchPipeline2DTiledCallbacks =
+    requires(T& cb, size_t outer_idx, size_t outer_end, size_t inner_idx,
+             size_t inner_end, PrefetchCachelines<MaxCachelines>& cachelines) {
+      { cb.OnOuterBlockStart(outer_idx, outer_end) };
+      { cb.PrepareInnerBlock(outer_idx, outer_end, inner_idx, inner_end) };
+      { cb.GetPrefetchCachelines(outer_idx, inner_idx, inner_end, cachelines) };
+      { cb.ComputeTask(outer_idx, inner_idx, inner_end) };
+      { cb.OnOuterBlockFinish(outer_idx, outer_end) };
+    };  // `cb` is non-const: callbacks may mutate their own state.
+#endif
+
+// ---------------------------------------------------------------------------
+// PrefetchPipeline2DTiledLoop
+// ---------------------------------------------------------------------------
+// A generic pipeline for executing 2D block-tiled computations with
+// prefetching. This splits processing into localized blocks that prevent cache
+// evictions and FPU instruction bottlenecks.
+//
+// Template Parameters:
+//   Limits: A struct extending `Default2DTiledPrefetchLimits` dictating cache
+//           limits and the loop strategy.
+//   Callbacks: A type matching the signature documented in
+//              `PrefetchPipeline2DTiledCallbacks` above. Passed by reference to
+//              allow state mutation.
+//   args: A `Prefetch2DArgs` object dictating prefetch lookahead and tiling
+//         dimensions (a block size of 0 is treated as 1). Note the telemetry
+//         fields, if set, will cover the entire 2D loop once.
+template <typename Limits = Default2DTiledPrefetchLimits, typename Callbacks>
+#if defined(__cpp_concepts) && __cpp_concepts >= 201907L
+  requires PrefetchPipeline2DTiledCallbacks<Callbacks,
+                                            Limits::kMaxCachelinesPerIter>
+#endif
+HWY_INLINE void PrefetchPipeline2DTiledLoop(size_t outer_size,
+                                            size_t inner_size, Callbacks& cb,
+                                            const Prefetch2DArgs& args) {
+  const uint64_t t0 =
+      args.prefetch.metric_collector_cb != nullptr ? hwy::timer::Start() : 0;
+
+  // We explicitly disable the metric collection callback for the internal 1D
+  // pipeline loop to prevent nested metric tracing. We will trigger the
+  // callback manually once the entire 2D loop has completed.
+  Prefetch2DArgs inner_args = args;
+  inner_args.prefetch.metric_collector_cb = nullptr;
+
+  // A block size of 0 would never advance the loops below (infinite loop), so
+  // it is treated as 1.
+  const size_t outer_block = HWY_MAX(args.outer_block, size_t{1});
+  const size_t inner_block = HWY_MAX(args.inner_block, size_t{1});
+
+  for (size_t outer_idx = 0; outer_idx < outer_size; outer_idx += outer_block) {
+    const size_t outer_end = HWY_MIN(outer_idx + outer_block, outer_size);
+    cb.OnOuterBlockStart(outer_idx, outer_end);
+
+    for (size_t inner_idx = 0; inner_idx < inner_size;
+         inner_idx += inner_block) {
+      const size_t inner_end = HWY_MIN(inner_idx + inner_block, inner_size);
+
+      cb.PrepareInnerBlock(outer_idx, outer_end, inner_idx, inner_end);
+
+      hwy::PrefetchPipelineLoop<Limits>(
+          outer_idx, outer_end,
+          [&](size_t i, auto& cachelines) {
+            cb.GetPrefetchCachelines(i, inner_idx, inner_end, cachelines);
+          },
+          [&](size_t i) { cb.ComputeTask(i, inner_idx, inner_end); },
+          inner_args.prefetch);
+    }
+
+    cb.OnOuterBlockFinish(outer_idx, outer_end);
+  }
+
+  if (HWY_UNLIKELY(args.prefetch.metric_collector_cb != nullptr)) {
+    args.prefetch.metric_collector_cb(args.prefetch.user_data,
+                                      hwy::timer::Stop() - t0);
+  }
+}
+
+}  // namespace hwy
+
+#endif  // HIGHWAY_HWY_CONTRIB_PIPELINE_PREFETCH_PIPELINE_2D_H_
diff --git a/hwy/contrib/pipeline/prefetch_pipeline_test.cc b/hwy/contrib/pipeline/prefetch_pipeline_test.cc
new file mode 100644
index 0000000000..063963e726
--- /dev/null
+++ b/hwy/contrib/pipeline/prefetch_pipeline_test.cc
@@ -0,0 +1,248 @@
+#include "hwy/contrib/pipeline/prefetch_pipeline.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "hwy/contrib/pipeline/prefetch_args.h"
+#include "hwy/tests/hwy_gtest.h"
+
+namespace hwy {
+namespace HWY_NAMESPACE {
+namespace {
+
+// Test-only limits policy that pins the loop strategy to `TargetStrategy`.
+template <PrefetchStrategy TargetStrategy>
+struct TestLimits : DefaultPrefetchLimits {
+  static constexpr PrefetchStrategy kStrategy = TargetStrategy;
+  static constexpr size_t kMaxCachelinesPerIter = 1;  // One Add() per call.
+};
+
+// Chronological log of every fake callback invocation; cleared in SetUp().
+std::vector<std::string> g_trace;
+
+void FakeDeepPrefetch(const void* ptr) {
+  // Logs "PD(<ptr as integer>)"; "PD" --> Deep Prefetch. Nothing is fetched.
+  g_trace.push_back("PD(" + std::to_string(reinterpret_cast<size_t>(ptr)) +
+                    ")");
+}
+
+void FakeShallowPrefetch(const void* ptr) {
+  // Logs "PS(<ptr as integer>)"; "PS" --> Shallow Prefetch. Nothing is fetched.
+  g_trace.push_back("PS(" + std::to_string(reinterpret_cast<size_t>(ptr)) +
+                    ")");
+}
+
+// Fake cachelines provider: logs each call and supplies one fake cacheline.
+struct FakeCachelinesProvider {
+  template <size_t kMaxCachelinesPerIter>
+  void operator()(size_t i,
+                  PrefetchCachelines<kMaxCachelinesPerIter>& cachelines) const {
+    // "S" --> Supply cachelines.
+    g_trace.push_back("S(" + std::to_string(i) + ")");
+    // Fake address that encodes `i`; the fake prefetchers only log its value.
+    cachelines.Add(reinterpret_cast<const void*>(i));
+  }
+};
+
+// Fake task: logs "T(i)" so tests can check when each task ran in the trace.
+struct FakeTask {
+  void operator()(size_t i) const {
+    // "T" --> Task.
+    g_trace.push_back("T(" + std::to_string(i) + ")");
+  }
+};
+
+class PrefetchPipelineTest : public ::testing::Test {
+ protected:
+  void SetUp() override { g_trace.clear(); }  // Fresh trace per test.
+};
+
+template <PrefetchStrategy TargetStrategy, size_t Deep = 4, size_t Shallow = 2>
+void CallPipeline(size_t start, size_t end) {  // Note: Deep precedes Shallow.
+  PrefetchArgs args;
+  args.deep_lookahead = Deep;
+  args.shallow_lookahead = Shallow;
+  PrefetchPipelineLoop<TestLimits<TargetStrategy>, FakeCachelinesProvider,
+                       FakeTask, FakeDeepPrefetch, FakeShallowPrefetch>(
+      start, end, FakeCachelinesProvider(), FakeTask(), args);
+}
+
+TEST_F(PrefetchPipelineTest, NoPrefetchStrategy) {
+  CallPipeline<PrefetchStrategy::kNoPrefetch>(0, 5);  // Expect tasks only.
+
+  std::vector<std::string> expected = {"T(0)", "T(1)", "T(2)", "T(3)", "T(4)"};
+  EXPECT_EQ(g_trace, expected);
+}
+
+TEST_F(PrefetchPipelineTest, DualTierStrategy) {
+  // Dual-Tier relies on a deeply staggered Phase 1, Phase 2, Phase 3 pipeline.
+  // Indices [0, 6) with the CallPipeline defaults: Shallow = 2, Deep = 4.
+  CallPipeline<PrefetchStrategy::kDualTier>(0, 6);
+
+  std::vector<std::string> expected = {
+      // Phase 1 (i = 0 to 1): both deep and shallow prefetches are issued.
+      // Note: the provider runs once per `i` here; its cachelines feed both
+      // PD and PS.
+      "S(0)", "PD(0)", "PS(0)", "S(1)", "PD(1)", "PS(1)",
+      // Phase 2 (i = 2 to 3): deep prefetches only.
+      "S(2)", "PD(2)", "S(3)", "PD(3)",
+      // Phase 3a: Main Sequence (task i = 0 to 1): deep i+4, shallow i+2.
+      "S(4)", "PD(4)", "S(2)", "PS(2)", "T(0)", "S(5)", "PD(5)", "S(3)",
+      "PS(3)", "T(1)",
+      // Phase 3b: Limit Deep (i = 2 to 3). Deep lookahead has reached the end
+      // of the range; only shallow i+2 is issued.
+      "S(4)", "PS(4)", "T(2)", "S(5)", "PS(5)", "T(3)",
+      // Phase 3c: Drain (i = 4 to 5). No fetches remaining.
+      "T(4)", "T(5)"};
+  EXPECT_EQ(g_trace, expected);
+}
+
+TEST_F(PrefetchPipelineTest, DualTierStrategy_DifferentLookaheads) {
+  // Deep = 3, Shallow = 1 (template order is Deep, Shallow); indices [0, 5).
+  CallPipeline<PrefetchStrategy::kDualTier, 3, 1>(0, 5);
+
+  std::vector<std::string> expected = {
+      // Phase 1 (i = 0 to 0): both deep and shallow prefetches are issued.
+      "S(0)", "PD(0)", "PS(0)",
+      // Phase 2 (i = 1 to 2): deep prefetches only.
+      "S(1)", "PD(1)", "S(2)", "PD(2)",
+      // Phase 3a: Main Sequence (i = 0 to 1).
+      // i=0 triggers deep+3 (3) and shallow+1 (1)
+      "S(3)", "PD(3)", "S(1)", "PS(1)", "T(0)",
+      // i=1 triggers deep+3 (4) and shallow+1 (2)
+      "S(4)", "PD(4)", "S(2)", "PS(2)", "T(1)",
+      // Phase 3b: Limit Deep (i = 2 to 3). Deep lookahead ends.
+      // i=2 shallow+1 (3)
+      "S(3)", "PS(3)", "T(2)",
+      // i=3 shallow+1 (4)
+      "S(4)", "PS(4)", "T(3)",
+      // Phase 3c: Drain (i = 4 to 4). No fetches remaining.
+      "T(4)"};
+  EXPECT_EQ(g_trace, expected);
+}
+
+TEST_F(PrefetchPipelineTest, ShallowRollingLookaheadStrategy) {
+  // Shallow-only rolling lookahead (Shallow = 2) over indices [0, 6).
+  CallPipeline<PrefetchStrategy::kShallowLookaheadOnly>(0, 6);
+
+  std::vector<std::string> expected = {// Startup (i = 0 to 1): shallow only
+                                       "S(0)", "PS(0)", "S(1)", "PS(1)",
+                                       // Main loop (i = 0 to 3): PS i+2, T i
+                                       "S(2)", "PS(2)", "T(0)", "S(3)", "PS(3)",
+                                       "T(1)", "S(4)", "PS(4)", "T(2)", "S(5)",
+                                       "PS(5)", "T(3)",
+                                       // Drain (i = 4 to 5)
+                                       "T(4)", "T(5)"};
+  EXPECT_EQ(g_trace, expected);
+}
+
+TEST_F(PrefetchPipelineTest, DeepRollingLookaheadStrategy) {
+  // Deep-only rolling lookahead (Deep = 4) over indices [0, 6).
+  CallPipeline<PrefetchStrategy::kDeepLookaheadOnly>(0, 6);
+
+  std::vector<std::string> expected = {
+      // Startup Phase (i = 0 to 3): deep prefetches only (Deep = 4)
+      "S(0)", "PD(0)", "S(1)", "PD(1)", "S(2)", "PD(2)", "S(3)", "PD(3)",
+      // Main Sliding Loop (i = 0 to 1): PD i+4, then T i
+      "S(4)", "PD(4)", "T(0)", "S(5)", "PD(5)", "T(1)",
+      // Drain (i = 2 to 5)
+      "T(2)", "T(3)", "T(4)", "T(5)"};
+  EXPECT_EQ(g_trace, expected);
+}
+
+TEST_F(PrefetchPipelineTest, MiniBatchShallowStrategy) {
+  // Batched: each block of Shallow (= 2) indices is prefetched, then executed.
+  CallPipeline<PrefetchStrategy::kMiniBatchShallow>(0, 5);
+
+  std::vector<std::string> expected = {
+      // Block 1 (i = 0 to 1)
+      "S(0)", "PS(0)", "S(1)", "PS(1)", "T(0)", "T(1)",
+      // Block 2 (i = 2 to 3)
+      "S(2)", "PS(2)", "S(3)", "PS(3)", "T(2)", "T(3)",
+      // Block 3 (remainder, i = 4 to 4)
+      "S(4)", "PS(4)", "T(4)"};
+  EXPECT_EQ(g_trace, expected);
+}
+
+TEST_F(PrefetchPipelineTest, MiniBatchDeepStrategy) {
+  // Batched: each block of Deep (= 4) indices is prefetched, then executed.
+  CallPipeline<PrefetchStrategy::kMiniBatchDeep>(0, 5);
+
+  std::vector<std::string> expected = {// Block 1 (i = 0 to 3)
+                                       "S(0)", "PD(0)", "S(1)", "PD(1)", "S(2)",
+                                       "PD(2)", "S(3)", "PD(3)", "T(0)", "T(1)",
+                                       "T(2)", "T(3)",
+                                       // Block 2 (remainder, i = 4 to 4)
+                                       "S(4)", "PD(4)", "T(4)"};
+  EXPECT_EQ(g_trace, expected);
+}
+
+TEST_F(PrefetchPipelineTest, ZeroShallowZeroDeepFallback) {
+  // Deep = 0 and Shallow = 0: expect the same tasks-only trace as kNoPrefetch.
+  CallPipeline<PrefetchStrategy::kDualTier, 0, 0>(0, 5);
+
+  std::vector<std::string> expected = {"T(0)", "T(1)", "T(2)", "T(3)", "T(4)"};
+  EXPECT_EQ(g_trace, expected);
+}
+
+TEST_F(PrefetchPipelineTest, ZeroShallowFallback) {
+  // Deep = 3, Shallow = 0: DualTier degrades to a rolling deep-only sequence.
+  CallPipeline<PrefetchStrategy::kDualTier, 3, 0>(0, 4);
+
+  std::vector<std::string> expected = {
+      // Startup Deep (i = 0 to 2)
+      "S(0)", "PD(0)", "S(1)", "PD(1)", "S(2)", "PD(2)",
+      // Sliding Loop (i = 0): PD i+3, then T i
+      "S(3)", "PD(3)", "T(0)",
+      // Drain (i = 1 to 3)
+      "T(1)", "T(2)", "T(3)"};
+  EXPECT_EQ(g_trace, expected);
+}
+
+TEST_F(PrefetchPipelineTest, ShallowGreaterOrEqualDeepFallback) {
+  // If shallow >= deep, the deep tier is bypassed entirely, protecting LFBs.
+  // Here Deep = 2, Shallow = 4: DualTier degrades to rolling shallow-only.
+  CallPipeline<PrefetchStrategy::kDualTier, 2, 4>(0, 5);
+
+  std::vector<std::string> expected = {
+      // Startup Shallow (for 4 steps)
+      "S(0)", "PS(0)", "S(1)", "PS(1)", "S(2)", "PS(2)", "S(3)", "PS(3)",
+      // Sliding Loop (i = 0): PS i+4, then T i
+      "S(4)", "PS(4)", "T(0)",
+      // Drain (i = 1 to 4)
+      "T(1)", "T(2)", "T(3)", "T(4)"};
+  EXPECT_EQ(g_trace, expected);
+}
+
+struct TestMetricContext {
+  bool called = false;  // Set to true by FakeMetricCollectorCb.
+  uint64_t ticks = 0;   // Elapsed ticks passed to FakeMetricCollectorCb.
+};
+
+void FakeMetricCollectorCb(void* user_data, uint64_t elapsed_ticks) {
+  auto* ctx = static_cast<TestMetricContext*>(user_data);
+  ctx->called = true;  // Lets the test observe that the callback ran.
+  ctx->ticks = elapsed_ticks;
+}
+
+TEST_F(PrefetchPipelineTest, MetricCollectorCallback) {
+  TestMetricContext ctx;
+  PrefetchArgs args;
+  args.metric_collector_cb = FakeMetricCollectorCb;
+  args.user_data = &ctx;  // Handed back to the callback as `user_data`.
+
+  PrefetchPipelineLoop<TestLimits<PrefetchStrategy::kNoPrefetch>,
+                       FakeCachelinesProvider, FakeTask, FakeDeepPrefetch,
+                       FakeShallowPrefetch>(0, 5, FakeCachelinesProvider(),
+                                            FakeTask(), args);
+
+  EXPECT_TRUE(ctx.called);
+  // `ctx.ticks` is deliberately not checked: elapsed ticks may be very small.
+}
+
+}  // namespace
+}  // namespace HWY_NAMESPACE
+
+}  // namespace hwy
diff --git a/hwy_tests.bzl b/hwy_tests.bzl
index 8a3eefb48d..1d3cd1122c 100644
--- a/hwy_tests.bzl
+++ b/hwy_tests.bzl
@@ -62,6 +62,13 @@ HWY_CONTRIB_TESTS = (
         "math_hyper_test",
         [":math"],
     ),
+    # copybara:strip_begin(internal)
+    (
+        "hwy/contrib/pipeline/",
+        "prefetch_pipeline_test",
+        [":prefetch_pipeline"],
+    ),
+    # copybara:strip_end
     (
         "hwy/contrib/math/",
         "math_tan_test",