Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -702,3 +702,43 @@ test_suite(
name = "hwy_ops_tests",
tags = ["hwy_ops_test"],
)

# copybara:strip_begin(internal)
# Header-only target (hdrs, no srcs) exposing
# hwy/contrib/pipeline/dual_tier_prefetch.h. Its only dependency is the core
# :hwy library.
cc_library(
name = "dual_tier_prefetch",
hdrs = [
"hwy/contrib/pipeline/dual_tier_prefetch.h",
],
compatible_with = [],
copts = COPTS,
deps = [
":hwy",
],
)

# Header-only target (hdrs, no srcs) exposing
# hwy/contrib/pipeline/dual_tier_prefetch_2d.h. Depends on the
# :dual_tier_prefetch target in addition to the core :hwy library.
cc_library(
name = "dual_tier_prefetch_2d",
hdrs = [
"hwy/contrib/pipeline/dual_tier_prefetch_2d.h",
],
compatible_with = [],
copts = COPTS,
deps = [
":dual_tier_prefetch",
":hwy",
],
)

# Benchmark for :dual_tier_prefetch, declared as a cc_test and linked against
# gunit_main plus the benchmark library. Compiled with the test-specific copts
# appended to the common ones.
# NOTE(review): :dual_tier_prefetch_2d is not listed in deps; confirm whether
# the 2D header is meant to be exercised by this target or by a separate one.
cc_test(
name = "dual_tier_prefetch_benchmark",
size = "medium",
srcs = ["hwy/contrib/pipeline/dual_tier_prefetch_benchmark.cc"],
copts = COPTS + HWY_TEST_COPTS,
deps = [
":dual_tier_prefetch",
":hwy",
"//testing/base/public:gunit_main",
"//third_party/benchmark",
],
)
# copybara:strip_end
38 changes: 36 additions & 2 deletions hwy/cache_control.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,10 @@ HWY_INLINE HWY_ATTR_CACHE void FlushStream() {
#endif
}

// Optionally begins loading the cache line containing "p" to reduce latency of
// subsequent actual loads.
// Optionally begins loading the cache line containing "p" into all cache
// levels, including L1, to reduce latency of subsequent actual loads. This
// corresponds to the T0 temporal locality hint on x86, which is ideal when data
// is about to be directly consumed.
template <typename T>
HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) {
(void)p;
Expand All @@ -109,6 +111,38 @@ HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) {
#endif // HWY_DISABLE_CACHE_CONTROL
}

// Optionally begins loading the cache line containing "p" while passing a
// Non-Temporal Access (NTA) hint. The intent is to bring the line close to the
// core while minimizing pollution of the outer caches (L2/L3), which suits data
// that is accessed exactly once.
// NOTE(review): where an NTA-hinted line is actually placed is presumably
// microarchitecture-specific; confirm before relying on "L1 only" behavior.
//
// This is only a hint. It compiles to a no-op if HWY_DISABLE_CACHE_CONTROL is
// defined, or if neither branch below matches the current compiler/target.
template <typename T>
HWY_INLINE HWY_ATTR_CACHE void PrefetchNTA(const T* p) {
// Keeps "p" referenced when every branch below is compiled out.
(void)p;
#ifndef HWY_DISABLE_CACHE_CONTROL
#if HWY_ARCH_X86 && !(HWY_COMPILER_CLANGCL && !defined(__MMX__))
_mm_prefetch(reinterpret_cast<const char*>(p), _MM_HINT_NTA);
#elif HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL // includes clang
// The third argument is the builtin's locality level; 0 means the data has no
// temporal locality and need not be kept in the cache after the access. It is
// used here as the counterpart of _MM_HINT_NTA in the x86 branch.
__builtin_prefetch(p, /*write=*/0, /*hint=*/0);
#endif
#endif // HWY_DISABLE_CACHE_CONTROL
}

// Optionally stages the cache line containing "p" into the outer caches
// (L2/L3) without requesting that it be brought into L1 right away. The intent
// is to hide long DRAM latency while limiting L1 and line-fill-buffer (LFB)
// thrashing, e.g. on Intel CPUs.
//
// This is only a hint. It compiles to a no-op if HWY_DISABLE_CACHE_CONTROL is
// defined, or if neither branch below matches the current compiler/target.
template <typename T>
HWY_INLINE HWY_ATTR_CACHE void DeepPrefetch(const T* p) {
// Keeps "p" referenced when every branch below is compiled out.
(void)p;
#ifndef HWY_DISABLE_CACHE_CONTROL
#if HWY_ARCH_X86 && !(HWY_COMPILER_CLANGCL && !defined(__MMX__))
_mm_prefetch(reinterpret_cast<const char*>(p), _MM_HINT_T2);
#elif HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL // includes clang
// The third argument is the builtin's locality level; 1 requests a low degree
// of temporal locality (2 would be moderate, 3 high). It is used here as the
// counterpart of _MM_HINT_T2 in the x86 branch.
__builtin_prefetch(p, /*write=*/0, /*hint=*/1);
#endif
#endif // HWY_DISABLE_CACHE_CONTROL
}

// Invalidates and flushes the cache line containing "p", if possible.
HWY_INLINE HWY_ATTR_CACHE void FlushCacheline(const void* p) {
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
Expand Down
Loading