Skip to content
Open

N/A #3105

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 46 additions & 21 deletions BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ load("@rules_cc//cc:cc_test.bzl", "cc_test")
load("@bazel_skylib//lib:selects.bzl", "selects")
load("//:hwy_tests.bzl", "HWY_TESTS")
load("@rules_license//rules:license.bzl", "license")
load("//tools/build_defs/testing:bzl_library.bzl", "bzl_library")

package(
default_applicable_licenses = [":license"],
Expand Down Expand Up @@ -118,6 +119,27 @@ COPTS = select({
],
})

# Extra compiler flags for test targets, chosen per compiler configuration.
HWY_TEST_COPTS = select({
# No additional flags under the MSVC configuration.
":compiler_msvc": [],
"//conditions:default": [
# gTest triggers this warning (which is enabled by the
# extra-semi in COPTS), so we need to disable it here,
# but it's still enabled for :hwy.
"-Wno-c++98-compat-extra-semi",
],
})

# Common to all tests: the test utilities plus the core, benchmark and timer
# libraries from this package.
HWY_TEST_DEPS = [
":hwy_test_util",
":hwy",
":nanobenchmark",
":timer",
] + select({
# gtest_main is only added outside the MSVC configuration.
":compiler_msvc": [],
"//conditions:default": ["@com_google_googletest//:gtest_main"],
})

DEFINES = select({
":compiler_msvc": ["HWY_SHARED_DEFINE"],
":compiler_clangcl": ["HWY_SHARED_DEFINE"],
Expand Down Expand Up @@ -606,6 +628,23 @@ cc_library(
],
)

# copybara:strip_begin(internal)
# Header-only target (hdrs, no srcs) for the prefetch pipeline headers.
# NOTE(review): this rule sits between copybara strip markers, so it is
# presumably excluded from the exported BUILD file - confirm.
cc_library(
name = "prefetch_pipeline",
hdrs = [
"hwy/contrib/pipeline/prefetch_args.h",
"hwy/contrib/pipeline/prefetch_pipeline.h",
"hwy/contrib/pipeline/prefetch_pipeline_2d.h",
],
compatible_with = [],
copts = COPTS,
deps = [
# prefetch_args.h tests HWY_ARCH_ARM_A64, which is expected to come from
# the :hwy headers.
":hwy",
":timer",
],
)
# copybara:strip_end

cc_test(
name = "list_targets",
size = "small",
Expand All @@ -627,27 +666,6 @@ cc_test(
],
)

# Extra compiler flags for test targets, chosen per compiler configuration.
HWY_TEST_COPTS = select({
# No additional flags under the MSVC configuration.
":compiler_msvc": [],
"//conditions:default": [
# gTest triggers this warning (which is enabled by the
# extra-semi in COPTS), so we need to disable it here,
# but it's still enabled for :hwy.
"-Wno-c++98-compat-extra-semi",
],
})

# Common to all tests: the test utilities plus the core, benchmark and timer
# libraries from this package.
HWY_TEST_DEPS = [
":hwy_test_util",
":hwy",
":nanobenchmark",
":timer",
] + select({
# gtest_main is only added outside the MSVC configuration.
":compiler_msvc": [],
"//conditions:default": ["@com_google_googletest//:gtest_main"],
})

[
[
cc_test(
Expand Down Expand Up @@ -713,3 +731,10 @@ test_suite(
name = "hwy_ops_tests",
tags = ["hwy_ops_test"],
)

# Exposes hwy_tests.bzl as a bzl_library target, visible only inside this
# package.
# NOTE(review): bzl_library is loaded from //tools/build_defs/testing rather
# than @bazel_skylib, and `parse_tests` does not appear to be an attribute of
# the skylib rule - confirm that this load path and attribute resolve in
# external (open-source) builds.
bzl_library(
name = "hwy_tests_bzl",
srcs = ["hwy_tests.bzl"],
parse_tests = False,
visibility = ["//visibility:private"],
)
38 changes: 36 additions & 2 deletions hwy/cache_control.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,10 @@ HWY_INLINE HWY_ATTR_CACHE void FlushStream() {
#endif
}

// Optionally begins loading the cache line containing "p" to reduce latency of
// subsequent actual loads.
// Optionally begins loading the cache line containing "p" into all cache
// levels, including L1, to reduce latency of subsequent actual loads. This
// corresponds to the T0 temporal locality hint on x86, which is ideal when data
// is about to be directly consumed.
template <typename T>
HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) {
(void)p;
Expand All @@ -109,6 +111,38 @@ HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) {
#endif // HWY_DISABLE_CACHE_CONTROL
}

// Hints that the cache line containing "p" should be loaded with a
// non-temporal access (NTA) hint: _MM_HINT_NTA on x86, otherwise
// __builtin_prefetch with locality=0 on GCC/Clang. The intent is to bring the
// line close to the core while minimizing pollution of the outer caches
// (L2/L3), which suits data that is accessed exactly once. Which cache levels
// are actually filled is implementation-specific.
//
// This is only a hint: it compiles to nothing if HWY_DISABLE_CACHE_CONTROL is
// defined or if neither branch below applies.
template <typename T>
HWY_INLINE HWY_ATTR_CACHE void ShallowPrefetch(const T* p) {
// Avoids an unused-parameter warning when no branch below uses "p".
(void)p;
#ifndef HWY_DISABLE_CACHE_CONTROL
// NOTE(review): presumably _mm_prefetch is unavailable under clang-cl when MMX
// is not enabled, hence the exclusion - confirm.
#if HWY_ARCH_X86 && !(HWY_COMPILER_CLANGCL && !defined(__MMX__))
_mm_prefetch(reinterpret_cast<const char*>(p), _MM_HINT_NTA);
#elif HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL // includes clang
// locality=0 means the data has no temporal locality and need not be kept in
// the cache after the access (GCC scale: 0 = none ... 3 = high).
__builtin_prefetch(p, /*write=*/0, /*locality=*/0);
#endif
#endif // HWY_DISABLE_CACHE_CONTROL
}

// Hints that the cache line containing "p" should be staged into the outer
// caches (L2/L3) rather than immediately into L1: _MM_HINT_T2 on x86,
// otherwise __builtin_prefetch with locality=1 on GCC/Clang. Intended for data
// that will be needed many iterations from now, so that hiding DRAM latency
// does not thrash L1 or the line fill buffers. Which cache levels are actually
// filled is implementation-specific.
//
// This is only a hint: it compiles to nothing if HWY_DISABLE_CACHE_CONTROL is
// defined or if neither branch below applies.
template <typename T>
HWY_INLINE HWY_ATTR_CACHE void DeepPrefetch(const T* p) {
// Avoids an unused-parameter warning when no branch below uses "p".
(void)p;
#ifndef HWY_DISABLE_CACHE_CONTROL
// NOTE(review): presumably _mm_prefetch is unavailable under clang-cl when MMX
// is not enabled, hence the exclusion - confirm.
#if HWY_ARCH_X86 && !(HWY_COMPILER_CLANGCL && !defined(__MMX__))
_mm_prefetch(reinterpret_cast<const char*>(p), _MM_HINT_T2);
#elif HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL // includes clang
// locality=1 requests a low degree of temporal locality (GCC scale: 0 = none,
// 1 = low, 2 = moderate, 3 = high); it is the counterpart of T2 above.
__builtin_prefetch(p, /*write=*/0, /*locality=*/1);
#endif
#endif // HWY_DISABLE_CACHE_CONTROL
}

// Invalidates and flushes the cache line containing "p", if possible.
HWY_INLINE HWY_ATTR_CACHE void FlushCacheline(const void* p) {
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
Expand Down
114 changes: 114 additions & 0 deletions hwy/contrib/pipeline/prefetch_args.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
// Copyright 2026 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef HIGHWAY_HWY_CONTRIB_PIPELINE_PREFETCH_ARGS_H_
#define HIGHWAY_HWY_CONTRIB_PIPELINE_PREFETCH_ARGS_H_

#include <stddef.h>
#include <stdint.h>

#include "hwy/detect_compiler_arch.h"  // HWY_ARCH_ARM_A64

namespace hwy {

// Tuning parameters for a software-prefetch pipeline: how far ahead (in loop
// iterations) the deep and shallow prefetches are issued, plus an optional
// telemetry callback. Every member has a default, so a default-constructed
// PrefetchArgs is usable as-is.
struct PrefetchArgs {
  // Distance (in loop iterations) to look ahead for the deep (outer-cache)
  // prefetch. If 0, no deep prefetch will be issued.
  size_t deep_lookahead = 32;

  // Distance (in loop iterations) to look ahead for the shallow (L1)
  // prefetch. If 0, no shallow prefetch will be issued.
  size_t shallow_lookahead = 4;

  // -------------------------------------------------------------------------
  // Telemetry
  // -------------------------------------------------------------------------
  // Optional callback for reporting performance metrics (typically elapsed
  // time) once the pipeline has finished. A plain function pointer plus a
  // context pointer is used instead of std::function so that the struct stays
  // trivially copyable and never allocates.
  //   user_data:     the `user_data` member below, passed back unchanged.
  //   elapsed_ticks: raw timer ticks spent in the pipeline loop.
  // nullptr (the default) means no callback is installed.
  void (*metric_collector_cb)(void* user_data,
                              uint64_t elapsed_ticks) = nullptr;
  void* user_data = nullptr;

  // -------------------------------------------------------------------------
  // Default factories
  // -------------------------------------------------------------------------
  // Good lookahead distances depend heavily on the memory access pattern, so
  // two presets are provided. Both select their values via HWY_ARCH_ARM_A64,
  // which must already be defined (hwy/detect_compiler_arch.h) when this
  // struct is compiled; if it is undefined, the #else values are silently
  // used.
  //
  // The factories fill in a local object member by member rather than using
  // designated initializers, which are a C++20 feature and not accepted by all
  // supported compilers in earlier language modes.

  // Random access / scatter-gather (e.g. hash table probing, graph walks).
  //
  // Random accesses frequently miss the TLB and pay for page walks, i.e.
  // stalls on the order of ~300 cycles. Hiding them requires a long deep
  // lookahead.
  static constexpr PrefetchArgs DefaultRandom() {
    PrefetchArgs args;
#if HWY_ARCH_ARM_A64
    args.deep_lookahead = 64;
    args.shallow_lookahead = 8;
#else
    args.deep_lookahead = 32;
    args.shallow_lookahead = 4;
#endif
    return args;
  }

  // Sequential scans / linear memory (e.g. matrix-vector, filter scans).
  //
  // Hardware stream prefetchers already hide most DRAM latency for linear
  // accesses, so a long deep lookahead only crowds the queues. Shorter
  // distances are enough to bridge the smaller L3 -> L1 gap (~40 cycles)
  // without overflowing the line fill buffers.
  static constexpr PrefetchArgs DefaultSequential() {
    PrefetchArgs args;
#if HWY_ARCH_ARM_A64
    args.deep_lookahead = 32;
    args.shallow_lookahead = 4;
#else
    args.deep_lookahead = 8;
    args.shallow_lookahead = 2;
#endif
    return args;
  }
};

// ---------------------------------------------------------------------------
// 2D-Tiled Prefetch Policy
// ---------------------------------------------------------------------------
// Wraps the base PrefetchArgs and adds the block sizes used for 2D tiling.
// Prefetch settings for a 2D-tiled traversal: the lookahead parameters plus
// the two tile extents. Holds a PrefetchArgs by value (composition).
struct Prefetch2DArgs {
  // Lookahead distances and optional telemetry hook.
  PrefetchArgs prefetch;
  // Tile extents of the outer and inner dimension.
  // NOTE(review): units are not stated here; presumably loop iterations or
  // elements - confirm against the 2D pipeline.
  size_t outer_block = 128;
  size_t inner_block = 256;

  // Preset for random access patterns: PrefetchArgs::DefaultRandom() combined
  // with 128 x 256 tiles (identical to the member defaults).
  static constexpr Prefetch2DArgs DefaultRandom() {
    Prefetch2DArgs tiled;
    tiled.inner_block = 256;
    tiled.outer_block = 128;
    tiled.prefetch = PrefetchArgs::DefaultRandom();
    return tiled;
  }

  // Preset for sequential access patterns: PrefetchArgs::DefaultSequential()
  // combined with larger 256 x 512 tiles.
  static constexpr Prefetch2DArgs DefaultSequential() {
    Prefetch2DArgs tiled;
    tiled.inner_block = 512;
    tiled.outer_block = 256;
    tiled.prefetch = PrefetchArgs::DefaultSequential();
    return tiled;
  }
};

} // namespace hwy

#endif // HIGHWAY_HWY_CONTRIB_PIPELINE_PREFETCH_ARGS_H_
Loading
Loading