diff --git a/src/v/config/configuration.cc b/src/v/config/configuration.cc
index 2a4440b370c04..f742354c29f9c 100644
--- a/src/v/config/configuration.cc
+++ b/src/v/config/configuration.cc
@@ -4835,6 +4835,12 @@ configuration::configuration()
       {.needs_restart = needs_restart::yes, .visibility = visibility::tunable},
       32,
       {.min = 1})
+  , code_hugepages_enabled(
+      *this,
+      "code_hugepages_enabled",
+      "Map the binary into hugepages",
+      {.needs_restart = needs_restart::no, .visibility = visibility::tunable}, // no restart needed: application.cc reacts to changes through a binding
+      false) // disabled by default
   , development_feature_property_testing_only(
       *this,
       "development_feature_property_testing_only",
diff --git a/src/v/config/configuration.h b/src/v/config/configuration.h
index da63ff75a15cc..caef8b1fc1074 100644
--- a/src/v/config/configuration.h
+++ b/src/v/config/configuration.h
@@ -837,6 +837,8 @@ struct configuration final : public config_store {
     bounded_property cloud_topics_produce_write_inflight_limit;
     bounded_property cloud_topics_produce_no_pid_concurrency;
 
+    property code_hugepages_enabled; // NOTE(review): template argument list looks stripped in this rendering (presumably <bool>, given the `false` default) - confirm
+
     development_feature_property development_feature_property_testing_only;
 
 private:
diff --git a/src/v/redpanda/BUILD b/src/v/redpanda/BUILD
index c58ae27aa41f3..f0e68bff9c587 100644
--- a/src/v/redpanda/BUILD
+++ b/src/v/redpanda/BUILD
@@ -144,14 +144,20 @@ redpanda_cc_binary(
     srcs = [
         "main.cc",
     ],
-    linkopts =
-        select({
-            ":use_emit_relocs": [
-                "-Wl,--emit-relocs",
-            ],
-            "//conditions:default": [
-            ],
-        }),
+    linkopts = [
+        # Align loadable segments to 2 MB so the text segment is eligible for
+        # transparent huge pages (PMD-mapped). separate-loadable-segments
+        # ensures each PT_LOAD gets its own mmap at a 2 MB boundary rather
+        # than packing segments into a single mapping.
+        "-Wl,-z,max-page-size=2097152",  # 2097152 bytes = 2 MiB
+        "-Wl,-z,separate-loadable-segments",
+    ] + select({
+        ":use_emit_relocs": [
+            "-Wl,--emit-relocs",
+        ],
+        "//conditions:default": [
+        ],
+    }),
     visibility = ["//visibility:public"],
     deps = [
         ":application",
diff --git a/src/v/redpanda/application.cc b/src/v/redpanda/application.cc
index ada68c2c909f1..3ea76deecb67c 100644
--- a/src/v/redpanda/application.cc
+++ b/src/v/redpanda/application.cc
@@ -45,6 +45,7 @@
 #include "security/audit/audit_log_manager.h"
 #include "storage/api.h"
 #include "storage/directories.h"
+#include "syschecks/hugepages.h"
 #include "syschecks/syschecks.h"
 #include "utils/file_io.h"
 #include "utils/human.h"
@@ -792,6 +793,18 @@ void application::check_environment() {
     syschecks::systemd_message("checking environment (CPU, Mem)").get();
     syschecks::cpu();
     syschecks::memory(config::node().developer_mode());
+    if (config::shard_local_cfg().code_hugepages_enabled()) { // apply the configured value once during the environment check
+        syschecks::promote_code_to_hugepages();
+    }
+    _code_hugepages_binding.emplace(
+      config::shard_local_cfg().code_hugepages_enabled.bind());
+    _code_hugepages_binding->watch([this] { // presumably invoked when the property's value changes - confirm against the binding API
+        if ((*_code_hugepages_binding)()) {
+            syschecks::promote_code_to_hugepages(); // NOTE(review): walks and touches every read-only segment synchronously inside this callback - confirm that is acceptable on the calling thread
+        } else {
+            syschecks::demote_code_from_hugepages();
+        }
+    });
     memory_groups().log_memory_group_allocations(_log);
     storage::directories::initialize(
       config::node().data_directory().as_sstring())
diff --git a/src/v/redpanda/application.h b/src/v/redpanda/application.h
index 0a670deadf1ab..c71f304429708 100644
--- a/src/v/redpanda/application.h
+++ b/src/v/redpanda/application.h
@@ -292,6 +292,7 @@ class application : public ssx::sharded_service_container {
     ss::sharded _scheduling_groups_probe;
 
     std::optional> _abort_on_oom;
+    std::optional> _code_hugepages_binding; // holds the result of code_hugepages_enabled.bind(); NOTE(review): template arguments look stripped in this rendering - confirm
     ss::sharded _memory_sampling;
 
     ss::sharded _rpc;
diff --git a/src/v/syschecks/BUILD b/src/v/syschecks/BUILD
index 50db3879cb6b0..d8f820ca67bbf 100644
--- a/src/v/syschecks/BUILD
+++ b/src/v/syschecks/BUILD
@@ -3,10 +3,12 @@
load("//bazel:build.bzl", "redpanda_cc_library")
 redpanda_cc_library(
     name = "syschecks",
     srcs = [
+        "hugepages.cc",
         "pidfile.cc",
         "syschecks.cc",
     ],
     hdrs = [
+        "hugepages.h",
         "syschecks.h",
     ],
     visibility = ["//visibility:public"],
diff --git a/src/v/syschecks/hugepages.cc b/src/v/syschecks/hugepages.cc
new file mode 100644
index 0000000000000..46f979845fafb
--- /dev/null
+++ b/src/v/syschecks/hugepages.cc
@@ -0,0 +1,158 @@
+// Copyright 2026 Redpanda Data, Inc.
+//
+// Use of this software is governed by the Business Source License
+// included in the file licenses/BSL.md
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0
+
+#include "syschecks/hugepages.h"
+
+#include "base/vlog.h"
+#include "syschecks/syschecks.h"
+
+#include <sys/mman.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <link.h>
+#include <unistd.h>
+
+// MADV_COLLAPSE was added in Linux 6.1. Define it for older headers.
+#ifndef MADV_COLLAPSE
+#define MADV_COLLAPSE 25
+#endif
+
+namespace syschecks {
+
+namespace {
+
+/// Page size of the running system, as reported by sysconf(_SC_PAGESIZE).
+uintptr_t base_page_size() {
+    static const auto size = static_cast<uintptr_t>(::sysconf(_SC_PAGESIZE));
+    return size;
+}
+
+/// Invoke fn(addr, len) for each non-writable PT_LOAD segment across
+/// all loaded ELF objects (main binary + shared libraries). This covers
+/// .text (PF_R|PF_X) and .rodata (PF_R) segments.
+///
+/// The range handed to fn is widened to page boundaries: madvise(2) fails
+/// with EINVAL when the start address is not page-aligned, and the p_vaddr
+/// of a segment is not required to be.
+template<typename Fn>
+void for_each_ro_segment(Fn fn) {
+    dl_iterate_phdr(
+      [](struct dl_phdr_info* info, size_t /*size*/, void* data) -> int {
+          auto& callback = *static_cast<Fn*>(data);
+          const uintptr_t page_mask = base_page_size() - 1;
+          for (int i = 0; i < info->dlpi_phnum; ++i) {
+              const auto& phdr = info->dlpi_phdr[i];
+              if (phdr.p_type != PT_LOAD) {
+                  continue;
+              }
+              // Skip writable segments (.data, .bss).
+              if (phdr.p_flags & PF_W) {
+                  continue;
+              }
+              if (phdr.p_memsz == 0) {
+                  continue;
+              }
+              const uintptr_t seg_begin = info->dlpi_addr + phdr.p_vaddr;
+              const uintptr_t seg_end = seg_begin + phdr.p_memsz;
+              // Round the start down and the end up to whole pages.
+              const uintptr_t begin = seg_begin & ~page_mask;
+              const uintptr_t end = (seg_end + page_mask) & ~page_mask;
+              callback(
+                reinterpret_cast<void*>(begin),
+                static_cast<size_t>(end - begin));
+          }
+          return 0; // continue iteration
+      },
+      &fn);
+}
+
+} // namespace
+
+void promote_code_to_hugepages() {
+    size_t total_bytes = 0;
+    size_t marked_bytes = 0;
+    size_t collapsed_bytes = 0;
+
+    for_each_ro_segment([&](void* addr, size_t len) {
+        total_bytes += len;
+
+        // Mark the VMA for huge pages. In "madvise" THP mode (the common
+        // default), khugepaged only scans VMAs with VM_HUGEPAGE set, so this
+        // is required for ongoing huge page maintenance — not just a hint.
+        if (::madvise(addr, len, MADV_HUGEPAGE) == 0) {
+            marked_bytes += len;
+        }
+
+        // Fault in all pages so MADV_COLLAPSE has something to work with.
+        // At startup most pages are still demand-paged.
+        // In theory this is not needed with MADV_COLLAPSE but the docs leave a
+        // cop out so we are just explicit in any case.
+        // The reads go through a volatile pointer: a plain load whose result
+        // is unused is dead code and may be removed by the optimizer.
+        // Incompatible with ASAN, disable if on
+#if !__has_feature(address_sanitizer)
+        const auto* base = static_cast<const volatile char*>(addr);
+        const size_t stride = base_page_size();
+        for (size_t off = 0; off < len; off += stride) {
+            [[maybe_unused]] const char c = base[off];
+        }
+#endif
+
+        // Synchronously collapse 4 KB pages into 2 MB huge pages
+        // (Linux 6.1+). Without this, khugepaged promotes pages in the
+        // background over the next few seconds; MADV_COLLAPSE makes it
+        // immediate (best effort).
+        if (::madvise(addr, len, MADV_COLLAPSE) == 0) {
+            collapsed_bytes += len;
+        }
+    });
+
+    if (total_bytes > 0) {
+        vlog(
+          checklog.info,
+          "hugepages: {}/{} MiB marked, {}/{} MiB collapsed",
+          marked_bytes / (1024 * 1024),
+          total_bytes / (1024 * 1024),
+          collapsed_bytes / (1024 * 1024),
+          total_bytes / (1024 * 1024));
+    }
+}
+
+void demote_code_from_hugepages() {
+    size_t total_bytes = 0;
+    size_t demoted_bytes = 0;
+
+    for_each_ro_segment([&](void* addr, size_t len) {
+        total_bytes += len;
+
+        // Prevent khugepaged from re-promoting these pages.
+        if (::madvise(addr, len, MADV_NOHUGEPAGE) != 0) {
+            return;
+        }
+
+        // MADV_NOHUGEPAGE only prevents future promotions — existing PMD
+        // entries for file-backed pages are not split. MADV_DONTNEED drops
+        // the page table entries. They will be re-faulted at 4 KB granularity
+        // (since MADV_NOHUGEPAGE is set).
+        if (::madvise(addr, len, MADV_DONTNEED) == 0) {
+            demoted_bytes += len;
+        }
+    });
+
+    if (total_bytes > 0) {
+        vlog(
+          checklog.info,
+          "hugepages: demoted {}/{} MiB from huge pages",
+          demoted_bytes / (1024 * 1024),
+          total_bytes / (1024 * 1024));
+    }
+}
+
+} // namespace syschecks
diff --git a/src/v/syschecks/hugepages.h b/src/v/syschecks/hugepages.h
new file mode 100644
index 0000000000000..77a6c87b54687
--- /dev/null
+++ b/src/v/syschecks/hugepages.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright 2026 Redpanda Data, Inc.
+ *
+ * Use of this software is governed by the Business Source License
+ * included in the file licenses/BSL.md
+ *
+ * As of the Change Date specified in that file, in accordance with
+ * the Business Source License, use of this software will be governed
+ * by the Apache License, Version 2.0
+ */
+
+#pragma once
+
+namespace syschecks {
+
+/// Best-effort promotion of the process's read-only ELF segments to
+/// transparent huge pages.
+///
+/// Every non-writable PT_LOAD segment (.text and .rodata) of every loaded
+/// ELF object is marked with MADV_HUGEPAGE, faulted in (skipped under ASAN),
+/// and then collapsed with MADV_COLLAPSE. Individual madvise failures are
+/// tolerated; a summary of the MiB marked and collapsed is logged.
+void promote_code_to_hugepages();
+
+/// Undo the effect of promote_code_to_hugepages(). Marks the same segments
+/// with MADV_NOHUGEPAGE and then drops their page table entries with
+/// MADV_DONTNEED so they are re-faulted at base-page granularity. Best
+/// effort; a summary of the MiB demoted is logged.
+void demote_code_from_hugepages();
+
+} // namespace syschecks