From 35a0009dd608a8b944d9773aeed93b7e4d7d8413 Mon Sep 17 00:00:00 2001 From: Thomas Date: Sat, 30 May 2026 21:43:00 +0200 Subject: [PATCH] optionally use MADV_GUARD_INSTALL for large allocation guard pages Add CONFIG_GUARD_PAGES_USE_MADVISE (default false) to install large allocation guard regions with MADV_GUARD_INSTALL (Linux 6.13+) inside a single read-write mapping instead of separate PROT_NONE mappings, keeping each large allocation to one VMA instead of three. This is preserved through allocate_pages(), allocate_pages_aligned(), the region quarantine and in-place realloc shrink so it holds under allocation churn, including under CONFIG_LABEL_MEMORY where the quarantined region is named as a whole to avoid splitting the VMA. Kernel support is probed and cached at runtime. Guard installation is best-effort: it falls back to the PROT_NONE scheme whenever madvise fails, including EINVAL on VM_LOCKED mappings from mlockall(MCL_FUTURE), which also latches the feature off to avoid retrying on every allocation. It is off by default because the guard bytes are then accounted as committed memory (resident memory and total address space are unchanged), which regresses strict overcommit (vm.overcommit_memory=2). 
--- Makefile | 9 ++++-- README.md | 13 +++++++++ config/default.mk | 1 + config/light.mk | 1 + h_malloc.c | 8 ++++-- memory.c | 60 +++++++++++++++++++++++++++++++++++++++ memory.h | 8 ++++++ pages.c | 72 ++++++++++++++++++++++++++++++++--------------- 8 files changed, 146 insertions(+), 26 deletions(-) diff --git a/Makefile b/Makefile index 7ad20e87..f0ad3b8b 100644 --- a/Makefile +++ b/Makefile @@ -93,6 +93,10 @@ ifeq (,$(filter $(CONFIG_LABEL_MEMORY),true false)) $(error CONFIG_LABEL_MEMORY must be true or false) endif +ifeq (,$(filter $(CONFIG_GUARD_PAGES_USE_MADVISE),true false)) + $(error CONFIG_GUARD_PAGES_USE_MADVISE must be true or false) +endif + CPPFLAGS += \ -DCONFIG_SEAL_METADATA=$(CONFIG_SEAL_METADATA) \ -DZERO_ON_FREE=$(CONFIG_ZERO_ON_FREE) \ @@ -113,7 +117,8 @@ CPPFLAGS += \ -DN_ARENA=$(CONFIG_N_ARENA) \ -DCONFIG_STATS=$(CONFIG_STATS) \ -DCONFIG_SELF_INIT=$(CONFIG_SELF_INIT) \ - -DCONFIG_LABEL_MEMORY=$(CONFIG_LABEL_MEMORY) + -DCONFIG_LABEL_MEMORY=$(CONFIG_LABEL_MEMORY) \ + -DGUARD_PAGES_USE_MADVISE=$(CONFIG_GUARD_PAGES_USE_MADVISE) $(OUT)/libhardened_malloc$(SUFFIX).so: $(OBJECTS) | $(OUT) $(CC) $(CFLAGS) $(LDFLAGS) -shared $^ $(LDLIBS) -o $@ @@ -125,7 +130,7 @@ $(OUT)/chacha.o: chacha.c chacha.h util.h $(CONFIG_FILE) | $(OUT) $(COMPILE.c) $(OUTPUT_OPTION) $< $(OUT)/h_malloc.o: h_malloc.c include/h_malloc.h mutex.h memory.h pages.h random.h util.h $(CONFIG_FILE) | $(OUT) $(COMPILE.c) $(OUTPUT_OPTION) $< -$(OUT)/memory.o: memory.c memory.h util.h $(CONFIG_FILE) | $(OUT) +$(OUT)/memory.o: memory.c memory.h pages.h util.h $(CONFIG_FILE) | $(OUT) $(COMPILE.c) $(OUTPUT_OPTION) $< $(OUT)/new.o: new.cc include/h_malloc.h util.h $(CONFIG_FILE) | $(OUT) $(COMPILE.cc) $(OUTPUT_OPTION) $< diff --git a/README.md b/README.md index 7ffe1142..d37f4a98 100644 --- a/README.md +++ b/README.md @@ -333,6 +333,19 @@ The following integer configuration options are available: allocations use the slab allocation size class scheme instead of page size granularity. 
See the [section on size classes](#size-classes) below for details. +* `CONFIG_GUARD_PAGES_USE_MADVISE`: `true` or `false` (default) to control whether the + guard regions for large allocations are created with `MADV_GUARD_INSTALL` + (Linux 6.13+) inside a single read-write mapping instead of separate + `PROT_NONE` mappings. This keeps each large allocation to a single VMA instead + of three, which substantially reduces VMA pressure on systems making many + large allocations. The kernel feature is probed at runtime with a fallback to + the `PROT_NONE` scheme when it is unavailable; the probe trusts the `madvise` + return value, so the feature must be validated on a real kernel (qemu-user, for + example, silently ignores the advice). It is off by default because + the guarded bytes are then accounted as committed memory (resident memory and + the total address space are unchanged, since the guard pages are never backed + and the same amount of address space is reserved either way), which can be a + problem under strict overcommit (`vm.overcommit_memory=2`). 
There will be more control over enabled features in the future along with control over fairly arbitrarily chosen values like the size of empty slab diff --git a/config/default.mk b/config/default.mk index d8f03a78..38955564 100644 --- a/config/default.mk +++ b/config/default.mk @@ -22,3 +22,4 @@ CONFIG_N_ARENA := 4 CONFIG_STATS := false CONFIG_SELF_INIT := true CONFIG_LABEL_MEMORY := false +CONFIG_GUARD_PAGES_USE_MADVISE := false diff --git a/config/light.mk b/config/light.mk index da8676d9..92197a30 100644 --- a/config/light.mk +++ b/config/light.mk @@ -22,3 +22,4 @@ CONFIG_N_ARENA := 4 CONFIG_STATS := false CONFIG_SELF_INIT := true CONFIG_LABEL_MEMORY := false +CONFIG_GUARD_PAGES_USE_MADVISE := false diff --git a/h_malloc.c b/h_malloc.c index be66d964..6064c47c 100644 --- a/h_malloc.c +++ b/h_malloc.c @@ -1019,10 +1019,13 @@ static void regions_quarantine_deallocate_pages(void *p, size_t size, size_t gua return; } - if (unlikely(memory_map_fixed(p, size))) { + if (unlikely(memory_guard_or_protnone(p, size))) { if (unlikely(memory_purge(p, size))) { memset(p, 0, size); } + } else if (GUARD_PAGES_USE_MADVISE) { + // name the whole region so labeling (PR_SET_VMA_ANON_NAME) keeps it a single VMA + memory_set_name((char *)p - guard_size, size + guard_size * 2, "malloc large quarantine"); } else { memory_set_name(p, size, "malloc large quarantine"); } @@ -1596,7 +1599,7 @@ EXPORT void *h_realloc(void *old, size_t size) { // in-place shrink if (size < old_size) { void *new_end = (char *)old + size; - if (memory_map_fixed(new_end, old_guard_size)) { + if (memory_guard_or_protnone(new_end, old_guard_size)) { thread_seal_metadata(); return NULL; } @@ -1619,6 +1622,7 @@ EXPORT void *h_realloc(void *old, size_t size) { #ifdef HAVE_COMPATIBLE_MREMAP static const bool vma_merging_reliable = false; + // not updated for the madvise guard scheme; revisit guard handling before enabling if (vma_merging_reliable) { // in-place growth void *guard_end = (char *)old + old_size + 
old_guard_size; diff --git a/memory.c b/memory.c index 45826f38..b63a080c 100644 --- a/memory.c +++ b/memory.c @@ -1,9 +1,14 @@ #include +#include #include #include +#ifndef MADV_GUARD_INSTALL +#define MADV_GUARD_INSTALL 102 +#endif + #ifndef PR_SET_VMA #define PR_SET_VMA 0x53564d41 #endif @@ -13,6 +18,7 @@ #endif #include "memory.h" +#include "pages.h" #include "util.h" static void *memory_map_prot(size_t size, int prot) { @@ -30,6 +36,10 @@ void *memory_map(size_t size) { return memory_map_prot(size, PROT_NONE); } +void *memory_map_rw(size_t size) { + return memory_map_prot(size, PROT_READ|PROT_WRITE); +} + #ifdef HAS_ARM_MTE // Note that PROT_MTE can't be cleared via mprotect void *memory_map_mte(size_t size) { @@ -117,6 +127,56 @@ bool memory_purge(void *ptr, size_t size) { return ret; } +// 0 = unknown, 1 = supported, -1 = unsupported/disabled +static atomic_int guard_install_state; + +// EINVAL means the mapping can't be guarded (VM_LOCKED via mlockall(MCL_FUTURE), which locks +// all future mappings), so latch the feature off; ENOMEM/EINTR are transient +bool memory_guard_install(void *ptr, size_t size) { + int saved_errno = errno; + if (likely(madvise(ptr, size, MADV_GUARD_INSTALL) == 0)) { + return false; + } + if (errno == EINVAL) { + atomic_store_explicit(&guard_install_state, -1, memory_order_relaxed); + } + errno = saved_errno; + return true; +} + +bool memory_guard_install_supported(void) { + int s = atomic_load_explicit(&guard_install_state, memory_order_relaxed); + if (likely(s)) { + return s > 0; + } + int saved_errno = errno; + void *p = mmap(NULL, PAGE_SIZE, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + // a transient mmap failure must not be cached as "unsupported"; leave the state unknown + if (p == MAP_FAILED) { + errno = saved_errno; + return false; + } + s = madvise(p, PAGE_SIZE, MADV_GUARD_INSTALL) == 0 ? 
1 : -1; + munmap(p, PAGE_SIZE); + errno = saved_errno; + // don't clobber a concurrent EINVAL latch from memory_guard_install + int expected = 0; + if (!atomic_compare_exchange_strong_explicit(&guard_install_state, &expected, s, + memory_order_relaxed, memory_order_relaxed)) { + s = expected; + } + return s > 0; +} + +// guard the range in place to keep it within a single VMA when the madvise scheme is active, +// falling back to a PROT_NONE remap on failure or when the scheme is off +bool memory_guard_or_protnone(void *ptr, size_t size) { + if (GUARD_PAGES_USE_MADVISE && memory_guard_install_supported()) { + return memory_guard_install(ptr, size) && memory_map_fixed(ptr, size); + } + return memory_map_fixed(ptr, size); +} + bool memory_set_name(UNUSED void *ptr, UNUSED size_t size, UNUSED const char *name) { if (CONFIG_LABEL_MEMORY) { return prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, ptr, size, name); diff --git a/memory.h b/memory.h index b1156386..df605e0b 100644 --- a/memory.h +++ b/memory.h @@ -12,9 +12,17 @@ #define CONFIG_LABEL_MEMORY false #endif +#ifndef GUARD_PAGES_USE_MADVISE +#define GUARD_PAGES_USE_MADVISE false +#endif + int get_metadata_key(void); void *memory_map(size_t size); +void *memory_map_rw(size_t size); +bool memory_guard_install(void *ptr, size_t size); +bool memory_guard_install_supported(void); +bool memory_guard_or_protnone(void *ptr, size_t size); #ifdef HAS_ARM_MTE void *memory_map_mte(size_t size); #endif diff --git a/pages.c b/pages.c index 26045ce3..3e0192a7 100644 --- a/pages.c +++ b/pages.c @@ -16,6 +16,22 @@ void *allocate_pages(size_t usable_size, size_t guard_size, bool unprotect, cons errno = ENOMEM; return NULL; } + // MADV_GUARD_INSTALL needs page-aligned ranges + if (GUARD_PAGES_USE_MADVISE && unprotect && (usable_size & (PAGE_SIZE - 1)) == 0 && + memory_guard_install_supported()) { + void *guarded = memory_map_rw(real_size); + if (likely(guarded != NULL)) { + memory_set_name(guarded, real_size, name); + void *usable = (char 
*)guarded + guard_size; + // on guard-install failure, fall back to the PROT_NONE scheme below + if (!guard_size || (!memory_guard_install(guarded, guard_size) && + !memory_guard_install((char *)usable + usable_size, guard_size))) { + return usable; + } + memory_unmap(guarded, real_size); + } + } + void *real = memory_map(real_size); if (unlikely(real == NULL)) { return NULL; @@ -48,38 +64,50 @@ void *allocate_pages_aligned(size_t usable_size, size_t alignment, size_t guard_ return NULL; } - void *real = memory_map(real_alloc_size); - if (unlikely(real == NULL)) { - return NULL; - } - memory_set_name(real, real_alloc_size, name); + bool use_madvise = GUARD_PAGES_USE_MADVISE && memory_guard_install_supported(); - void *usable = (char *)real + guard_size; + for (;;) { + void *real = use_madvise ? memory_map_rw(real_alloc_size) : memory_map(real_alloc_size); + if (unlikely(real == NULL)) { + return NULL; + } + memory_set_name(real, real_alloc_size, name); - size_t lead_size = align((uintptr_t)usable, alignment) - (uintptr_t)usable; - size_t trail_size = alloc_size - lead_size - usable_size; - void *base = (char *)usable + lead_size; + void *usable = (char *)real + guard_size; - if (unlikely(memory_protect_rw(base, usable_size))) { - memory_unmap(real, real_alloc_size); - return NULL; - } + size_t lead_size = align((uintptr_t)usable, alignment) - (uintptr_t)usable; + size_t trail_size = alloc_size - lead_size - usable_size; + void *base = (char *)usable + lead_size; - if (lead_size) { - if (unlikely(memory_unmap(real, lead_size))) { + if (!use_madvise && unlikely(memory_protect_rw(base, usable_size))) { memory_unmap(real, real_alloc_size); return NULL; } - } - if (trail_size) { - if (unlikely(memory_unmap((char *)base + usable_size + guard_size, trail_size))) { - memory_unmap(real, real_alloc_size); - return NULL; + if (lead_size) { + if (unlikely(memory_unmap(real, lead_size))) { + memory_unmap(real, real_alloc_size); + return NULL; + } } - } - return base; + if 
(trail_size) { + if (unlikely(memory_unmap((char *)base + usable_size + guard_size, trail_size))) { + memory_unmap(real, real_alloc_size); + return NULL; + } + } + + // on guard-install failure, retry with the PROT_NONE scheme + if (use_madvise && guard_size && (unlikely(memory_guard_install((char *)base - guard_size, guard_size)) || + unlikely(memory_guard_install((char *)base + usable_size, guard_size)))) { + memory_unmap((char *)base - guard_size, usable_size + guard_size * 2); + use_madvise = false; + continue; + } + + return base; + } } void deallocate_pages(void *usable, size_t usable_size, size_t guard_size) {