From f1da129642c57ce2548bf78e57fd9121b7e76d1c Mon Sep 17 00:00:00 2001
From: Dmitry Lenev <dmitry.lenev@percona.com>
Date: Fri, 15 May 2026 13:20:38 +0200
Subject: [PATCH] PS-11161 [8.0]: mem_root_deque performance optimizations
 (part 1).

Reduce the target size of a mem_root_deque block from 1 kB to 256 bytes.

In the majority of cases mem_root_deque is used to store pointers
(e.g. to Item objects), hence a 1 kB block can store up to 128
elements on a 64-bit architecture.
OTOH, in many cases the number of elements actually stored in a
mem_root_deque is much lower. For example, for many queries the number
of fields in the SELECT list is far smaller than 64.

This means that in many cases the bigger part of such a 1 kB block
is simply wasted. Taking into account that there are 5-10
mem_root_deque instances even for fairly simple queries, the effect
of such waste becomes more pronounced: allocating these extra
unnecessary 5-10 kB on the main MEM_ROOT can trigger it to request
another block from malloc(). The latter has a small but visible
impact on performance in some sysbench tests.
---
 include/mem_root_deque.h           | 23 ++++++++++++++++++-----
 unittest/gunit/mem_root_deque-t.cc | 19 +++++++++++++------
 2 files changed, 31 insertions(+), 11 deletions(-)
diff --git a/include/mem_root_deque.h b/include/mem_root_deque.h
index ea325cd33e3c..b7ac8984e278 100644
--- a/include/mem_root_deque.h
+++ b/include/mem_root_deque.h
@@ -35,19 +35,32 @@
 
 template <class Element_type>
 static constexpr size_t FindElementsPerBlock() {
-  // Aim for 1 kB.
-  size_t base_number_elems =
-      1024 / sizeof(Element_type);  // NOLINT(bugprone-sizeof-expression)
+  // Upstream aims for 1 kB. In Percona Server we prefer 256-byte blocks
+  // instead.
+  //
+  // In the majority of cases mem_root_deque is used to store pointers
+  // (e.g. to Item objects). Also, in many cases the number of those
+  // pointers is going to be much lower than 128 (the number of elements
+  // which fit into a 1 kB block on a 64-bit system). This means that most
+  // of the 1 kB block is going to be wasted in those cases. The effect is
+  // multiplied by the fact that even a relatively simple query can use 5-10
+  // mem_root_deque instances. Allocating those unnecessary 5-10 kB on the
+  // main MEM_ROOT often triggers allocation of a new MEM_ROOT block,
+  // creating a small but visible performance overhead.
+  const size_t target_block_bytes = 256;
+  const size_t base_number_elems =
+      target_block_bytes /
+      sizeof(Element_type);  // NOLINT(bugprone-sizeof-expression)
 
   // Find the next power of two, rounded up. We should have at least 16 elements
   // per block to avoid allocating way too often (although the code itself
   // should work fine with 1, for debugging purposes).
-  for (size_t block_size = 16; block_size < 1024; ++block_size) {
+  for (size_t block_size = 16; block_size < target_block_bytes; ++block_size) {
     if (block_size >= base_number_elems) {
       return block_size;
     }
   }
-  return 1024;
+  return target_block_bytes;
 }
 
 /**
diff --git a/unittest/gunit/mem_root_deque-t.cc b/unittest/gunit/mem_root_deque-t.cc
index 9f6eb3dbc746..d13f87fece5e 100644
--- a/unittest/gunit/mem_root_deque-t.cc
+++ b/unittest/gunit/mem_root_deque-t.cc
@@ -281,10 +281,11 @@ TEST(MemRootDequeTest, ExponentialGrowth) {
   }
 
   // Check exponential growth occurred (not linear).
-  // With ~256 elements per block (1024 bytes / 4 bytes per int),
-  // 1000 elements need ~4 blocks.
-  // Growth sequence: 1 -> 2 -> 4 blocks allocated, exp goes 0 -> 1 -> 2
-  EXPECT_GE(d.back_growth_exp(), 2);   // At least 2 growth events
+  // With ~64 elements per block (256 bytes / 4 bytes per int),
+  // 1000 elements need ~16 blocks.
+  // Growth sequence: 1 -> 2 -> 4 -> 8 -> 16 blocks allocated,
+  // exp goes 0 -> 1 -> 2 -> 3 -> 4
+  EXPECT_GE(d.back_growth_exp(), 4);   // At least 4 growth events
   EXPECT_EQ(0, d.front_growth_exp());  // Front not grown yet
   EXPECT_EQ(0, d.first_block_idx());   // Blocks start at index 0
 
@@ -307,7 +308,7 @@ TEST(MemRootDequeTest, ExponentialGrowth) {
   }
 
   // Check front exponential growth occurred independently
-  EXPECT_GE(d.front_growth_exp(), 2);  // At least 2 growth events
+  EXPECT_GE(d.front_growth_exp(), 4);  // At least 4 growth events
   // Back exponent should not have changed (back didn't need more blocks)
   EXPECT_EQ(back_exp_after_push_back, d.back_growth_exp());
   // first_block_idx should have moved to accommodate front spare slots
@@ -319,7 +320,7 @@ TEST(MemRootDequeTest, ExponentialGrowth) {
   // blocks_allocated should be at most ~3x the minimum needed
   // (due to spare slots for future growth)
   size_t min_blocks_needed =
-      (d.size() + 255) / 256;  // ~8 blocks for 2000 elements
+      (d.size() + 63) / 64;  // ~32 blocks for 2000 elements
   EXPECT_LE(d.block_slots(), min_blocks_needed * 3);  // Allow 3x for spare
 }
 
@@ -356,6 +357,12 @@ TEST(MemRootDequeTest, MemoryEfficiency) {
 // Verifies exponential growth pattern (not quadratic).
 TEST(MemRootDequeTest, MemoryGrowthPushBack) {
   MEM_ROOT mem_root;
+  // Set MEM_ROOT block size low enough to avoid situation when two
+  // mem_root_deque blocks fit into a single MEM_ROOT block (this is
+  // necessary because MEM_ROOT::allocated_size() returns total size
+  // of memory used by MEM_ROOT blocks, and not the net size of memory
+  // allocated on it).
+  mem_root.set_block_size(128);
   mem_root_deque<int> d(&mem_root);
 
   std::vector<size_t> memory_at_growth;