diff --git a/include/mem_root_deque.h b/include/mem_root_deque.h
index ea325cd33e3c..b7ac8984e278 100644
--- a/include/mem_root_deque.h
+++ b/include/mem_root_deque.h
@@ -35,19 +35,34 @@
 
 template
 static constexpr size_t FindElementsPerBlock() {
-  // Aim for 1 kB.
-  size_t base_number_elems =
-      1024 / sizeof(Element_type);  // NOLINT(bugprone-sizeof-expression)
+  // Upstream aims for 1 kB. In Percona Server we prefer 256 byte blocks
+  // instead.
+  //
+  // In the majority of cases mem_root_deque is used to store pointers
+  // (e.g. to Item objects). Also in many cases number of those pointers
+  // is going to be much lower than 128 (number of elements which fit
+  // 1kb page on 64-bit system). This means that most of the 1kB block
+  // is going to be wasted in those cases. The effect is multiplied by
+  // the fact that even relatively simple query can use 5-10 mem_root_deque
+  // instances. Allocating those unnecessary 5-10kb on main MEM_ROOT
+  // often triggers allocation of new MEM_ROOT block creating small but
+  // visible performance overhead.
+  const size_t target_block_bytes = 256;
+  const size_t base_number_elems =
+      target_block_bytes /
+      sizeof(Element_type);  // NOLINT(bugprone-sizeof-expression)
 
   // Find the next power of two, rounded up. We should have at least 16 elements
   // per block to avoid allocating way too often (although the code itself
   // should work fine with 1, for debugging purposes).
-  for (size_t block_size = 16; block_size < 1024; ++block_size) {
+  // Double (rather than increment) so the result is a power of two.
+  for (size_t block_size = 16; block_size < target_block_bytes;
+       block_size *= 2) {
     if (block_size >= base_number_elems) {
       return block_size;
     }
   }
-  return 1024;
+  return target_block_bytes;
 }
 
 /**
diff --git a/unittest/gunit/mem_root_deque-t.cc b/unittest/gunit/mem_root_deque-t.cc
index 9f6eb3dbc746..d13f87fece5e 100644
--- a/unittest/gunit/mem_root_deque-t.cc
+++ b/unittest/gunit/mem_root_deque-t.cc
@@ -281,10 +281,11 @@ TEST(MemRootDequeTest, ExponentialGrowth) {
   }
 
   // Check exponential growth occurred (not linear).
-  // With ~256 elements per block (1024 bytes / 4 bytes per int),
-  // 1000 elements need ~4 blocks.
-  // Growth sequence: 1 -> 2 -> 4 blocks allocated, exp goes 0 -> 1 -> 2
-  EXPECT_GE(d.back_growth_exp(), 2);   // At least 2 growth events
+  // With ~64 elements per block (256 bytes / 4 bytes per int),
+  // 1000 elements need ~16 blocks.
+  // Growth sequence: 1 -> 2 -> 4 -> 8 -> 16 blocks allocated,
+  // exp goes 0 -> 1 -> 2 -> 3 -> 4
+  EXPECT_GE(d.back_growth_exp(), 4);   // At least 4 growth events
   EXPECT_EQ(0, d.front_growth_exp());  // Front not grown yet
   EXPECT_EQ(0, d.first_block_idx());   // Blocks start at index 0
 
@@ -307,7 +308,7 @@ TEST(MemRootDequeTest, ExponentialGrowth) {
   }
 
   // Check front exponential growth occurred independently
-  EXPECT_GE(d.front_growth_exp(), 2);  // At least 2 growth events
+  EXPECT_GE(d.front_growth_exp(), 4);  // At least 4 growth events
   // Back exponent should not have changed (back didn't need more blocks)
   EXPECT_EQ(back_exp_after_push_back, d.back_growth_exp());
   // first_block_idx should have moved to accommodate front spare slots
@@ -319,7 +320,7 @@ TEST(MemRootDequeTest, ExponentialGrowth) {
   // blocks_allocated should be at most ~3x the minimum needed
   // (due to spare slots for future growth)
   size_t min_blocks_needed =
-      (d.size() + 255) / 256;  // ~8 blocks for 2000 elements
+      (d.size() + 63) / 64;  // ~32 blocks for 2000 elements
   EXPECT_LE(d.block_slots(),
             min_blocks_needed * 3);  // Allow 3x for spare
 }
@@ -356,6 +357,12 @@ TEST(MemRootDequeTest, MemoryEfficiency) {
 // Verifies exponential growth pattern (not quadratic).
 TEST(MemRootDequeTest, MemoryGrowthPushBack) {
   MEM_ROOT mem_root;
+  // Set MEM_ROOT block size low enough to avoid situation when two
+  // mem_root_deque blocks fit into a single MEM_ROOT block (this is
+  // necessary because MEM_ROOT::allocated_size() returns total size
+  // of memory used by MEM_ROOT blocks, and not the net size of memory
+  // allocated on it).
+  mem_root.set_block_size(128);
   mem_root_deque d(&mem_root);
   std::vector memory_at_growth;
 