From 99d2465ee3eb563a1ab2d7c9e9da52b7b03b8970 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@scylladb.com>
Date: Sat, 16 Mar 2024 21:13:10 +0200
Subject: [PATCH] smp: allocate hugepages eagerly when kernel support is
 available

Instead of deferring the merging of pages into hugepages to the
transparent hugepage scanner, advise the kernel to do so immediately
using the new MADV_POPULATE_WRITE and MADV_COLLAPSE advice values.

Refactor the prefaulter to first attempt MADV_POPULATE_WRITE on a whole
hugepage's worth of memory. This should fault in the range as a
hugepage, but for good measure use MADV_COLLAPSE too (which is a no-op
if the work was already done by MADV_POPULATE_WRITE).
---
 src/core/prefault.hh |  3 +-
 src/core/smp.cc      | 88 +++++++++++++++++++++++++++++++++++---------
 2 files changed, 72 insertions(+), 19 deletions(-)

diff --git a/src/core/prefault.hh b/src/core/prefault.hh
index c84685e1954..a12f38a6c55 100644
--- a/src/core/prefault.hh
+++ b/src/core/prefault.hh
@@ -22,6 +22,7 @@
 #include
 #include
+#include <optional>
 #include
 #include
@@ -40,7 +41,7 @@
 public:
     explicit memory_prefaulter(const resource::resources& res, memory::internal::numa_layout layout);
     ~memory_prefaulter();
 private:
-    void work(std::vector<memory::internal::memory_range>& ranges, size_t page_size);
+    void work(std::vector<memory::internal::memory_range>& ranges, size_t page_size, std::optional<size_t> huge_page_size_opt);
 };
diff --git a/src/core/smp.cc b/src/core/smp.cc
index c43b4919cfa..eb01d2718ac 100644
--- a/src/core/smp.cc
+++ b/src/core/smp.cc
@@ -25,6 +25,10 @@ module;
 #include
 #include
 #include
+#include <fcntl.h>
+#include <optional>
+#include <regex>
+#include <sys/mman.h>
 
 #ifdef SEASTAR_MODULE
 module seastar;
@@ -35,6 +39,8 @@ module seastar;
 #include
 #include
 #include
+#include <seastar/core/align.hh>
+#include <seastar/core/posix.hh>
 #include "prefault.hh"
 #endif
@@ -173,11 +179,32 @@ smp::setup_prefaulter(const seastar::resource::resources& res, seastar::memory::
 #endif
 }
 
+static
+std::optional<size_t>
+get_huge_page_size() {
+    auto meminfo_fd = file_desc::open("/proc/meminfo", O_RDONLY | O_CLOEXEC);
+    std::string meminfo;
+    char buf[4096];
+    while (auto size_opt = meminfo_fd.read(buf, sizeof(buf))) {
+        if (!*size_opt) {
+            break;
+        }
+        meminfo.append(buf, *size_opt);
+    }
+    static std::regex re(R"(Hugepagesize:\s*(\d+) kB)");
+    auto m = std::smatch{};
+    if (std::regex_search(meminfo, m, re)) {
+        return std::stoi(m[1]) * size_t(1024);
+    }
+    return std::nullopt;
+}
+
 internal::memory_prefaulter::memory_prefaulter(const resource::resources& res, memory::internal::numa_layout layout) {
     for (auto& range : layout.ranges) {
         _layout_by_node_id[range.numa_node_id].push_back(std::move(range));
     }
     auto page_size = getpagesize();
+    auto huge_page_size_opt = get_huge_page_size();
     for (auto& numa_node_id_and_ranges : _layout_by_node_id) {
         auto& numa_node_id = numa_node_id_and_ranges.first;
         auto& ranges = numa_node_id_and_ranges.second;
@@ -191,8 +218,8 @@ internal::memory_prefaulter::memory_prefaulter(const resource::resources& res, m
             }
             a.set(cpuset);
         }
-        _worker_threads.emplace_back(a, [this, &ranges, page_size] {
-            work(ranges, page_size);
+        _worker_threads.emplace_back(a, [this, &ranges, page_size, huge_page_size_opt] {
+            work(ranges, page_size, huge_page_size_opt);
         });
     }
 }
@@ -205,12 +232,33 @@ internal::memory_prefaulter::~memory_prefaulter() {
 }
 
 void
-internal::memory_prefaulter::work(std::vector<memory::internal::memory_range>& ranges, size_t page_size) {
+internal::memory_prefaulter::work(std::vector<memory::internal::memory_range>& ranges, size_t page_size,
+        std::optional<size_t> huge_page_size_opt) {
     sched_param param = { .sched_priority = 0 }; // SCHED_IDLE doesn't work via thread attributes
     pthread_setschedparam(pthread_self(), SCHED_IDLE, &param);
     size_t current_range = 0;
-    const size_t batch_size = 512; // happens to match huge page size on x86, through not critical
+    const size_t batch_size = huge_page_size_opt.value_or(512*4096);
+    auto populate_memory_madvise = [works = true] (char* start, char* end) mutable {
+#ifdef MADV_POPULATE_WRITE
+        if (works) {
+            auto result = madvise(start, end - start, MADV_POPULATE_WRITE);
+            if (result == -1 && errno == EINVAL) {
+                // Unsupported by kernel
+                works = false;
+                return false;
+            }
+            // Ignore other errors. This is just an optimization anyway.
+#ifdef MADV_COLLAPSE
+            madvise(start, end - start, MADV_COLLAPSE);
+            // Also ignore problems with MADV_COLLAPSE, it's just an optimization.
+#endif
+            return true;
+        };
+#endif
+        (void)works; // suppress warning if block above is elided
+        return false;
+    };
     auto fault_in_memory = [] (char* p) {
         // Touch the page for write, but be sure not to modify anything
         // The compilers tend to optimize things, so prefer assembly
@@ -227,25 +275,29 @@ internal::memory_prefaulter::work(std::vector<memory::internal::memory_range>& r
         p1->fetch_or(0, std::memory_order_relaxed);
 #endif
     };
-    while (!_stop_request.load(std::memory_order_relaxed) && !ranges.empty()) {
-        auto& range = ranges[current_range];
-        // copy eveything into locals, or the optimizer will worry about them due
-        // to the cast below and not optimize anything
-        auto start = range.start;
-        auto end = range.end;
-        for (size_t i = 0; i < batch_size && start < end; ++i) {
+    auto populate_memory = [&] (char* start, char* end) {
+        if (populate_memory_madvise(start, end)) {
+            return;
+        }
+        while (start < end) {
             fault_in_memory(start);
             start += page_size;
         }
+    };
+    while (!_stop_request.load(std::memory_order_relaxed) && !ranges.empty()) {
+        auto& range = ranges[current_range];
+
+        auto batch_end = std::min(align_up(range.start + 1, batch_size), range.end);
+        populate_memory(range.start, batch_end);
+        range.start = batch_end;
+
         // An end-to-start scan for applications that manage two heaps that
         // grow towards each other.
-        for (size_t i = 0; i < batch_size && start < end; ++i) {
-            fault_in_memory(end - 1);
-            end -= page_size;
-        }
-        range.start = start;
-        range.end = end;
-        if (start >= end) {
+        auto batch_start = std::max(align_down(range.end - 1, batch_size), range.start);
+        populate_memory(batch_start, range.end);
+        range.end = batch_start;
+
+        if (range.start >= range.end) {
             ranges.erase(ranges.begin() + current_range);
             current_range = 0;
         }
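
Note (not part of the patch): below is a minimal standalone sketch of the populate-then-collapse idea the prefaulter uses above, for experimenting with it outside Seastar. It assumes a Linux kernel that defines MADV_POPULATE_WRITE (5.14+) and, optionally, MADV_COLLAPSE (6.1+); the helper name populate_range and the 2 MiB mapping size are illustrative, not taken from the patch.

// Standalone illustration only; the patch itself uses the Seastar-internal
// prefaulter above. Populate a writable range eagerly, then ask the kernel
// to collapse it into a hugepage right away instead of waiting for khugepaged.
#include <sys/mman.h>
#include <unistd.h>
#include <cerrno>
#include <cstdio>

// Returns true if the kernel populated the range for us, false if the
// advice is unsupported and the caller should touch the pages by hand.
static bool populate_range(char* start, size_t len) {
#ifdef MADV_POPULATE_WRITE
    if (madvise(start, len, MADV_POPULATE_WRITE) == 0) {
#ifdef MADV_COLLAPSE
        // Best effort: failures here are ignored, it is only an optimization.
        madvise(start, len, MADV_COLLAPSE);
#endif
        return true;
    }
    if (errno != EINVAL) {
        return true; // other errors: treat as done, this is only an optimization
    }
#endif
    return false;
}

int main() {
    const size_t page_size = sysconf(_SC_PAGESIZE);
    const size_t len = 2 * 1024 * 1024; // one x86-64 hugepage worth of memory
    char* p = static_cast<char*>(mmap(nullptr, len, PROT_READ | PROT_WRITE,
                                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    if (!populate_range(p, len)) {
        for (char* q = p; q < p + len; q += page_size) {
            *q = 0; // fallback: fault in each page by touching it
        }
    }
    std::printf("populated %zu bytes at %p\n", len, static_cast<void*>(p));
    munmap(p, len);
    return 0;
}

The patch applies the same sequence per prefault batch, with the batch sized by the Hugepagesize value read from /proc/meminfo.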
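
Similarly, a sketch of the /proc/meminfo probe that get_huge_page_size() performs in the patch, written against the standard library instead of Seastar's file_desc; the function name huge_page_size_from_meminfo is made up for this example.

#include <cstddef>
#include <fstream>
#include <optional>
#include <regex>
#include <sstream>
#include <string>

// Parse "Hugepagesize:    2048 kB" out of /proc/meminfo and return it in bytes.
static std::optional<size_t> huge_page_size_from_meminfo() {
    std::ifstream f("/proc/meminfo");
    if (!f) {
        return std::nullopt;
    }
    std::ostringstream ss;
    ss << f.rdbuf(); // slurp the whole pseudo-file
    const std::string meminfo = ss.str();
    static const std::regex re(R"(Hugepagesize:\s*(\d+) kB)");
    std::smatch m;
    if (std::regex_search(meminfo, m, re)) {
        return std::stoul(m[1]) * size_t(1024); // the kernel reports kB
    }
    return std::nullopt; // no hugepage line: leave the batch size at its default
}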