From 99d2465ee3eb563a1ab2d7c9e9da52b7b03b8970 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@scylladb.com>
Date: Sat, 16 Mar 2024 21:13:10 +0200
Subject: [PATCH] smp: allocate hugepages eagerly when kernel support is
 available

Instead of deferring the merging of pages into hugepages to the
transparent hugepage scanner, advise the kernel to do so immediately
using the new MADV_POPULATE_WRITE and MADV_COLLAPSE advice values.

Refactor the prefaulter to first attempt MADV_POPULATE_WRITE on a whole
hugepage's worth of memory. This should fault in the range as a
hugepage, but for good measure use MADV_COLLAPSE too (which is a no-op
if the work was already done by MADV_POPULATE_WRITE).
---
 src/core/prefault.hh |  3 +-
 src/core/smp.cc      | 88 +++++++++++++++++++++++++++++++++++---------
 2 files changed, 72 insertions(+), 19 deletions(-)

diff --git a/src/core/prefault.hh b/src/core/prefault.hh
index c84685e1954..a12f38a6c55 100644
--- a/src/core/prefault.hh
+++ b/src/core/prefault.hh
@@ -22,6 +22,7 @@
 #include
 #include
+#include <optional>
 #include
 #include
@@ -40,7 +41,7 @@
 public:
     explicit memory_prefaulter(const resource::resources& res, memory::internal::numa_layout layout);
     ~memory_prefaulter();
 private:
-    void work(std::vector<memory::internal::memory_range>& ranges, size_t page_size);
+    void work(std::vector<memory::internal::memory_range>& ranges, size_t page_size, std::optional<size_t> huge_page_size_opt);
 };
diff --git a/src/core/smp.cc b/src/core/smp.cc
index c43b4919cfa..eb01d2718ac 100644
--- a/src/core/smp.cc
+++ b/src/core/smp.cc
@@ -25,6 +25,10 @@ module;
 #include
 #include
 #include
+#include <fcntl.h>
+#include <optional>
+#include <regex>
+#include <sys/mman.h>
 
 #ifdef SEASTAR_MODULE
 module seastar;
@@ -35,6 +39,8 @@ module seastar;
 #include
 #include
 #include
+#include <seastar/core/align.hh>
+#include <seastar/core/posix.hh>
 #include "prefault.hh"
 #endif
@@ -173,11 +179,32 @@ smp::setup_prefaulter(const seastar::resource::resources& res, seastar::memory::
 #endif
 }
 
+static
+std::optional<size_t>
+get_huge_page_size() {
+    auto meminfo_fd = file_desc::open("/proc/meminfo", O_RDONLY | O_CLOEXEC);
+    std::string meminfo;
+    char buf[4096];
+    while (auto size_opt = meminfo_fd.read(buf, sizeof(buf))) {
+        if (!*size_opt) {
+            break;
+        }
+        meminfo.append(buf, *size_opt);
+    }
+    static std::regex re(R"(Hugepagesize:\s*(\d+) kB)");
+    auto m = std::smatch{};
+    if (std::regex_search(meminfo, m, re)) {
+        return std::stoi(m[1]) * size_t(1024);
+    }
+    return std::nullopt;
+}
+
 internal::memory_prefaulter::memory_prefaulter(const resource::resources& res, memory::internal::numa_layout layout) {
     for (auto& range : layout.ranges) {
         _layout_by_node_id[range.numa_node_id].push_back(std::move(range));
     }
     auto page_size = getpagesize();
+    auto huge_page_size_opt = get_huge_page_size();
     for (auto& numa_node_id_and_ranges : _layout_by_node_id) {
         auto& numa_node_id = numa_node_id_and_ranges.first;
         auto& ranges = numa_node_id_and_ranges.second;
@@ -191,8 +218,8 @@ internal::memory_prefaulter::memory_prefaulter(const resource::resources& res, m
             }
             a.set(cpuset);
         }
-        _worker_threads.emplace_back(a, [this, &ranges, page_size] {
-            work(ranges, page_size);
+        _worker_threads.emplace_back(a, [this, &ranges, page_size, huge_page_size_opt] {
+            work(ranges, page_size, huge_page_size_opt);
         });
     }
 }
@@ -205,12 +232,33 @@ internal::memory_prefaulter::~memory_prefaulter() {
 }
 
 void
-internal::memory_prefaulter::work(std::vector<memory::internal::memory_range>& ranges, size_t page_size) {
+internal::memory_prefaulter::work(std::vector<memory::internal::memory_range>& ranges, size_t page_size,
+        std::optional<size_t> huge_page_size_opt) {
     sched_param param = { .sched_priority = 0 }; // SCHED_IDLE doesn't work via thread attributes
     pthread_setschedparam(pthread_self(), SCHED_IDLE, &param);
     size_t current_range = 0;
-    const size_t batch_size = 512; // happens to match huge page size on x86, through not critical
+    const size_t batch_size = huge_page_size_opt.value_or(512*4096);
+    auto populate_memory_madvise = [works = true] (char* start, char* end) mutable {
+#ifdef MADV_POPULATE_WRITE
+        if (works) {
+            auto result = madvise(start, end - start, MADV_POPULATE_WRITE);
+            if (result == -1 && errno == EINVAL) {
+                // Unsupported by kernel
+                works = false;
+                return false;
+            }
+            // Ignore other errors. This is just an optimization anyway.
+#ifdef MADV_COLLAPSE
+            madvise(start, end - start, MADV_COLLAPSE);
+            // Also ignore problems with MADV_COLLAPSE, it's just an optimization.
+#endif
+            return true;
+        };
+#endif
+        (void)works; // suppress warning if block above is elided
+        return false;
+    };
     auto fault_in_memory = [] (char* p) {
         // Touch the page for write, but be sure not to modify anything
         // The compilers tend to optimize things, so prefer assembly
@@ -227,25 +275,29 @@ internal::memory_prefaulter::work(std::vector<memory::internal::memory_range>& r
         p1->fetch_or(0, std::memory_order_relaxed);
 #endif
     };
-    while (!_stop_request.load(std::memory_order_relaxed) && !ranges.empty()) {
-        auto& range = ranges[current_range];
-        // copy eveything into locals, or the optimizer will worry about them due
-        // to the cast below and not optimize anything
-        auto start = range.start;
-        auto end = range.end;
-        for (size_t i = 0; i < batch_size && start < end; ++i) {
+    auto populate_memory = [&] (char* start, char* end) {
+        if (populate_memory_madvise(start, end)) {
+            return;
+        }
+        while (start < end) {
             fault_in_memory(start);
             start += page_size;
         }
+    };
+    while (!_stop_request.load(std::memory_order_relaxed) && !ranges.empty()) {
+        auto& range = ranges[current_range];
+
+        auto batch_end = std::min(align_up(range.start + 1, batch_size), range.end);
+        populate_memory(range.start, batch_end);
+        range.start = batch_end;
+
         // An end-to-start scan for applications that manage two heaps that
         // grow towards each other.
-        for (size_t i = 0; i < batch_size && start < end; ++i) {
-            fault_in_memory(end - 1);
-            end -= page_size;
-        }
-        range.start = start;
-        range.end = end;
-        if (start >= end) {
+        auto batch_start = std::max(align_down(range.end - 1, batch_size), range.start);
+        populate_memory(batch_start, range.end);
+        range.end = batch_start;
+
+        if (range.start >= range.end) {
             ranges.erase(ranges.begin() + current_range);
             current_range = 0;
         }
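
Note (not part of the patch): below is a minimal standalone sketch of the populate-then-collapse idea the prefaulter uses above, for experimenting with it outside Seastar. It assumes a Linux kernel that defines MADV_POPULATE_WRITE (5.14+) and, optionally, MADV_COLLAPSE (6.1+); the helper name populate_range and the 2 MiB mapping size are illustrative, not taken from the patch.

// Standalone illustration only; the patch itself uses the Seastar-internal
// prefaulter above. Populate a writable range eagerly, then ask the kernel
// to collapse it into a hugepage right away instead of waiting for khugepaged.
#include <sys/mman.h>
#include <unistd.h>
#include <cerrno>
#include <cstdio>

// Returns true if the kernel populated the range for us, false if the
// advice is unsupported and the caller should touch the pages by hand.
static bool populate_range(char* start, size_t len) {
#ifdef MADV_POPULATE_WRITE
    if (madvise(start, len, MADV_POPULATE_WRITE) == 0) {
#ifdef MADV_COLLAPSE
        // Best effort: failures here are ignored, it is only an optimization.
        madvise(start, len, MADV_COLLAPSE);
#endif
        return true;
    }
    if (errno != EINVAL) {
        return true; // other errors: treat as done, this is only an optimization
    }
#endif
    return false;
}

int main() {
    const size_t page_size = sysconf(_SC_PAGESIZE);
    const size_t len = 2 * 1024 * 1024; // one x86-64 hugepage worth of memory
    char* p = static_cast<char*>(mmap(nullptr, len, PROT_READ | PROT_WRITE,
                                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    if (!populate_range(p, len)) {
        for (char* q = p; q < p + len; q += page_size) {
            *q = 0; // fallback: fault in each page by touching it
        }
    }
    std::printf("populated %zu bytes at %p\n", len, static_cast<void*>(p));
    munmap(p, len);
    return 0;
}

The patch applies the same sequence per prefault batch, with the batch sized by the Hugepagesize value read from /proc/meminfo.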
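
Similarly, a sketch of the /proc/meminfo probe that get_huge_page_size() performs in the patch, written against the standard library instead of Seastar's file_desc; the function name huge_page_size_from_meminfo is made up for this example.

#include <cstddef>
#include <fstream>
#include <optional>
#include <regex>
#include <sstream>
#include <string>

// Parse "Hugepagesize:    2048 kB" out of /proc/meminfo and return it in bytes.
static std::optional<size_t> huge_page_size_from_meminfo() {
    std::ifstream f("/proc/meminfo");
    if (!f) {
        return std::nullopt;
    }
    std::ostringstream ss;
    ss << f.rdbuf(); // slurp the whole pseudo-file
    const std::string meminfo = ss.str();
    static const std::regex re(R"(Hugepagesize:\s*(\d+) kB)");
    std::smatch m;
    if (std::regex_search(meminfo, m, re)) {
        return std::stoul(m[1]) * size_t(1024); // the kernel reports kB
    }
    return std::nullopt; // no hugepage line: leave the batch size at its default
}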