LibC: Implement a faster memset routine for x86-64 in assembly
This commit addresses the following shortcomings of our current, simple
and elegant memset function:
- REP STOSB/STOSQ has considerable startup overhead, so it's impractical to
  use for smaller sizes.
- Up until very recently, AMD CPUs didn't have support for "Enhanced REP
  MOVSB/STOSB", so it performed pretty poorly on them.

With this commit applied, I could measure a ~5% decrease in `test-js`'s
runtime when I used qemu's TCG backend. The implementation is based on
the following article from Microsoft:

https://msrc-blog.microsoft.com/2021/01/11/building-faster-amd64-memset-routines

Two versions of the routine are implemented: one that uses the ERMS
extension mentioned above, and one that performs plain SSE stores. The
version appropriate for the CPU is selected at load time using an IFUNC.
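For readers unfamiliar with GNU IFUNCs, the following rough sketch (illustrative only; every identifier in it is made up, and the real resolver added by this commit lives in Userland/Libraries/LibC/arch/x86_64/memset.cpp below) shows the general shape of load-time selection:

#include <stddef.h>

extern "C" {

// A plain, portable implementation used as the fallback.
static void* memset_generic(void* dest, int c, size_t n)
{
    auto* p = static_cast<unsigned char*>(dest);
    for (size_t i = 0; i < n; ++i)
        p[i] = static_cast<unsigned char>(c);
    return dest;
}

// Stand-in for an optimized variant; in the real patch this is assembly.
static void* memset_optimized(void* dest, int c, size_t n)
{
    return memset_generic(dest, c, n);
}

// The resolver runs once, while the dynamic loader relocates the library, and
// its return value becomes the target of every later call through the symbol.
[[gnu::used]] static decltype(&memset_generic) resolve_my_memset()
{
    bool cpu_has_fast_path = false; // a real resolver queries CPUID here
    return cpu_has_fast_path ? memset_optimized : memset_generic;
}

// Declaring the symbol with gnu::ifunc ties it to the resolver above; callers
// just call my_memset() and never see the dispatch.
[[gnu::ifunc("resolve_my_memset")]] void* my_memset(void* dest, int c, size_t n);

}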
BertalanD authored and awesomekling committed May 1, 2022
1 parent 484f70f commit bcf124c
Showing 5 changed files with 262 additions and 10 deletions.
2 changes: 1 addition & 1 deletion Userland/DynamicLoader/CMakeLists.txt
@@ -15,7 +15,7 @@ elseif ("${SERENITY_ARCH}" STREQUAL "i686")
file(GLOB LIBC_SOURCES3 "../Libraries/LibC/arch/i386/*.S")
set(ELF_SOURCES ${ELF_SOURCES} ../Libraries/LibELF/Arch/i386/entry.S ../Libraries/LibELF/Arch/i386/plt_trampoline.S)
elseif ("${SERENITY_ARCH}" STREQUAL "x86_64")
-file(GLOB LIBC_SOURCES3 "../Libraries/LibC/arch/x86_64/*.S")
+file(GLOB LIBC_SOURCES3 "../Libraries/LibC/arch/x86_64/*.S" "../Libraries/LibC/arch/x86_64/*.cpp")
set(ELF_SOURCES ${ELF_SOURCES} ../Libraries/LibELF/Arch/x86_64/entry.S ../Libraries/LibELF/Arch/x86_64/plt_trampoline.S)
endif()

3 changes: 2 additions & 1 deletion Userland/Libraries/LibC/CMakeLists.txt
@@ -84,7 +84,8 @@ elseif ("${SERENITY_ARCH}" STREQUAL "i686")
set(CRTI_SOURCE "arch/i386/crti.S")
set(CRTN_SOURCE "arch/i386/crtn.S")
elseif ("${SERENITY_ARCH}" STREQUAL "x86_64")
-set(ASM_SOURCES "arch/x86_64/setjmp.S")
+set(LIBC_SOURCES ${LIBC_SOURCES} "arch/x86_64/memset.cpp")
+set(ASM_SOURCES "arch/x86_64/setjmp.S" "arch/x86_64/memset.S")
set(ELF_SOURCES ${ELF_SOURCES} ../LibELF/Arch/x86_64/entry.S ../LibELF/Arch/x86_64/plt_trampoline.S)
set(CRTI_SOURCE "arch/x86_64/crti.S")
set(CRTN_SOURCE "arch/x86_64/crtn.S")
196 changes: 196 additions & 0 deletions Userland/Libraries/LibC/arch/x86_64/memset.S
@@ -0,0 +1,196 @@
/*
* Copyright (c) 2022, Daniel Bertalan <[email protected]>
*
* SPDX-License-Identifier: BSD-2-Clause
*/

// Optimized x86-64 memset routine based on the following post from the MSRC blog:
// https://msrc-blog.microsoft.com/2021/01/11/building-faster-amd64-memset-routines
//
// This algorithm
// - makes use of REP STOSB on CPUs where it is fast (a notable exception is
// qemu's TCG backend used in CI)
// - uses SSE stores otherwise
// - performs quick branchless stores for sizes < 64 bytes where REP STOSB would have
// a large overhead
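//
// Size overview: 0-3 byte fills use individual byte stores, 4-15 use
// overlapping 4-byte stores, 16-63 use overlapping SSE stores, 64-799 use an
// aligned SSE loop, and 800+ bytes use REP STOSB (in the ERMS variant only).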

.intel_syntax noprefix

.global memset_sse2_erms
.type memset_sse2_erms, @function
.p2align 4

memset_sse2_erms:
// Fill all bytes of esi and xmm0 with the given character.
movzx esi, sil
imul esi, 0x01010101
movd xmm0, esi
pshufd xmm0, xmm0, 0
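// For example, a fill character of 0xAB yields 0xAB * 0x01010101 = 0xABABABAB
// in esi, which PSHUFD then broadcasts into all four dwords of xmm0.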

// Store the original address for the return value.
mov rax, rdi

cmp rdx, 64
jb .Lunder_64

// Limit taken from the article. Could be lower (256 or 512) if we want to
// tune it for the latest CPUs.
cmp rdx, 800
jb .Lbig

.Lerms:
// We're going to align the pointer to 64 bytes, and then use REP STOSB.

// Fill the first 64 bytes of the memory using SSE stores.
movups [rdi], xmm0
movups [rdi + 16], xmm0
movups [rdi + 32], xmm0
movups [rdi + 48], xmm0

// Store the address one past the last byte in r8.
lea r8, [rdi + rdx]

// Align the start pointer to 64 bytes.
add rdi, 63
and rdi, ~63
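// Rounding up is safe here: any bytes skipped by the alignment were already
// filled by the four unaligned stores above.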

// Calculate the number of remaining bytes to store.
mov rcx, r8
sub rcx, rdi

// Use REP STOSB to fill the rest. This is implemented in microcode on
// recent Intel and AMD CPUs, and can automatically use the widest stores
// available in the CPU, so it's strictly faster than SSE for sizes of more
// than a couple hundred bytes.
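//
// REP STOSB takes its fill byte from AL, but RAX currently holds the return
// value, so swap RAX with the fill pattern in RSI and restore it afterwards.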
xchg rax, rsi
rep stosb
mov rax, rsi

ret

.global memset_sse2
.type memset_sse2, @function
.p2align 4

memset_sse2:
// Fill all bytes of esi and xmm0 with the given character.
movzx esi, sil
imul rsi, 0x01010101
movd xmm0, esi
pshufd xmm0, xmm0, 0

// Store the original address for the return value.
mov rax, rdi

cmp rdx, 64
jb .Lunder_64

.Lbig:
// We're going to align the pointer to 16 bytes, fill 4*16 bytes in a hot
// loop, and then fill the last 48-64 bytes separately to take care of any
// trailing bytes.

// Fill the first 16 bytes, which might be unaligned.
movups [rdi], xmm0

// Calculate the first 16 byte aligned address for the SSE stores.
lea rsi, [rdi + 16]
and rsi, ~15

// Calculate the number of remaining bytes.
sub rdi, rsi
add rdx, rdi

// Calculate the last aligned address for trailing stores such that
// 48-64 bytes are left.
lea rcx, [rsi + rdx - 48]
and rcx, ~15

// Calculate the address 16 bytes from the end.
lea r8, [rsi + rdx - 16]

cmp rdx, 64
jb .Ltrailing

.Lbig_loop:
// Fill 4*16 bytes in a loop.
movaps [rsi], xmm0
movaps [rsi + 16], xmm0
movaps [rsi + 32], xmm0
movaps [rsi + 48], xmm0

add rsi, 64
cmp rsi, rcx
jb .Lbig_loop

.Ltrailing:
// We have 48-64 bytes left. Fill the first 48 and the last 16 bytes.
movaps [rcx], xmm0
movaps [rcx + 16], xmm0
movaps [rcx + 32], xmm0
movups [r8], xmm0

ret

.Lunder_64:
cmp rdx, 16
jb .Lunder_16

// We're going to fill 16-63 bytes using variable sized branchless stores.
// Although this means that we might set the same byte up to 4 times, we
// can avoid branching, which is expensive compared to straight-line code.
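//
// For example, with n = 40: r8 = rdi + 24 and rdx becomes 32, then 16, so the
// four stores hit [rdi], [rdi + 24], [rdi + 16] and [rdi + 8], covering all 40 bytes.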

// Calculate the address of the last SSE store.
lea r8, [rdi + rdx - 16]

// Set rdx to 32 if there are >= 32 bytes, otherwise let its value be 0.
and rdx, 32

// Fill the first 16 bytes.
movups [rdi], xmm0

// Set rdx to 16 if there are >= 32 bytes, otherwise let its value be 0.
shr rdx, 1

// Fill the last 16 bytes.
movups [r8], xmm0

// Fill bytes 16 - 32 if there are at least 32 bytes, otherwise fill the first 16 again.
movups [rdi + rdx], xmm0

// Fill bytes (n-32) - (n-16) if n >= 32, otherwise fill the last 16 again.
neg rdx
movups [r8 + rdx], xmm0

ret

.Lunder_16:
cmp rdx, 4
jb .Lunder_4

// We're going to fill 4-15 bytes using variable sized branchless stores like above.
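// For example, with n = 6: r8 = rdi + 2 and rdx becomes 0, so the stores land
// at [rdi] and [r8] twice each, covering all 6 bytes.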
lea r8, [rdi + rdx - 4]
and rdx, 8
mov [rdi], esi
shr rdx, 1
mov [r8], esi
mov [rdi + rdx], esi
neg rdx
mov [r8 + rdx], esi
ret

.Lunder_4:
cmp rdx, 1
jb .Lend

// Fill the first byte.
mov [rdi], sil

// MOV doesn't modify flags, so this still tests the CMP above: if the size
// was exactly one byte, we're done.
jbe .Lend

// The size is 2 or 3 bytes. Fill the second and the last one.
mov [rdi + 1], sil
mov [rdi + rdx - 1], sil

.Lend:
ret
59 changes: 59 additions & 0 deletions Userland/Libraries/LibC/arch/x86_64/memset.cpp
@@ -0,0 +1,59 @@
/*
* Copyright (c) 2022, Daniel Bertalan <[email protected]>
*
* SPDX-License-Identifier: BSD-2-Clause
*/

#include <AK/Types.h>
#include <cpuid.h>
#include <string.h>

extern "C" {

extern void* memset_sse2(void*, int, size_t);
extern void* memset_sse2_erms(void*, int, size_t);

constexpr u32 tcg_signature_ebx = 0x54474354;
constexpr u32 tcg_signature_ecx = 0x43544743;
constexpr u32 tcg_signature_edx = 0x47435447;
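// Together these spell "TCGTCGTCGTCG", the hypervisor signature that QEMU's
// TCG accelerator reports in CPUID leaf 0x40000000.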

// Bit 9 of ebx in cpuid[eax = 7] indicates support for "Enhanced REP MOVSB/STOSB"
constexpr u32 cpuid_7_ebx_bit_erms = 1 << 9;

namespace {
[[gnu::used]] decltype(&memset) resolve_memset()
{
u32 eax, ebx, ecx, edx;

__cpuid(0x40000000, eax, ebx, ecx, edx);
bool is_tcg = ebx == tcg_signature_ebx && ecx == tcg_signature_ecx && edx == tcg_signature_edx;

// Although TCG reports ERMS support, testing shows that rep stosb performs strictly worse than
// SSE copies on all data sizes except <= 4 bytes.
if (is_tcg)
return memset_sse2;

__cpuid_count(7, 0, eax, ebx, ecx, edx);
if (ebx & cpuid_7_ebx_bit_erms)
return memset_sse2_erms;

return memset_sse2;
}
}

#if !defined(__clang__) && !defined(_DYNAMIC_LOADER)
[[gnu::ifunc("resolve_memset")]] void* memset(void*, int, size_t);
#else
// DynamicLoader can't self-relocate IFUNCs.
// FIXME: There's a circular dependency between LibC and libunwind when built with Clang,
// so the IFUNC resolver could be called before LibC has been relocated, returning bogus addresses.
void* memset(void* dest_ptr, int c, size_t n)
{
static decltype(&memset) s_impl = nullptr;
if (s_impl == nullptr)
s_impl = resolve_memset();

return s_impl(dest_ptr, c, n);
}
#endif
}
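Call sites are unaffected by the dispatch above; a minimal consumer (illustrative, not part of this change) keeps calling memset() as usual and ends up in whichever routine the resolver returned:

#include <string.h>

int main()
{
    char buffer[128];
    // Bound at load time to memset_sse2 or memset_sse2_erms via the IFUNC above.
    memset(buffer, 0xAA, sizeof(buffer));
    return 0;
}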
12 changes: 4 additions & 8 deletions Userland/Libraries/LibC/string.cpp
@@ -137,27 +137,22 @@ void* memcpy(void* dest_ptr, void const* src_ptr, size_t n)
return original_dest;
}

+#if ARCH(I386)
// https://pubs.opengroup.org/onlinepubs/9699919799/functions/memset.html
+//
+// For x86-64, an optimized ASM implementation is found in ./arch/x86_64/memset.S
void* memset(void* dest_ptr, int c, size_t n)
{
size_t dest = (size_t)dest_ptr;
// FIXME: Support starting at an unaligned address.
if (!(dest & 0x3) && n >= 12) {
size_t size_ts = n / sizeof(size_t);
size_t expanded_c = explode_byte((u8)c);
-#if ARCH(I386)
asm volatile(
"rep stosl\n"
: "=D"(dest)
: "D"(dest), "c"(size_ts), "a"(expanded_c)
: "memory");
-#else
-asm volatile(
-"rep stosq\n"
-: "=D"(dest)
-: "D"(dest), "c"(size_ts), "a"(expanded_c)
-: "memory");
-#endif
n -= size_ts * sizeof(size_t);
if (n == 0)
return dest_ptr;
@@ -169,6 +164,7 @@ void* memset(void* dest_ptr, int c, size_t n)
: "memory");
return dest_ptr;
}
+#endif

// https://pubs.opengroup.org/onlinepubs/9699919799/functions/memmove.html
void* memmove(void* dest, void const* src, size_t n)