LibC: Implement a faster memset routine for x86-64 in assembly
This commit addresses the following shortcomings of our current, simple
and elegant memset function:
- REP STOSB/STOSQ has considerable startup overhead, so it's impractical to
  use for smaller sizes.
- Up until very recently, AMD CPUs didn't have support for "Enhanced REP
  MOVSB/STOSB", so it performed pretty poorly on them.

With this commit applied, I could measure a ~5% decrease in `test-js`'s
runtime when I used qemu's TCG backend. The implementation is based on
the following article from Microsoft:

https://msrc-blog.microsoft.com/2021/01/11/building-faster-amd64-memset-routines

Two versions of the routine are implemented: one that uses the ERMS
extension mentioned above, and one that performs plain SSE stores. The
version appropriate for the CPU is selected at load time using an IFUNC.
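For readers unfamiliar with GNU IFUNCs, the following rough sketch (illustrative only; every identifier in it is made up, and the real resolver added by this commit lives in Userland/Libraries/LibC/arch/x86_64/memset.cpp below) shows the general shape of load-time selection:

#include <stddef.h>

extern "C" {

// A plain, portable implementation used as the fallback.
static void* memset_generic(void* dest, int c, size_t n)
{
    auto* p = static_cast<unsigned char*>(dest);
    for (size_t i = 0; i < n; ++i)
        p[i] = static_cast<unsigned char>(c);
    return dest;
}

// Stand-in for an optimized variant; in the real patch this is assembly.
static void* memset_optimized(void* dest, int c, size_t n)
{
    return memset_generic(dest, c, n);
}

// The resolver runs once, while the dynamic loader relocates the library, and
// its return value becomes the target of every later call through the symbol.
[[gnu::used]] static decltype(&memset_generic) resolve_my_memset()
{
    bool cpu_has_fast_path = false; // a real resolver queries CPUID here
    return cpu_has_fast_path ? memset_optimized : memset_generic;
}

// Declaring the symbol with gnu::ifunc ties it to the resolver above; callers
// just call my_memset() and never see the dispatch.
[[gnu::ifunc("resolve_my_memset")]] void* my_memset(void* dest, int c, size_t n);

}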
BertalanD authored and awesomekling committed May 1, 2022
1 parent 484f70f commit bcf124c
Showing 5 changed files with 262 additions and 10 deletions.
2 changes: 1 addition & 1 deletion Userland/DynamicLoader/CMakeLists.txt
@@ -15,7 +15,7 @@ elseif ("${SERENITY_ARCH}" STREQUAL "i686")
file(GLOB LIBC_SOURCES3 "../Libraries/LibC/arch/i386/*.S")
set(ELF_SOURCES ${ELF_SOURCES} ../Libraries/LibELF/Arch/i386/entry.S ../Libraries/LibELF/Arch/i386/plt_trampoline.S)
elseif ("${SERENITY_ARCH}" STREQUAL "x86_64")
-file(GLOB LIBC_SOURCES3 "../Libraries/LibC/arch/x86_64/*.S")
+file(GLOB LIBC_SOURCES3 "../Libraries/LibC/arch/x86_64/*.S" "../Libraries/LibC/arch/x86_64/*.cpp")
set(ELF_SOURCES ${ELF_SOURCES} ../Libraries/LibELF/Arch/x86_64/entry.S ../Libraries/LibELF/Arch/x86_64/plt_trampoline.S)
endif()

3 changes: 2 additions & 1 deletion Userland/Libraries/LibC/CMakeLists.txt
@@ -84,7 +84,8 @@ elseif ("${SERENITY_ARCH}" STREQUAL "i686")
set(CRTI_SOURCE "arch/i386/crti.S")
set(CRTN_SOURCE "arch/i386/crtn.S")
elseif ("${SERENITY_ARCH}" STREQUAL "x86_64")
-set(ASM_SOURCES "arch/x86_64/setjmp.S")
+set(LIBC_SOURCES ${LIBC_SOURCES} "arch/x86_64/memset.cpp")
+set(ASM_SOURCES "arch/x86_64/setjmp.S" "arch/x86_64/memset.S")
set(ELF_SOURCES ${ELF_SOURCES} ../LibELF/Arch/x86_64/entry.S ../LibELF/Arch/x86_64/plt_trampoline.S)
set(CRTI_SOURCE "arch/x86_64/crti.S")
set(CRTN_SOURCE "arch/x86_64/crtn.S")
196 changes: 196 additions & 0 deletions Userland/Libraries/LibC/arch/x86_64/memset.S
@@ -0,0 +1,196 @@
/*
* Copyright (c) 2022, Daniel Bertalan <[email protected]>
*
* SPDX-License-Identifier: BSD-2-Clause
*/

// Optimized x86-64 memset routine based on the following post from the MSRC blog:
// https://msrc-blog.microsoft.com/2021/01/11/building-faster-amd64-memset-routines
//
// This algorithm
// - makes use of REP STOSB on CPUs where it is fast (a notable exception is
// qemu's TCG backend used in CI)
// - uses SSE stores otherwise
// - performs quick branchless stores for sizes < 64 bytes where REP STOSB would have
// a large overhead
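//
// Size overview: 0-3 byte fills use individual byte stores, 4-15 use
// overlapping 4-byte stores, 16-63 use overlapping SSE stores, 64-799 use an
// aligned SSE loop, and 800+ bytes use REP STOSB (in the ERMS variant only).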

.intel_syntax noprefix

.global memset_sse2_erms
.type memset_sse2_erms, @function
.p2align 4

memset_sse2_erms:
// Fill all bytes of esi and xmm0 with the given character.
movzx esi, sil
imul esi, 0x01010101
movd xmm0, esi
pshufd xmm0, xmm0, 0
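// For example, a fill character of 0xAB yields 0xAB * 0x01010101 = 0xABABABAB
// in esi, which PSHUFD then broadcasts into all four dwords of xmm0.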

// Store the original address for the return value.
mov rax, rdi

cmp rdx, 64
jb .Lunder_64

// Limit taken from the article. Could be lower (256 or 512) if we want to
// tune it for the latest CPUs.
cmp rdx, 800
jb .Lbig

.Lerms:
// We're going to align the pointer to 64 bytes, and then use REP STOSB.

// Fill the first 64 bytes of the memory using SSE stores.
movups [rdi], xmm0
movups [rdi + 16], xmm0
movups [rdi + 32], xmm0
movups [rdi + 48], xmm0

// Store the address one past the last byte in r8.
lea r8, [rdi + rdx]

// Align the start pointer to 64 bytes.
add rdi, 63
and rdi, ~63
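// Rounding up is safe here: any bytes skipped by the alignment were already
// filled by the four unaligned stores above.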

// Calculate the number of remaining bytes to store.
mov rcx, r8
sub rcx, rdi

// Use REP STOSB to fill the rest. This is implemented in microcode on
// recent Intel and AMD CPUs, and can automatically use the widest stores
// available in the CPU, so it's strictly faster than SSE for sizes of more
// than a couple hundred bytes.
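//
// REP STOSB takes its fill byte from AL, but RAX currently holds the return
// value, so swap RAX with the fill pattern in RSI and restore it afterwards.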
xchg rax, rsi
rep stosb
mov rax, rsi

ret

.global memset_sse2
.type memset_sse2, @function
.p2align 4

memset_sse2:
// Fill all bytes of esi and xmm0 with the given character.
movzx esi, sil
imul rsi, 0x01010101
movd xmm0, esi
pshufd xmm0, xmm0, 0

// Store the original address for the return value.
mov rax, rdi

cmp rdx, 64
jb .Lunder_64

.Lbig:
// We're going to align the pointer to 16 bytes, fill 4*16 bytes in a hot
// loop, and then fill the last 48-64 bytes separately to take care of any
// trailing bytes.

// Fill the first 16 bytes, which might be unaligned.
movups [rdi], xmm0

// Calculate the first 16 byte aligned address for the SSE stores.
lea rsi, [rdi + 16]
and rsi, ~15

// Calculate the number of remaining bytes.
sub rdi, rsi
add rdx, rdi

// Calculate the last aligned address for trailing stores such that
// 48-64 bytes are left.
lea rcx, [rsi + rdx - 48]
and rcx, ~15

// Calculate the address 16 bytes from the end.
lea r8, [rsi + rdx - 16]

cmp rdx, 64
jb .Ltrailing

.Lbig_loop:
// Fill 4*16 bytes in a loop.
movaps [rsi], xmm0
movaps [rsi + 16], xmm0
movaps [rsi + 32], xmm0
movaps [rsi + 48], xmm0

add rsi, 64
cmp rsi, rcx
jb .Lbig_loop

.Ltrailing:
// We have 48-64 bytes left. Fill the first 48 and the last 16 bytes.
movaps [rcx], xmm0
movaps [rcx + 16], xmm0
movaps [rcx + 32], xmm0
movups [r8], xmm0

ret

.Lunder_64:
cmp rdx, 16
jb .Lunder_16

// We're going to fill 16-63 bytes using variable sized branchless stores.
// Although this means that we might set the same byte up to 4 times, we
// can avoid branching, which is expensive compared to straight-line code.
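//
// For example, with n = 40: r8 = rdi + 24 and rdx becomes 32, then 16, so the
// four stores hit [rdi], [rdi + 24], [rdi + 16] and [rdi + 8], covering all 40 bytes.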

// Calculate the address of the last SSE store.
lea r8, [rdi + rdx - 16]

// Set rdx to 32 if there are >= 32 bytes, otherwise let its value be 0.
and rdx, 32

// Fill the first 16 bytes.
movups [rdi], xmm0

// Set rdx to 16 if there are >= 32 bytes, otherwise let its value be 0.
shr rdx, 1

// Fill the last 16 bytes.
movups [r8], xmm0

// Fill bytes 16 - 32 if there are at least 32 bytes, otherwise fill the first 16 again.
movups [rdi + rdx], xmm0

// Fill bytes (n-32) - (n-16) if n >= 32, otherwise fill the last 16 again.
neg rdx
movups [r8 + rdx], xmm0

ret

.Lunder_16:
cmp rdx, 4
jb .Lunder_4

// We're going to fill 4-15 bytes using variable sized branchless stores like above.
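// For example, with n = 6: r8 = rdi + 2 and rdx becomes 0, so the stores land
// at [rdi] and [r8] twice each, covering all 6 bytes.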
lea r8, [rdi + rdx - 4]
and rdx, 8
mov [rdi], esi
shr rdx, 1
mov [r8], esi
mov [rdi + rdx], esi
neg rdx
mov [r8 + rdx], esi
ret

.Lunder_4:
cmp rdx, 1
jb .Lend

// Fill the first byte.
mov [rdi], sil

// MOV doesn't modify flags, so this still tests the CMP above: if the size
// was exactly one byte, we're done.
jbe .Lend

// The size is 2 or 3 bytes. Fill the second and the last one.
mov [rdi + 1], sil
mov [rdi + rdx - 1], sil

.Lend:
ret
59 changes: 59 additions & 0 deletions Userland/Libraries/LibC/arch/x86_64/memset.cpp
@@ -0,0 +1,59 @@
/*
* Copyright (c) 2022, Daniel Bertalan <[email protected]>
*
* SPDX-License-Identifier: BSD-2-Clause
*/

#include <AK/Types.h>
#include <cpuid.h>
#include <string.h>

extern "C" {

extern void* memset_sse2(void*, int, size_t);
extern void* memset_sse2_erms(void*, int, size_t);

constexpr u32 tcg_signature_ebx = 0x54474354;
constexpr u32 tcg_signature_ecx = 0x43544743;
constexpr u32 tcg_signature_edx = 0x47435447;
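// Together these spell "TCGTCGTCGTCG", the hypervisor signature that QEMU's
// TCG accelerator reports in CPUID leaf 0x40000000.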

// Bit 9 of ebx in cpuid[eax = 7] indicates support for "Enhanced REP MOVSB/STOSB"
constexpr u32 cpuid_7_ebx_bit_erms = 1 << 9;

namespace {
[[gnu::used]] decltype(&memset) resolve_memset()
{
u32 eax, ebx, ecx, edx;

__cpuid(0x40000000, eax, ebx, ecx, edx);
bool is_tcg = ebx == tcg_signature_ebx && ecx == tcg_signature_ecx && edx == tcg_signature_edx;

// Although TCG reports ERMS support, testing shows that rep stosb performs strictly worse than
// SSE copies on all data sizes except <= 4 bytes.
if (is_tcg)
return memset_sse2;

__cpuid_count(7, 0, eax, ebx, ecx, edx);
if (ebx & cpuid_7_ebx_bit_erms)
return memset_sse2_erms;

return memset_sse2;
}
}

#if !defined(__clang__) && !defined(_DYNAMIC_LOADER)
[[gnu::ifunc("resolve_memset")]] void* memset(void*, int, size_t);
#else
// DynamicLoader can't self-relocate IFUNCs.
// FIXME: There's a circular dependency between LibC and libunwind when built with Clang,
// so the IFUNC resolver could be called before LibC has been relocated, returning bogus addresses.
void* memset(void* dest_ptr, int c, size_t n)
{
static decltype(&memset) s_impl = nullptr;
if (s_impl == nullptr)
s_impl = resolve_memset();

return s_impl(dest_ptr, c, n);
}
#endif
}
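Call sites are unaffected by the dispatch above; a minimal consumer (illustrative, not part of this change) keeps calling memset() as usual and ends up in whichever routine the resolver returned:

#include <string.h>

int main()
{
    char buffer[128];
    // Bound at load time to memset_sse2 or memset_sse2_erms via the IFUNC above.
    memset(buffer, 0xAA, sizeof(buffer));
    return 0;
}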
12 changes: 4 additions & 8 deletions Userland/Libraries/LibC/string.cpp
@@ -137,27 +137,22 @@ void* memcpy(void* dest_ptr, void const* src_ptr, size_t n)
return original_dest;
}

+#if ARCH(I386)
// https://pubs.opengroup.org/onlinepubs/9699919799/functions/memset.html
+//
+// For x86-64, an optimized ASM implementation is found in ./arch/x86_64/memset.S
void* memset(void* dest_ptr, int c, size_t n)
{
size_t dest = (size_t)dest_ptr;
// FIXME: Support starting at an unaligned address.
if (!(dest & 0x3) && n >= 12) {
size_t size_ts = n / sizeof(size_t);
size_t expanded_c = explode_byte((u8)c);
-#if ARCH(I386)
asm volatile(
"rep stosl\n"
: "=D"(dest)
: "D"(dest), "c"(size_ts), "a"(expanded_c)
: "memory");
-#else
-asm volatile(
-"rep stosq\n"
-: "=D"(dest)
-: "D"(dest), "c"(size_ts), "a"(expanded_c)
-: "memory");
-#endif
n -= size_ts * sizeof(size_t);
if (n == 0)
return dest_ptr;
@@ -169,6 +164,7 @@ void* memset(void* dest_ptr, int c, size_t n)
: "memory");
return dest_ptr;
}
+#endif

// https://pubs.opengroup.org/onlinepubs/9699919799/functions/memmove.html
void* memmove(void* dest, void const* src, size_t n)