LibC: Implement a faster memset routine for x86-64 in assembly
This commit addresses the following shortcomings of our current, simple and
elegant memset function:
- REP STOSB/STOSQ has considerable startup overhead, which makes it
  impractical to use for smaller sizes.
- Up until very recently, AMD CPUs didn't have support for "Enhanced REP
  MOVSB/STOSB", so it performed pretty poorly on them.

With this commit applied, I could measure a ~5% decrease in `test-js`'s
runtime when I used qemu's TCG backend.

The implementation is based on the following article from Microsoft:
https://msrc-blog.microsoft.com/2021/01/11/building-faster-amd64-memset-routines

Two versions of the routine are implemented: one that uses the ERMS extension
mentioned above, and one that performs plain SSE stores. The version
appropriate for the CPU is selected at load time using an IFUNC.
1 parent 484f70f · commit bcf124c
Showing 5 changed files with 262 additions and 10 deletions.
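For context on the startup-overhead point in the commit message: a memset built directly on REP STOSB is only a few instructions, but every call pays the string instruction's microcoded startup cost regardless of size. A rough sketch of such a routine (illustrative only, not the exact SerenityOS code this commit replaces):

#include <stddef.h>

// Illustrative only: a minimal memset built on REP STOSB. The fill byte goes
// in AL, the destination in RDI and the count in RCX; the string instruction's
// startup cost is paid on every call, even for tiny sizes.
void* memset_rep_stosb(void* dest_ptr, int c, size_t n)
{
    void* original_dest = dest_ptr;
    asm volatile(
        "rep stosb"
        : "+D"(dest_ptr), "+c"(n)
        : "a"(static_cast<unsigned char>(c))
        : "memory");
    return original_dest;
}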
@@ -0,0 +1,196 @@
/*
 * Copyright (c) 2022, Daniel Bertalan <[email protected]>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

// Optimized x86-64 memset routine based on the following post from the MSRC blog:
// https://msrc-blog.microsoft.com/2021/01/11/building-faster-amd64-memset-routines
//
// This algorithm
// - makes use of REP STOSB on CPUs where it is fast (a notable exception is
//   qemu's TCG backend used in CI)
// - uses SSE stores otherwise
// - performs quick branchless stores for sizes < 64 bytes where REP STOSB would have
//   a large overhead

.intel_syntax noprefix

.global memset_sse2_erms
.type memset_sse2_erms, @function
.p2align 4

memset_sse2_erms:
    // Fill all bytes of esi and xmm0 with the given character.
    movzx esi, sil
    imul esi, 0x01010101
    movd xmm0, esi
    pshufd xmm0, xmm0, 0

    // Store the original address for the return value.
    mov rax, rdi

    cmp rdx, 64
    jb .Lunder_64

    // Limit taken from the article. Could be lower (256 or 512) if we want to
    // tune it for the latest CPUs.
    cmp rdx, 800
    jb .Lbig

.Lerms:
    // We're going to align the pointer to 64 bytes, and then use REP STOSB.

    // Fill the first 64 bytes of the memory using SSE stores.
    movups [rdi], xmm0
    movups [rdi + 16], xmm0
    movups [rdi + 32], xmm0
    movups [rdi + 48], xmm0

    // Store the address of the last byte in r8.
    lea r8, [rdi + rdx]

    // Align the start pointer to 64 bytes.
    add rdi, 63
    and rdi, ~63

    // Calculate the number of remaining bytes to store.
    mov rcx, r8
    sub rcx, rdi

    // Use REP STOSB to fill the rest. This is implemented in microcode on
    // recent Intel and AMD CPUs, and can automatically use the widest stores
    // available in the CPU, so it's strictly faster than SSE for sizes of more
    // than a couple hundred bytes.
    xchg rax, rsi
    rep stosb
    mov rax, rsi

    ret

.global memset_sse2
.type memset_sse2, @function
.p2align 4

memset_sse2:
    // Fill all bytes of esi and xmm0 with the given character.
    movzx esi, sil
    imul rsi, 0x01010101
    movd xmm0, esi
    pshufd xmm0, xmm0, 0

    // Store the original address for the return value.
    mov rax, rdi

    cmp rdx, 64
    jb .Lunder_64

.Lbig:
    // We're going to align the pointer to 16 bytes, fill 4*16 bytes in a hot
    // loop, and then fill the last 48-64 bytes separately to take care of any
    // trailing bytes.

    // Fill the first 16 bytes, which might be unaligned.
    movups [rdi], xmm0

    // Calculate the first 16 byte aligned address for the SSE stores.
    lea rsi, [rdi + 16]
    and rsi, ~15

    // Calculate the number of remaining bytes.
    sub rdi, rsi
    add rdx, rdi

    // Calculate the last aligned address for trailing stores such that
    // 48-64 bytes are left.
    lea rcx, [rsi + rdx - 48]
    and rcx, ~15

    // Calculate the address 16 bytes from the end.
    lea r8, [rsi + rdx - 16]

    cmp rdx, 64
    jb .Ltrailing

.Lbig_loop:
    // Fill 4*16 bytes in a loop.
    movaps [rsi], xmm0
    movaps [rsi + 16], xmm0
    movaps [rsi + 32], xmm0
    movaps [rsi + 48], xmm0

    add rsi, 64
    cmp rsi, rcx
    jb .Lbig_loop

.Ltrailing:
    // We have 48-64 bytes left. Fill the first 48 and the last 16 bytes.
    movaps [rcx], xmm0
    movaps [rcx + 16], xmm0
    movaps [rcx + 32], xmm0
    movups [r8], xmm0

    ret

.Lunder_64:
    cmp rdx, 16
    jb .Lunder_16

    // We're going to fill 16-63 bytes using variable sized branchless stores.
    // Although this means that we might set the same byte up to 4 times, we
    // can avoid branching which is expensive compared to straight-line code.

    // Calculate the address of the last SSE store.
    lea r8, [rdi + rdx - 16]

    // Set rdx to 32 if there are >= 32 bytes, otherwise let its value be 0.
    and rdx, 32

    // Fill the first 16 bytes.
    movups [rdi], xmm0

    // Set rdx to 16 if there are >= 32 bytes, otherwise let its value be 0.
    shr rdx, 1

    // Fill the last 16 bytes.
    movups [r8], xmm0

    // Fill bytes 16 - 32 if there are more than 32 bytes, otherwise fill the first 16 again.
    movups [rdi + rdx], xmm0

    // Fill bytes (n-32) - (n-16) if there are n >= 32 bytes, otherwise fill the last 16 again.
    neg rdx
    movups [r8 + rdx], xmm0

    ret

.Lunder_16:
    cmp rdx, 4
    jb .Lunder_4

    // We're going to fill 4-15 bytes using variable sized branchless stores like above.
    lea r8, [rdi + rdx - 4]
    and rdx, 8
    mov [rdi], esi
    shr rdx, 1
    mov [r8], esi
    mov [rdi + rdx], esi
    neg rdx
    mov [r8 + rdx], esi
    ret

.Lunder_4:
    cmp rdx, 1
    jb .Lend

    // Fill the first byte.
    mov [rdi], sil

    jbe .Lend

    // The size is 2 or 3 bytes. Fill the second and the last one.
    mov [rdi + 1], sil
    mov [rdi + rdx - 1], sil

.Lend:
    ret
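The branchless small-size paths (.Lunder_64 and .Lunder_16) are the least obvious part of the routine: a mask-derived offset lets four possibly overlapping stores cover anywhere from 16 to 63 bytes without a single branch. A C++ sketch of the same idea (illustrative only, not part of this commit), with memcpy of a prefilled 16-byte chunk standing in for the unaligned SSE stores:

#include <cstddef>
#include <cstring>

// Illustrative only: the overlapping-store trick used by .Lunder_64 for
// sizes of 16-63 bytes.
static void fill_16_to_63(unsigned char* dest, unsigned char fill, size_t n)
{
    unsigned char chunk[16];
    for (auto& byte : chunk)
        byte = fill; // stands in for the splatted xmm0

    unsigned char* last = dest + n - 16; // start of the final 16-byte store
    size_t offset = n & 32;              // 32 if n >= 32, otherwise 0
    offset >>= 1;                        // 16 if n >= 32, otherwise 0

    std::memcpy(dest, chunk, 16);          // bytes 0-15
    std::memcpy(last, chunk, 16);          // the last 16 bytes
    std::memcpy(dest + offset, chunk, 16); // bytes 16-31, or bytes 0-15 again
    std::memcpy(last - offset, chunk, 16); // bytes n-32 to n-17, or the last 16 again
}

Some bytes may be written up to four times, but as the comment in the assembly notes, the redundant stores are cheaper than branching on these sizes.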
@@ -0,0 +1,59 @@
/*
 * Copyright (c) 2022, Daniel Bertalan <[email protected]>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <AK/Types.h>
#include <cpuid.h>
#include <string.h>

extern "C" {

extern void* memset_sse2(void*, int, size_t);
extern void* memset_sse2_erms(void*, int, size_t);

constexpr u32 tcg_signature_ebx = 0x54474354;
constexpr u32 tcg_signature_ecx = 0x43544743;
constexpr u32 tcg_signature_edx = 0x47435447;

// Bit 9 of ebx in cpuid[eax = 7] indicates support for "Enhanced REP MOVSB/STOSB"
constexpr u32 cpuid_7_ebx_bit_erms = 1 << 9;

namespace {
[[gnu::used]] decltype(&memset) resolve_memset()
{
    u32 eax, ebx, ecx, edx;

    __cpuid(0x40000000, eax, ebx, ecx, edx);
    bool is_tcg = ebx == tcg_signature_ebx && ecx == tcg_signature_ecx && edx == tcg_signature_edx;

    // Although TCG reports ERMS support, testing shows that rep stosb performs strictly worse than
    // SSE copies on all data sizes except <= 4 bytes.
    if (is_tcg)
        return memset_sse2;

    __cpuid_count(7, 0, eax, ebx, ecx, edx);
    if (ebx & cpuid_7_ebx_bit_erms)
        return memset_sse2_erms;

    return memset_sse2;
}
}

#if !defined(__clang__) && !defined(_DYNAMIC_LOADER)
[[gnu::ifunc("resolve_memset")]] void* memset(void*, int, size_t);
#else
// DynamicLoader can't self-relocate IFUNCs.
// FIXME: There's a circular dependency between LibC and libunwind when built with Clang,
// so the IFUNC resolver could be called before LibC has been relocated, returning bogus addresses.
void* memset(void* dest_ptr, int c, size_t n)
{
    static decltype(&memset) s_impl = nullptr;
    if (s_impl == nullptr)
        s_impl = resolve_memset();

    return s_impl(dest_ptr, c, n);
}
#endif
}
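Because both variants are exported under their own symbols, the selection heuristic can be sanity-checked by timing them side by side. A hypothetical micro-benchmark sketch (not part of this commit; it only assumes the two symbols declared above are linkable):

#include <chrono>
#include <cstdio>
#include <vector>

extern "C" void* memset_sse2(void*, int, size_t);
extern "C" void* memset_sse2_erms(void*, int, size_t);

// Hypothetical micro-benchmark: time one variant for a given buffer size.
static double seconds_for(void* (*fill)(void*, int, size_t), unsigned char* buffer, size_t size)
{
    constexpr int iterations = 100000;
    auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < iterations; ++i)
        fill(buffer, i & 0xff, size);
    std::chrono::duration<double> elapsed = std::chrono::steady_clock::now() - start;
    return elapsed.count();
}

int main()
{
    std::vector<unsigned char> buffer(1 << 20);
    for (size_t size : { size_t(16), size_t(64), size_t(800), size_t(4096), buffer.size() })
        std::printf("size %zu: sse2 %.3fs, sse2_erms %.3fs\n", size,
            seconds_for(memset_sse2, buffer.data(), size),
            seconds_for(memset_sse2_erms, buffer.data(), size));
    return 0;
}

On real hardware with ERMS, the erms variant should pull ahead around the 800-byte threshold used in the assembly; under qemu's TCG backend the plain SSE version is expected to win, which is exactly why the resolver special-cases the TCG CPUID signature.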