author     Daniel Bertalan <dani@danielbertalan.dev>    2022-03-24 16:41:18 +0100
committer  Andreas Kling <kling@serenityos.org>         2022-05-01 12:42:01 +0200
commit     bcf124c07d3f4ec753e7e09007dda27a2e4e9c27 (patch)
tree       88aa8d79bbba1e2ff7679b16f0783dc7a706057f /Userland
parent     484f70fb438187e5e51102f989d52957f11313fb (diff)
download   serenity-bcf124c07d3f4ec753e7e09007dda27a2e4e9c27.zip
LibC: Implement a faster memset routine for x86-64 in assembly
This commit addresses the following shortcomings of our current, simple
and elegant memset function:
- REP STOSB/STOSQ has considerable startup overhead, which makes it
  impractical to use for smaller sizes.
- Until very recently, AMD CPUs had no support for "Enhanced REP
  MOVSB/STOSB", so these instructions performed pretty poorly on them.
With this commit applied, I could measure a ~5% decrease in `test-js`'s
runtime when I used qemu's TCG backend. The implementation is based on
the following article from Microsoft:
https://msrc-blog.microsoft.com/2021/01/11/building-faster-amd64-memset-routines
Two versions of the routine are implemented: one that uses the ERMS
extension mentioned above, and one that performs plain SSE stores. The
version appropriate for the CPU is selected at load time using an IFUNC.
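
The "plain SSE stores" mentioned above boil down to broadcasting the fill
byte into a 16-byte XMM register and writing it out with unaligned 16-byte
stores. A rough sketch of that building block using SSE2 compiler intrinsics
(not part of this patch; broadcast_fill_byte and store_16 are made-up names
for illustration):

#include <emmintrin.h> // SSE2 intrinsics
#include <stdint.h>

// Mirrors the movzx/imul/movd/pshufd sequence in memset.S: replicate the
// low byte of `c` into every byte of an XMM register.
static __m128i broadcast_fill_byte(int c)
{
    uint32_t splat = (uint8_t)c * 0x01010101u; // 1 byte -> 4 identical bytes
    return _mm_set1_epi32((int32_t)splat);     // 4 bytes -> 16 identical bytes
}

// One unaligned 16-byte store, the building block of the SSE path (movups).
static void store_16(unsigned char* dest, __m128i pattern)
{
    _mm_storeu_si128((__m128i*)dest, pattern);
}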
Diffstat (limited to 'Userland')
-rw-r--r--   Userland/DynamicLoader/CMakeLists.txt           |   2
-rw-r--r--   Userland/Libraries/LibC/CMakeLists.txt          |   3
-rw-r--r--   Userland/Libraries/LibC/arch/x86_64/memset.S    | 196
-rw-r--r--   Userland/Libraries/LibC/arch/x86_64/memset.cpp  |  59
-rw-r--r--   Userland/Libraries/LibC/string.cpp              |  12
5 files changed, 262 insertions, 10 deletions
diff --git a/Userland/DynamicLoader/CMakeLists.txt b/Userland/DynamicLoader/CMakeLists.txt
index 50953797b4..a0f98f54b2 100644
--- a/Userland/DynamicLoader/CMakeLists.txt
+++ b/Userland/DynamicLoader/CMakeLists.txt
@@ -15,7 +15,7 @@ elseif ("${SERENITY_ARCH}" STREQUAL "i686")
     file(GLOB LIBC_SOURCES3 "../Libraries/LibC/arch/i386/*.S")
     set(ELF_SOURCES ${ELF_SOURCES} ../Libraries/LibELF/Arch/i386/entry.S ../Libraries/LibELF/Arch/i386/plt_trampoline.S)
 elseif ("${SERENITY_ARCH}" STREQUAL "x86_64")
-    file(GLOB LIBC_SOURCES3 "../Libraries/LibC/arch/x86_64/*.S")
+    file(GLOB LIBC_SOURCES3 "../Libraries/LibC/arch/x86_64/*.S" "../Libraries/LibC/arch/x86_64/*.cpp")
     set(ELF_SOURCES ${ELF_SOURCES} ../Libraries/LibELF/Arch/x86_64/entry.S ../Libraries/LibELF/Arch/x86_64/plt_trampoline.S)
 endif()
diff --git a/Userland/Libraries/LibC/CMakeLists.txt b/Userland/Libraries/LibC/CMakeLists.txt
index f463e977ad..eaad935054 100644
--- a/Userland/Libraries/LibC/CMakeLists.txt
+++ b/Userland/Libraries/LibC/CMakeLists.txt
@@ -84,7 +84,8 @@ elseif ("${SERENITY_ARCH}" STREQUAL "i686")
     set(CRTI_SOURCE "arch/i386/crti.S")
     set(CRTN_SOURCE "arch/i386/crtn.S")
 elseif ("${SERENITY_ARCH}" STREQUAL "x86_64")
-    set(ASM_SOURCES "arch/x86_64/setjmp.S")
+    set(LIBC_SOURCES ${LIBC_SOURCES} "arch/x86_64/memset.cpp")
+    set(ASM_SOURCES "arch/x86_64/setjmp.S" "arch/x86_64/memset.S")
     set(ELF_SOURCES ${ELF_SOURCES} ../LibELF/Arch/x86_64/entry.S ../LibELF/Arch/x86_64/plt_trampoline.S)
     set(CRTI_SOURCE "arch/x86_64/crti.S")
     set(CRTN_SOURCE "arch/x86_64/crtn.S")
diff --git a/Userland/Libraries/LibC/arch/x86_64/memset.S b/Userland/Libraries/LibC/arch/x86_64/memset.S
new file mode 100644
index 0000000000..1c8ef221c0
--- /dev/null
+++ b/Userland/Libraries/LibC/arch/x86_64/memset.S
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2022, Daniel Bertalan <dani@danielbertalan.dev>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+// Optimized x86-64 memset routine based on the following post from the MSRC blog:
+// https://msrc-blog.microsoft.com/2021/01/11/building-faster-amd64-memset-routines
+//
+// This algorithm
+// - makes use of REP MOVSB on CPUs where it is fast (a notable exception is
+//   qemu's TCG backend used in CI)
+// - uses SSE stores otherwise
+// - performs quick branchless stores for sizes < 64 bytes where REP STOSB would have
+//   a large overhead
+
+.intel_syntax noprefix
+
+.global memset_sse2_erms
+.type memset_sse2_erms, @function
+.p2align 4
+
+memset_sse2_erms:
+    // Fill all bytes of esi and xmm0 with the given character.
+    movzx esi, sil
+    imul esi, 0x01010101
+    movd xmm0, esi
+    pshufd xmm0, xmm0, 0
+
+    // Store the original address for the return value.
+    mov rax, rdi
+
+    cmp rdx, 64
+    jb .Lunder_64
+
+    // Limit taken from the article. Could be lower (256 or 512) if we want to
+    // tune it for the latest CPUs.
+    cmp rdx, 800
+    jb .Lbig
+
+.Lerms:
+    // We're going to align the pointer to 64 bytes, and then use REP STOSB.
+
+    // Fill the first 64 bytes of the memory using SSE stores.
+    movups [rdi], xmm0
+    movups [rdi + 16], xmm0
+    movups [rdi + 32], xmm0
+    movups [rdi + 48], xmm0
+
+    // Store the address of the last byte in r8.
+    lea r8, [rdi + rdx]
+
+    // Align the start pointer to 64 bytes.
+    add rdi, 63
+    and rdi, ~63
+
+    // Calculate the number of remaining bytes to store.
+    mov rcx, r8
+    sub rcx, rdi
+
+    // Use REP STOSB to fill the rest. This is implemented in microcode on
+    // recent Intel and AMD CPUs, and can automatically use the widest stores
+    // available in the CPU, so it's strictly faster than SSE for sizes of more
+    // than a couple hundred bytes.
+    xchg rax, rsi
+    rep stosb
+    mov rax, rsi
+
+    ret
+
+.global memset_sse2
+.type memset_sse2, @function
+.p2align 4
+
+memset_sse2:
+    // Fill all bytes of esi and xmm0 with the given character.
+    movzx esi, sil
+    imul rsi, 0x01010101
+    movd xmm0, esi
+    pshufd xmm0, xmm0, 0
+
+    // Store the original address for the return value.
+    mov rax, rdi
+
+    cmp rdx, 64
+    jb .Lunder_64
+
+.Lbig:
+    // We're going to align the pointer to 16 bytes, fill 4*16 bytes in a hot
+    // loop, and then fill the last 48-64 bytes separately to take care of any
+    // trailing bytes.
+
+    // Fill the first 16 bytes, which might be unaligned.
+    movups [rdi], xmm0
+
+    // Calculate the first 16 byte aligned address for the SSE stores.
+    lea rsi, [rdi + 16]
+    and rsi, ~15
+
+    // Calculate the number of remaining bytes.
+    sub rdi, rsi
+    add rdx, rdi
+
+    // Calculate the last aligned address for trailing stores such that
+    // 48-64 bytes are left.
+    lea rcx, [rsi + rdx - 48]
+    and rcx, ~15
+
+    // Calculate the address 16 bytes from the end.
+    lea r8, [rsi + rdx - 16]
+
+    cmp rdx, 64
+    jb .Ltrailing
+
+.Lbig_loop:
+    // Fill 4*16 bytes in a loop.
+    movaps [rsi], xmm0
+    movaps [rsi + 16], xmm0
+    movaps [rsi + 32], xmm0
+    movaps [rsi + 48], xmm0
+
+    add rsi, 64
+    cmp rsi, rcx
+    jb .Lbig_loop
+
+.Ltrailing:
+    // We have 48-64 bytes left. Fill the first 48 and the last 16 bytes.
+    movaps [rcx], xmm0
+    movaps [rcx + 16], xmm0
+    movaps [rcx + 32], xmm0
+    movups [r8], xmm0
+
+    ret
+
+.Lunder_64:
+    cmp rdx, 16
+    jb .Lunder_16
+
+    // We're going to fill 16-63 bytes using variable sized branchless stores.
+    // Although this means that we might set the same byte up to 4 times, we
+    // can avoid branching which is expensive compared to straight-line code.
+
+    // Calculate the address of the last SSE store.
+    lea r8, [rdi + rdx - 16]
+
+    // Set rdx to 32 if there are >= 32 bytes, otherwise let its value be 0.
+    and rdx, 32
+
+    // Fill the first 16 bytes.
+    movups [rdi], xmm0
+
+    // Set rdx to 16 if there are >= 32 bytes, otherwise let its value be 0.
+    shr rdx, 1
+
+    // Fill the last 16 bytes.
+    movups [r8], xmm0
+
+    // Fill bytes 16 - 32 if there are more than 32 bytes, otherwise fill the first 16 again.
+    movups [rdi + rdx], xmm0
+
+    // Fill bytes (n-32) - (n-16) if there are n >= 32 bytes, otherwise fill the last 16 again.
+    neg rdx
+    movups [r8 + rdx], xmm0
+
+    ret
+
+.Lunder_16:
+    cmp rdx, 4
+    jb .Lunder_4
+
+    // We're going to fill 4-15 bytes using variable sized branchless stores like above.
+    lea r8, [rdi + rdx - 4]
+    and rdx, 8
+    mov [rdi], esi
+    shr rdx, 1
+    mov [r8], esi
+    mov [rdi + rdx], esi
+    neg rdx
+    mov [r8 + rdx], esi
+    ret
+
+.Lunder_4:
+    cmp rdx, 1
+    jb .Lend
+
+    // Fill the first byte.
+    mov [rdi], sil
+
+    jbe .Lend
+
+    // The size is 2 or 3 bytes. Fill the second and the last one.
+    mov [rdi + 1], sil
+    mov [rdi + rdx - 1], sil
+
+.Lend:
+    ret
diff --git a/Userland/Libraries/LibC/arch/x86_64/memset.cpp b/Userland/Libraries/LibC/arch/x86_64/memset.cpp
new file mode 100644
index 0000000000..a5e58a2d5b
--- /dev/null
+++ b/Userland/Libraries/LibC/arch/x86_64/memset.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2022, Daniel Bertalan <dani@danielbertalan.dev>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <AK/Types.h>
+#include <cpuid.h>
+#include <string.h>
+
+extern "C" {
+
+extern void* memset_sse2(void*, int, size_t);
+extern void* memset_sse2_erms(void*, int, size_t);
+
+constexpr u32 tcg_signature_ebx = 0x54474354;
+constexpr u32 tcg_signature_ecx = 0x43544743;
+constexpr u32 tcg_signature_edx = 0x47435447;
+
+// Bit 9 of ebx in cpuid[eax = 7] indicates support for "Enhanced REP MOVSB/STOSB"
+constexpr u32 cpuid_7_ebx_bit_erms = 1 << 9;
+
+namespace {
+[[gnu::used]] decltype(&memset) resolve_memset()
+{
+    u32 eax, ebx, ecx, edx;
+
+    __cpuid(0x40000000, eax, ebx, ecx, edx);
+    bool is_tcg = ebx == tcg_signature_ebx && ecx == tcg_signature_ecx && edx == tcg_signature_edx;
+
+    // Although TCG reports ERMS support, testing shows that rep stosb performs strictly worse than
+    // SSE copies on all data sizes except <= 4 bytes.
+    if (is_tcg)
+        return memset_sse2;
+
+    __cpuid_count(7, 0, eax, ebx, ecx, edx);
+    if (ebx & cpuid_7_ebx_bit_erms)
+        return memset_sse2_erms;
+
+    return memset_sse2;
+}
+}
+
+#if !defined(__clang__) && !defined(_DYNAMIC_LOADER)
+[[gnu::ifunc("resolve_memset")]] void* memset(void*, int, size_t);
+#else
+// DynamicLoader can't self-relocate IFUNCs.
+// FIXME: There's a circular dependency between LibC and libunwind when built with Clang,
+//        so the IFUNC resolver could be called before LibC has been relocated, returning bogus addresses.
+void* memset(void* dest_ptr, int c, size_t n)
+{
+    static decltype(&memset) s_impl = nullptr;
+    if (s_impl == nullptr)
+        s_impl = resolve_memset();
+
+    return s_impl(dest_ptr, c, n);
+}
+#endif
+}
diff --git a/Userland/Libraries/LibC/string.cpp b/Userland/Libraries/LibC/string.cpp
index 90847bf63c..054553d8cf 100644
--- a/Userland/Libraries/LibC/string.cpp
+++ b/Userland/Libraries/LibC/string.cpp
@@ -137,7 +137,10 @@ void* memcpy(void* dest_ptr, void const* src_ptr, size_t n)
     return original_dest;
 }
 
+#if ARCH(I386)
 // https://pubs.opengroup.org/onlinepubs/9699919799/functions/memset.html
+//
+// For x86-64, an optimized ASM implementation is found in ./arch/x86_64/memset.S
 void* memset(void* dest_ptr, int c, size_t n)
 {
     size_t dest = (size_t)dest_ptr;
@@ -145,19 +148,11 @@ void* memset(void* dest_ptr, int c, size_t n)
     if (!(dest & 0x3) && n >= 12) {
         size_t size_ts = n / sizeof(size_t);
         size_t expanded_c = explode_byte((u8)c);
-#if ARCH(I386)
         asm volatile(
             "rep stosl\n"
             : "=D"(dest)
             : "D"(dest), "c"(size_ts), "a"(expanded_c)
             : "memory");
-#else
-        asm volatile(
-            "rep stosq\n"
-            : "=D"(dest)
-            : "D"(dest), "c"(size_ts), "a"(expanded_c)
-            : "memory");
-#endif
         n -= size_ts * sizeof(size_t);
         if (n == 0)
             return dest_ptr;
@@ -169,6 +164,7 @@ void* memset(void* dest_ptr, int c, size_t n)
         : "memory");
     return dest_ptr;
 }
+#endif
 
 // https://pubs.opengroup.org/onlinepubs/9699919799/functions/memmove.html
 void* memmove(void* dest, void const* src, size_t n)
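
The trick in the .Lunder_64 block above is easier to follow in C. The sketch
below is not part of the patch (fill_16_to_63 is a made-up name); it shows how
four possibly-overlapping 16-byte stores cover any length from 16 to 63 bytes
without branching on the exact size:

#include <emmintrin.h> // SSE2 intrinsics
#include <stddef.h>

static void fill_16_to_63(unsigned char* dest, __m128i pattern, size_t n)
{
    // The first and the last 16 bytes are always written (they may overlap).
    _mm_storeu_si128((__m128i*)dest, pattern);
    _mm_storeu_si128((__m128i*)(dest + n - 16), pattern);

    // offset is 16 when n >= 32 and 0 otherwise, just like the `and rdx, 32`
    // plus `shr rdx, 1` pair in the assembly. With offset == 16 the two stores
    // below cover bytes 16..31 and n-32..n-17; with offset == 0 they merely
    // repeat the two stores above, which is harmless.
    size_t offset = (n & 32) >> 1;
    _mm_storeu_si128((__m128i*)(dest + offset), pattern);
    _mm_storeu_si128((__m128i*)(dest + n - 16 - offset), pattern);
}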