Diffstat (limited to 'Userland')
-rw-r--r--  Userland/DynamicLoader/CMakeLists.txt           |   2
-rw-r--r--  Userland/Libraries/LibC/CMakeLists.txt          |   3
-rw-r--r--  Userland/Libraries/LibC/arch/x86_64/memset.S    | 196
-rw-r--r--  Userland/Libraries/LibC/arch/x86_64/memset.cpp  |  59
-rw-r--r--  Userland/Libraries/LibC/string.cpp              |  12
5 files changed, 262 insertions, 10 deletions
diff --git a/Userland/DynamicLoader/CMakeLists.txt b/Userland/DynamicLoader/CMakeLists.txt
index 50953797b4..a0f98f54b2 100644
--- a/Userland/DynamicLoader/CMakeLists.txt
+++ b/Userland/DynamicLoader/CMakeLists.txt
@@ -15,7 +15,7 @@ elseif ("${SERENITY_ARCH}" STREQUAL "i686")
     file(GLOB LIBC_SOURCES3 "../Libraries/LibC/arch/i386/*.S")
     set(ELF_SOURCES ${ELF_SOURCES} ../Libraries/LibELF/Arch/i386/entry.S ../Libraries/LibELF/Arch/i386/plt_trampoline.S)
 elseif ("${SERENITY_ARCH}" STREQUAL "x86_64")
-    file(GLOB LIBC_SOURCES3 "../Libraries/LibC/arch/x86_64/*.S")
+    file(GLOB LIBC_SOURCES3 "../Libraries/LibC/arch/x86_64/*.S" "../Libraries/LibC/arch/x86_64/*.cpp")
     set(ELF_SOURCES ${ELF_SOURCES} ../Libraries/LibELF/Arch/x86_64/entry.S ../Libraries/LibELF/Arch/x86_64/plt_trampoline.S)
 endif()
diff --git a/Userland/Libraries/LibC/CMakeLists.txt b/Userland/Libraries/LibC/CMakeLists.txt
index f463e977ad..eaad935054 100644
--- a/Userland/Libraries/LibC/CMakeLists.txt
+++ b/Userland/Libraries/LibC/CMakeLists.txt
@@ -84,7 +84,8 @@ elseif ("${SERENITY_ARCH}" STREQUAL "i686")
     set(CRTI_SOURCE "arch/i386/crti.S")
     set(CRTN_SOURCE "arch/i386/crtn.S")
 elseif ("${SERENITY_ARCH}" STREQUAL "x86_64")
-    set(ASM_SOURCES "arch/x86_64/setjmp.S")
+    set(LIBC_SOURCES ${LIBC_SOURCES} "arch/x86_64/memset.cpp")
+    set(ASM_SOURCES "arch/x86_64/setjmp.S" "arch/x86_64/memset.S")
     set(ELF_SOURCES ${ELF_SOURCES} ../LibELF/Arch/x86_64/entry.S ../LibELF/Arch/x86_64/plt_trampoline.S)
     set(CRTI_SOURCE "arch/x86_64/crti.S")
     set(CRTN_SOURCE "arch/x86_64/crtn.S")
diff --git a/Userland/Libraries/LibC/arch/x86_64/memset.S b/Userland/Libraries/LibC/arch/x86_64/memset.S
new file mode 100644
index 0000000000..1c8ef221c0
--- /dev/null
+++ b/Userland/Libraries/LibC/arch/x86_64/memset.S
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2022, Daniel Bertalan <dani@danielbertalan.dev>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+// Optimized x86-64 memset routine based on the following post from the MSRC blog:
+// https://msrc-blog.microsoft.com/2021/01/11/building-faster-amd64-memset-routines
+//
+// This algorithm
+// - makes use of REP STOSB on CPUs where it is fast (a notable exception is
+//   qemu's TCG backend used in CI)
+// - uses SSE stores otherwise
+// - performs quick branchless stores for sizes < 64 bytes where REP STOSB would have
+//   a large overhead
+
+.intel_syntax noprefix
+
+.global memset_sse2_erms
+.type memset_sse2_erms, @function
+.p2align 4
+
+memset_sse2_erms:
+    // Fill all bytes of esi and xmm0 with the given character.
+    movzx esi, sil
+    imul esi, 0x01010101
+    movd xmm0, esi
+    pshufd xmm0, xmm0, 0
+
+    // Store the original address for the return value.
+    mov rax, rdi
+
+    cmp rdx, 64
+    jb .Lunder_64
+
+    // Limit taken from the article. Could be lower (256 or 512) if we want to
+    // tune it for the latest CPUs.
+    cmp rdx, 800
+    jb .Lbig
+
+.Lerms:
+    // We're going to align the pointer to 64 bytes, and then use REP STOSB.
+
+    // Fill the first 64 bytes of the memory using SSE stores.
+    movups [rdi], xmm0
+    movups [rdi + 16], xmm0
+    movups [rdi + 32], xmm0
+    movups [rdi + 48], xmm0
+
+    // Store the address of the last byte in r8.
+    lea r8, [rdi + rdx]
+
+    // Align the start pointer to 64 bytes.
+    add rdi, 63
+    and rdi, ~63
+
+    // Calculate the number of remaining bytes to store.
+    mov rcx, r8
+    sub rcx, rdi
+
+    // Use REP STOSB to fill the rest. This is implemented in microcode on
+    // recent Intel and AMD CPUs, and can automatically use the widest stores
+    // available in the CPU, so it's strictly faster than SSE for sizes of more
+    // than a couple hundred bytes.
+    xchg rax, rsi
+    rep stosb
+    mov rax, rsi
+
+    ret
+
+.global memset_sse2
+.type memset_sse2, @function
+.p2align 4
+
+memset_sse2:
+    // Fill all bytes of esi and xmm0 with the given character.
+    movzx esi, sil
+    imul rsi, 0x01010101
+    movd xmm0, esi
+    pshufd xmm0, xmm0, 0
+
+    // Store the original address for the return value.
+    mov rax, rdi
+
+    cmp rdx, 64
+    jb .Lunder_64
+
+.Lbig:
+    // We're going to align the pointer to 16 bytes, fill 4*16 bytes in a hot
+    // loop, and then fill the last 48-64 bytes separately to take care of any
+    // trailing bytes.
+
+    // Fill the first 16 bytes, which might be unaligned.
+    movups [rdi], xmm0
+
+    // Calculate the first 16 byte aligned address for the SSE stores.
+    lea rsi, [rdi + 16]
+    and rsi, ~15
+
+    // Calculate the number of remaining bytes.
+    sub rdi, rsi
+    add rdx, rdi
+
+    // Calculate the last aligned address for trailing stores such that
+    // 48-64 bytes are left.
+    lea rcx, [rsi + rdx - 48]
+    and rcx, ~15
+
+    // Calculate the address 16 bytes from the end.
+    lea r8, [rsi + rdx - 16]
+
+    cmp rdx, 64
+    jb .Ltrailing
+
+.Lbig_loop:
+    // Fill 4*16 bytes in a loop.
+    movaps [rsi], xmm0
+    movaps [rsi + 16], xmm0
+    movaps [rsi + 32], xmm0
+    movaps [rsi + 48], xmm0
+
+    add rsi, 64
+    cmp rsi, rcx
+    jb .Lbig_loop
+
+.Ltrailing:
+    // We have 48-64 bytes left. Fill the first 48 and the last 16 bytes.
+    movaps [rcx], xmm0
+    movaps [rcx + 16], xmm0
+    movaps [rcx + 32], xmm0
+    movups [r8], xmm0
+
+    ret
+
+.Lunder_64:
+    cmp rdx, 16
+    jb .Lunder_16
+
+    // We're going to fill 16-63 bytes using variable sized branchless stores.
+    // Although this means that we might set the same byte up to 4 times, we
+    // can avoid branching which is expensive compared to straight-line code.
+
+    // Calculate the address of the last SSE store.
+    lea r8, [rdi + rdx - 16]
+
+    // Set rdx to 32 if there are >= 32 bytes, otherwise let its value be 0.
+    and rdx, 32
+
+    // Fill the first 16 bytes.
+    movups [rdi], xmm0
+
+    // Set rdx to 16 if there are >= 32 bytes, otherwise let its value be 0.
+    shr rdx, 1
+
+    // Fill the last 16 bytes.
+    movups [r8], xmm0
+
+    // Fill bytes 16 - 32 if there are >= 32 bytes, otherwise fill the first 16 again.
+    movups [rdi + rdx], xmm0
+
+    // Fill bytes (n-32) - (n-16) if there are n >= 32 bytes, otherwise fill the last 16 again.
+    neg rdx
+    movups [r8 + rdx], xmm0
+
+    ret
+
+.Lunder_16:
+    cmp rdx, 4
+    jb .Lunder_4
+
+    // We're going to fill 4-15 bytes using variable sized branchless stores like above.
+    lea r8, [rdi + rdx - 4]
+    and rdx, 8
+    mov [rdi], esi
+    shr rdx, 1
+    mov [r8], esi
+    mov [rdi + rdx], esi
+    neg rdx
+    mov [r8 + rdx], esi
+    ret
+
+.Lunder_4:
+    cmp rdx, 1
+    jb .Lend
+
+    // Fill the first byte.
+    mov [rdi], sil
+
+    jbe .Lend
+
+    // The size is 2 or 3 bytes. Fill the second and the last one.
+    mov [rdi + 1], sil
+    mov [rdi + rdx - 1], sil
+
+.Lend:
+    ret
diff --git a/Userland/Libraries/LibC/arch/x86_64/memset.cpp b/Userland/Libraries/LibC/arch/x86_64/memset.cpp
new file mode 100644
index 0000000000..a5e58a2d5b
--- /dev/null
+++ b/Userland/Libraries/LibC/arch/x86_64/memset.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2022, Daniel Bertalan <dani@danielbertalan.dev>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <AK/Types.h>
+#include <cpuid.h>
+#include <string.h>
+
+extern "C" {
+
+extern void* memset_sse2(void*, int, size_t);
+extern void* memset_sse2_erms(void*, int, size_t);
+
+constexpr u32 tcg_signature_ebx = 0x54474354;
+constexpr u32 tcg_signature_ecx = 0x43544743;
+constexpr u32 tcg_signature_edx = 0x47435447;
+
+// Bit 9 of ebx in cpuid[eax = 7] indicates support for "Enhanced REP MOVSB/STOSB"
+constexpr u32 cpuid_7_ebx_bit_erms = 1 << 9;
+
+namespace {
+[[gnu::used]] decltype(&memset) resolve_memset()
+{
+    u32 eax, ebx, ecx, edx;
+
+    __cpuid(0x40000000, eax, ebx, ecx, edx);
+    bool is_tcg = ebx == tcg_signature_ebx && ecx == tcg_signature_ecx && edx == tcg_signature_edx;
+
+    // Although TCG reports ERMS support, testing shows that rep stosb performs strictly worse than
+    // SSE copies on all data sizes except <= 4 bytes.
+    if (is_tcg)
+        return memset_sse2;
+
+    __cpuid_count(7, 0, eax, ebx, ecx, edx);
+    if (ebx & cpuid_7_ebx_bit_erms)
+        return memset_sse2_erms;
+
+    return memset_sse2;
+}
+}
+
+#if !defined(__clang__) && !defined(_DYNAMIC_LOADER)
+[[gnu::ifunc("resolve_memset")]] void* memset(void*, int, size_t);
+#else
+// DynamicLoader can't self-relocate IFUNCs.
+// FIXME: There's a circular dependency between LibC and libunwind when built with Clang,
+// so the IFUNC resolver could be called before LibC has been relocated, returning bogus addresses.
+void* memset(void* dest_ptr, int c, size_t n)
+{
+    static decltype(&memset) s_impl = nullptr;
+    if (s_impl == nullptr)
+        s_impl = resolve_memset();
+
+    return s_impl(dest_ptr, c, n);
+}
+#endif
+}
diff --git a/Userland/Libraries/LibC/string.cpp b/Userland/Libraries/LibC/string.cpp
index 90847bf63c..054553d8cf 100644
--- a/Userland/Libraries/LibC/string.cpp
+++ b/Userland/Libraries/LibC/string.cpp
@@ -137,7 +137,10 @@ void* memcpy(void* dest_ptr, void const* src_ptr, size_t n)
     return original_dest;
 }
 
+#if ARCH(I386)
 // https://pubs.opengroup.org/onlinepubs/9699919799/functions/memset.html
+//
+// For x86-64, an optimized ASM implementation is found in ./arch/x86_64/memset.S
 void* memset(void* dest_ptr, int c, size_t n)
 {
     size_t dest = (size_t)dest_ptr;
@@ -145,19 +148,11 @@ void* memset(void* dest_ptr, int c, size_t n)
     if (!(dest & 0x3) && n >= 12) {
         size_t size_ts = n / sizeof(size_t);
         size_t expanded_c = explode_byte((u8)c);
-#if ARCH(I386)
         asm volatile(
             "rep stosl\n"
             : "=D"(dest)
             : "D"(dest), "c"(size_ts), "a"(expanded_c)
            : "memory");
-#else
-        asm volatile(
-            "rep stosq\n"
-            : "=D"(dest)
-            : "D"(dest), "c"(size_ts), "a"(expanded_c)
-            : "memory");
-#endif
         n -= size_ts * sizeof(size_t);
         if (n == 0)
            return dest_ptr;
@@ -169,6 +164,7 @@ void* memset(void* dest_ptr, int c, size_t n)
        : "memory");
    return dest_ptr;
 }
+#endif
 
 // https://pubs.opengroup.org/onlinepubs/9699919799/functions/memmove.html
 void* memmove(void* dest, void const* src, size_t n)
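The variable-sized branchless store trick used in the .Lunder_16 path of memset.S above can be sketched in portable C++. This sketch is an editor-added illustration, not part of the commit; the helper name fill_4_to_15 is hypothetical, and memcpy stands in for the unaligned 4-byte mov stores the assembly performs.

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Fill n bytes (4 <= n <= 15) with exactly four possibly overlapping
    // 4-byte stores, mirroring the branchless .Lunder_16 path.
    void fill_4_to_15(unsigned char* dest, uint32_t pattern, size_t n)
    {
        unsigned char* last = dest + n - 4; // address of the final 4-byte store

        // offset is 4 when n >= 8, otherwise 0 (the "and rdx, 8; shr rdx, 1" trick).
        size_t offset = (n & 8) >> 1;

        std::memcpy(dest, &pattern, 4);          // bytes [0, 4)
        std::memcpy(last, &pattern, 4);          // bytes [n-4, n)
        std::memcpy(dest + offset, &pattern, 4); // bytes [4, 8) when n >= 8, else repeats the first store
        std::memcpy(last - offset, &pattern, 4); // bytes [n-8, n-4) when n >= 8, else repeats the last store
    }

The two leading and two trailing stores together cover every byte in [0, n), so no branch on the exact size is needed; the 16-63 byte path in .Lunder_64 applies the same idea with 16-byte SSE stores.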
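For the resolver in memset.cpp above: the three tcg_signature_* constants are the ebx/ecx/edx values of CPUID leaf 0x40000000, the hypervisor vendor leaf. A minimal editor-added sketch (hypothetical main, not part of the commit) showing that these constants decode byte-by-byte to the 12-character signature string the resolver is matching against:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main()
    {
        // ebx, ecx, edx values that resolve_memset() compares against.
        uint32_t const regs[3] = { 0x54474354, 0x43544743, 0x47435447 };

        // CPUID signature strings are the raw register bytes in ebx, ecx, edx order.
        char signature[13] = {};
        std::memcpy(signature, regs, sizeof(regs));

        std::printf("%s\n", signature); // prints: TCGTCGTCGTCG
        return 0;
    }

When that signature is seen, the resolver ignores the advertised ERMS bit and picks memset_sse2, since rep stosb is emulated instruction-by-instruction under TCG rather than executed as fast microcode.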