Diffstat (limited to 'Userland')
-rw-r--r--  Userland/DynamicLoader/CMakeLists.txt            2
-rw-r--r--  Userland/Libraries/LibC/CMakeLists.txt           3
-rw-r--r--  Userland/Libraries/LibC/arch/x86_64/memset.S   196
-rw-r--r--  Userland/Libraries/LibC/arch/x86_64/memset.cpp  59
-rw-r--r--  Userland/Libraries/LibC/string.cpp              12
5 files changed, 262 insertions(+), 10 deletions(-)
diff --git a/Userland/DynamicLoader/CMakeLists.txt b/Userland/DynamicLoader/CMakeLists.txt
index 50953797b4..a0f98f54b2 100644
--- a/Userland/DynamicLoader/CMakeLists.txt
+++ b/Userland/DynamicLoader/CMakeLists.txt
@@ -15,7 +15,7 @@ elseif ("${SERENITY_ARCH}" STREQUAL "i686")
file(GLOB LIBC_SOURCES3 "../Libraries/LibC/arch/i386/*.S")
set(ELF_SOURCES ${ELF_SOURCES} ../Libraries/LibELF/Arch/i386/entry.S ../Libraries/LibELF/Arch/i386/plt_trampoline.S)
elseif ("${SERENITY_ARCH}" STREQUAL "x86_64")
- file(GLOB LIBC_SOURCES3 "../Libraries/LibC/arch/x86_64/*.S")
+ file(GLOB LIBC_SOURCES3 "../Libraries/LibC/arch/x86_64/*.S" "../Libraries/LibC/arch/x86_64/*.cpp")
set(ELF_SOURCES ${ELF_SOURCES} ../Libraries/LibELF/Arch/x86_64/entry.S ../Libraries/LibELF/Arch/x86_64/plt_trampoline.S)
endif()
diff --git a/Userland/Libraries/LibC/CMakeLists.txt b/Userland/Libraries/LibC/CMakeLists.txt
index f463e977ad..eaad935054 100644
--- a/Userland/Libraries/LibC/CMakeLists.txt
+++ b/Userland/Libraries/LibC/CMakeLists.txt
@@ -84,7 +84,8 @@ elseif ("${SERENITY_ARCH}" STREQUAL "i686")
set(CRTI_SOURCE "arch/i386/crti.S")
set(CRTN_SOURCE "arch/i386/crtn.S")
elseif ("${SERENITY_ARCH}" STREQUAL "x86_64")
- set(ASM_SOURCES "arch/x86_64/setjmp.S")
+ set(LIBC_SOURCES ${LIBC_SOURCES} "arch/x86_64/memset.cpp")
+ set(ASM_SOURCES "arch/x86_64/setjmp.S" "arch/x86_64/memset.S")
set(ELF_SOURCES ${ELF_SOURCES} ../LibELF/Arch/x86_64/entry.S ../LibELF/Arch/x86_64/plt_trampoline.S)
set(CRTI_SOURCE "arch/x86_64/crti.S")
set(CRTN_SOURCE "arch/x86_64/crtn.S")
diff --git a/Userland/Libraries/LibC/arch/x86_64/memset.S b/Userland/Libraries/LibC/arch/x86_64/memset.S
new file mode 100644
index 0000000000..1c8ef221c0
--- /dev/null
+++ b/Userland/Libraries/LibC/arch/x86_64/memset.S
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2022, Daniel Bertalan <dani@danielbertalan.dev>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+// Optimized x86-64 memset routine based on the following post from the MSRC blog:
+// https://msrc-blog.microsoft.com/2021/01/11/building-faster-amd64-memset-routines
+//
+// This algorithm
+// - makes use of REP STOSB on CPUs where it is fast (a notable exception is
+// qemu's TCG backend used in CI)
+// - uses SSE stores otherwise
+// - performs quick branchless stores for sizes < 64 bytes where REP STOSB would have
+// a large overhead
+
+.intel_syntax noprefix
+
+.global memset_sse2_erms
+.type memset_sse2_erms, @function
+.p2align 4
+
+memset_sse2_erms:
+ // Fill all bytes of esi and xmm0 with the given character.
+ movzx esi, sil
+ imul esi, 0x01010101
+ movd xmm0, esi
+ pshufd xmm0, xmm0, 0
+
+ // Store the original address for the return value.
+ mov rax, rdi
+
+ cmp rdx, 64
+ jb .Lunder_64
+
+ // Limit taken from the article. Could be lower (256 or 512) if we want to
+ // tune it for the latest CPUs.
+ cmp rdx, 800
+ jb .Lbig
+
+.Lerms:
+ // We're going to align the pointer to 64 bytes, and then use REP STOSB.
+
+ // Fill the first 64 bytes of the memory using SSE stores.
+ movups [rdi], xmm0
+ movups [rdi + 16], xmm0
+ movups [rdi + 32], xmm0
+ movups [rdi + 48], xmm0
+
+ // Store the address of the last byte in r8.
+ lea r8, [rdi + rdx]
+
+ // Align the start pointer to 64 bytes.
+ add rdi, 63
+ and rdi, ~63
+
+ // Calculate the number of remaining bytes to store.
+ mov rcx, r8
+ sub rcx, rdi
+
+ // Use REP STOSB to fill the rest. This is implemented in microcode on
+ // recent Intel and AMD CPUs, and can automatically use the widest stores
+ // available in the CPU, so it's strictly faster than SSE for sizes of more
+ // than a couple hundred bytes.
+ xchg rax, rsi
+ rep stosb
+ mov rax, rsi
+
+ ret
+
+.global memset_sse2
+.type memset_sse2, @function
+.p2align 4
+
+memset_sse2:
+ // Fill all bytes of esi and xmm0 with the given character.
+ movzx esi, sil
+ imul esi, 0x01010101
+ movd xmm0, esi
+ pshufd xmm0, xmm0, 0
+
+ // Store the original address for the return value.
+ mov rax, rdi
+
+ cmp rdx, 64
+ jb .Lunder_64
+
+.Lbig:
+ // We're going to align the pointer to 16 bytes, fill 4*16 bytes in a hot
+ // loop, and then fill the last 48-64 bytes separately to take care of any
+ // trailing bytes.
+
+ // Fill the first 16 bytes, which might be unaligned.
+ movups [rdi], xmm0
+
+ // Calculate the first 16 byte aligned address for the SSE stores.
+ lea rsi, [rdi + 16]
+ and rsi, ~15
+
+ // Calculate the number of remaining bytes.
+ sub rdi, rsi
+ add rdx, rdi
+
+ // Calculate the last aligned address for trailing stores such that
+ // 48-64 bytes are left.
+ lea rcx, [rsi + rdx - 48]
+ and rcx, ~15
+
+ // Calculate the address 16 bytes from the end.
+ lea r8, [rsi + rdx - 16]
+
+ cmp rdx, 64
+ jb .Ltrailing
+
+.Lbig_loop:
+ // Fill 4*16 bytes in a loop.
+ movaps [rsi], xmm0
+ movaps [rsi + 16], xmm0
+ movaps [rsi + 32], xmm0
+ movaps [rsi + 48], xmm0
+
+ add rsi, 64
+ cmp rsi, rcx
+ jb .Lbig_loop
+
+.Ltrailing:
+ // We have 48-64 bytes left. Fill the first 48 and the last 16 bytes.
+ movaps [rcx], xmm0
+ movaps [rcx + 16], xmm0
+ movaps [rcx + 32], xmm0
+ movups [r8], xmm0
+
+ ret
+
+.Lunder_64:
+ cmp rdx, 16
+ jb .Lunder_16
+
+ // We're going to fill 16-63 bytes using variable sized branchless stores.
+ // Although this means that we might set the same byte up to 4 times, we
+ // can avoid branching which is expensive compared to straight-line code.
+
+ // Calculate the address of the last SSE store.
+ lea r8, [rdi + rdx - 16]
+
+ // Set rdx to 32 if there are >= 32 bytes, otherwise let its value be 0.
+ and rdx, 32
+
+ // Fill the first 16 bytes.
+ movups [rdi], xmm0
+
+ // Set rdx to 16 if there are >= 32 bytes, otherwise let its value be 0.
+ shr rdx, 1
+
+ // Fill the last 16 bytes.
+ movups [r8], xmm0
+
+ // Fill bytes 16 - 32 if there are at least 32 bytes, otherwise fill the first 16 again.
+ movups [rdi + rdx], xmm0
+
+ // Fill bytes (n-32) - (n-16) if there are n >= 32 bytes, otherwise fill the last 16 again.
+ neg rdx
+ movups [r8 + rdx], xmm0
+
+ ret
+
+.Lunder_16:
+ cmp rdx, 4
+ jb .Lunder_4
+
+ // We're going to fill 4-15 bytes using variable sized branchless stores like above.
+ lea r8, [rdi + rdx - 4]
+ and rdx, 8
+ mov [rdi], esi
+ shr rdx, 1
+ mov [r8], esi
+ mov [rdi + rdx], esi
+ neg rdx
+ mov [r8 + rdx], esi
+ ret
+
+.Lunder_4:
+ cmp rdx, 1
+ jb .Lend
+
+ // Fill the first byte.
+ mov [rdi], sil
+
+ jbe .Lend
+
+ // The size is 2 or 3 bytes. Fill the second and the last one.
+ mov [rdi + 1], sil
+ mov [rdi + rdx - 1], sil
+
+.Lend:
+ ret
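
An aside for readers of the .Lunder_64 path above: the branchless overlapping-store trick can be expressed in portable C++. The following is an illustrative sketch only, not part of the patch; memset_16_to_63 is a hypothetical name, and the 16-byte memcpy calls stand in for the unaligned movups stores.

#include <cstddef>
#include <cstring>

// Four possibly-overlapping 16-byte stores cover any 16-63 byte range
// without a single conditional jump.
static void* memset_16_to_63(void* dest_ptr, int c, size_t n)
{
    unsigned char block[16];
    for (auto& byte : block)
        byte = static_cast<unsigned char>(c); // splat the fill byte, like pshufd in the asm

    auto* dest = static_cast<unsigned char*>(dest_ptr);
    unsigned char* last = dest + n - 16; // address of the final 16-byte store (r8 in the asm)
    size_t mid = (n & 32) >> 1;          // 16 if n >= 32, otherwise 0 (rdx in the asm)

    memcpy(dest, block, 16);       // first 16 bytes
    memcpy(last, block, 16);       // last 16 bytes
    memcpy(dest + mid, block, 16); // bytes 16-32 when n >= 32, else a repeat of the first store
    memcpy(last - mid, block, 16); // bytes (n-32)-(n-16) when n >= 32, else a repeat of the last store
    return dest_ptr;
}
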
diff --git a/Userland/Libraries/LibC/arch/x86_64/memset.cpp b/Userland/Libraries/LibC/arch/x86_64/memset.cpp
new file mode 100644
index 0000000000..a5e58a2d5b
--- /dev/null
+++ b/Userland/Libraries/LibC/arch/x86_64/memset.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2022, Daniel Bertalan <dani@danielbertalan.dev>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <AK/Types.h>
+#include <cpuid.h>
+#include <string.h>
+
+extern "C" {
+
+extern void* memset_sse2(void*, int, size_t);
+extern void* memset_sse2_erms(void*, int, size_t);
+
+constexpr u32 tcg_signature_ebx = 0x54474354;
+constexpr u32 tcg_signature_ecx = 0x43544743;
+constexpr u32 tcg_signature_edx = 0x47435447;
+
+// Bit 9 of ebx in cpuid[eax = 7] indicates support for "Enhanced REP MOVSB/STOSB"
+constexpr u32 cpuid_7_ebx_bit_erms = 1 << 9;
+
+namespace {
+[[gnu::used]] decltype(&memset) resolve_memset()
+{
+ u32 eax, ebx, ecx, edx;
+
+ __cpuid(0x40000000, eax, ebx, ecx, edx);
+ bool is_tcg = ebx == tcg_signature_ebx && ecx == tcg_signature_ecx && edx == tcg_signature_edx;
+
+ // Although TCG reports ERMS support, testing shows that rep stosb performs strictly worse than
+ // SSE copies on all data sizes except <= 4 bytes.
+ if (is_tcg)
+ return memset_sse2;
+
+ __cpuid_count(7, 0, eax, ebx, ecx, edx);
+ if (ebx & cpuid_7_ebx_bit_erms)
+ return memset_sse2_erms;
+
+ return memset_sse2;
+}
+}
+
+#if !defined(__clang__) && !defined(_DYNAMIC_LOADER)
+[[gnu::ifunc("resolve_memset")]] void* memset(void*, int, size_t);
+#else
+// DynamicLoader can't self-relocate IFUNCs.
+// FIXME: There's a circular dependency between LibC and libunwind when built with Clang,
+// so the IFUNC resolver could be called before LibC has been relocated, returning bogus addresses.
+void* memset(void* dest_ptr, int c, size_t n)
+{
+ static decltype(&memset) s_impl = nullptr;
+ if (s_impl == nullptr)
+ s_impl = resolve_memset();
+
+ return s_impl(dest_ptr, c, n);
+}
+#endif
+}
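
A note on the tcg_signature_* constants in the resolver above: CPUID leaf 0x40000000 returns a 12-character hypervisor vendor string in ebx, ecx and edx, and QEMU's TCG backend reports "TCGTCGTCGTCG". This standalone sketch (not part of the patch) merely decodes the three magic numbers to show where they come from.

#include <cstdint>
#include <cstdio>
#include <cstring>

int main()
{
    // ebx, ecx, edx as returned by CPUID leaf 0x40000000 under QEMU TCG.
    uint32_t const regs[3] = { 0x54474354, 0x43544743, 0x47435447 };
    char signature[13] = {};
    memcpy(signature, regs, sizeof regs); // x86 is little-endian, so the bytes land in string order
    printf("%s\n", signature);            // prints "TCGTCGTCGTCG"
}
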
diff --git a/Userland/Libraries/LibC/string.cpp b/Userland/Libraries/LibC/string.cpp
index 90847bf63c..054553d8cf 100644
--- a/Userland/Libraries/LibC/string.cpp
+++ b/Userland/Libraries/LibC/string.cpp
@@ -137,7 +137,10 @@ void* memcpy(void* dest_ptr, void const* src_ptr, size_t n)
return original_dest;
}
+#if ARCH(I386)
// https://pubs.opengroup.org/onlinepubs/9699919799/functions/memset.html
+//
+// For x86-64, an optimized ASM implementation is found in ./arch/x86_64/memset.S
void* memset(void* dest_ptr, int c, size_t n)
{
size_t dest = (size_t)dest_ptr;
@@ -145,19 +148,11 @@ void* memset(void* dest_ptr, int c, size_t n)
if (!(dest & 0x3) && n >= 12) {
size_t size_ts = n / sizeof(size_t);
size_t expanded_c = explode_byte((u8)c);
-#if ARCH(I386)
asm volatile(
"rep stosl\n"
: "=D"(dest)
: "D"(dest), "c"(size_ts), "a"(expanded_c)
: "memory");
-#else
- asm volatile(
- "rep stosq\n"
- : "=D"(dest)
- : "D"(dest), "c"(size_ts), "a"(expanded_c)
- : "memory");
-#endif
n -= size_ts * sizeof(size_t);
if (n == 0)
return dest_ptr;
@@ -169,6 +164,7 @@ void* memset(void* dest_ptr, int c, size_t n)
: "memory");
return dest_ptr;
}
+#endif
// https://pubs.opengroup.org/onlinepubs/9699919799/functions/memmove.html
void* memmove(void* dest, void const* src, size_t n)
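
For reference, the imul esi, 0x01010101 in memset.S and the explode_byte() call in the generic path above rely on the same identity: multiplying a byte by 0x01010101 replicates it into every byte of a 32-bit word. A minimal sketch, not part of the patch:

#include <cstdint>
#include <cstdio>

int main()
{
    uint8_t c = 0xAB;
    uint32_t splat = uint32_t(c) * 0x01010101u; // 0xAB -> 0xABABABAB
    printf("%#010x\n", splat);                  // prints 0xabababab
}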