author    Daniel Bertalan <dani@danielbertalan.dev>   2022-03-24 16:41:18 +0100
committer Andreas Kling <kling@serenityos.org>        2022-05-01 12:42:01 +0200
commit    bcf124c07d3f4ec753e7e09007dda27a2e4e9c27 (patch)
tree      88aa8d79bbba1e2ff7679b16f0783dc7a706057f /Userland/Libraries/LibC
parent    484f70fb438187e5e51102f989d52957f11313fb (diff)
LibC: Implement a faster memset routine for x86-64 in assembly
This commit addresses the following shortcomings of our current, simple and elegant memset function:

- REP STOSB/STOSQ has considerable startup overhead, so it's impractical to use for smaller sizes.
- Up until very recently, AMD CPUs didn't have support for "Enhanced REP MOVSB/STOSB", so it performed pretty poorly on them.

With this commit applied, I could measure a ~5% decrease in `test-js`'s runtime when I used qemu's TCG backend.

The implementation is based on the following article from Microsoft:
https://msrc-blog.microsoft.com/2021/01/11/building-faster-amd64-memset-routines

Two versions of the routine are implemented: one that uses the ERMS extension mentioned above, and one that performs plain SSE stores. The version appropriate for the CPU is selected at load time using an IFUNC.
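For readers unfamiliar with IFUNCs, the sketch below illustrates the load-time dispatch mechanism in isolation. It is a hypothetical example, not part of this commit: the function names and the feature check are invented, and the real resolver lives in memset.cpp further down.

    #include <cstddef>

    extern "C" {

    void* fill_simple(void* dest, int c, size_t n)
    {
        auto* p = static_cast<unsigned char*>(dest);
        for (size_t i = 0; i < n; ++i)
            p[i] = static_cast<unsigned char>(c);
        return dest;
    }

    void* fill_fast(void* dest, int c, size_t n)
    {
        // Stand-in for an optimized routine.
        return fill_simple(dest, c, n);
    }

    [[gnu::used]] decltype(&fill_simple) resolve_fill()
    {
        // Runs once, when the dynamic loader binds the "fill" symbol, so the
        // CPU-feature check is paid at load time instead of on every call.
        bool cpu_has_fast_path = false; // stand-in for a CPUID query
        return cpu_has_fast_path ? fill_fast : fill_simple;
    }

    // Every call to fill() is routed through whichever pointer resolve_fill() returned.
    [[gnu::ifunc("resolve_fill")]] void* fill(void* dest, int c, size_t n);
    }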
Diffstat (limited to 'Userland/Libraries/LibC')
-rw-r--r-- Userland/Libraries/LibC/CMakeLists.txt         |   3
-rw-r--r-- Userland/Libraries/LibC/arch/x86_64/memset.S   | 196
-rw-r--r-- Userland/Libraries/LibC/arch/x86_64/memset.cpp |  59
-rw-r--r-- Userland/Libraries/LibC/string.cpp             |  12
4 files changed, 261 insertions(+), 9 deletions(-)
diff --git a/Userland/Libraries/LibC/CMakeLists.txt b/Userland/Libraries/LibC/CMakeLists.txt
index f463e977ad..eaad935054 100644
--- a/Userland/Libraries/LibC/CMakeLists.txt
+++ b/Userland/Libraries/LibC/CMakeLists.txt
@@ -84,7 +84,8 @@ elseif ("${SERENITY_ARCH}" STREQUAL "i686")
set(CRTI_SOURCE "arch/i386/crti.S")
set(CRTN_SOURCE "arch/i386/crtn.S")
elseif ("${SERENITY_ARCH}" STREQUAL "x86_64")
- set(ASM_SOURCES "arch/x86_64/setjmp.S")
+ set(LIBC_SOURCES ${LIBC_SOURCES} "arch/x86_64/memset.cpp")
+ set(ASM_SOURCES "arch/x86_64/setjmp.S" "arch/x86_64/memset.S")
set(ELF_SOURCES ${ELF_SOURCES} ../LibELF/Arch/x86_64/entry.S ../LibELF/Arch/x86_64/plt_trampoline.S)
set(CRTI_SOURCE "arch/x86_64/crti.S")
set(CRTN_SOURCE "arch/x86_64/crtn.S")
diff --git a/Userland/Libraries/LibC/arch/x86_64/memset.S b/Userland/Libraries/LibC/arch/x86_64/memset.S
new file mode 100644
index 0000000000..1c8ef221c0
--- /dev/null
+++ b/Userland/Libraries/LibC/arch/x86_64/memset.S
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2022, Daniel Bertalan <dani@danielbertalan.dev>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+// Optimized x86-64 memset routine based on the following post from the MSRC blog:
+// https://msrc-blog.microsoft.com/2021/01/11/building-faster-amd64-memset-routines
+//
+// This algorithm
+// - makes use of REP STOSB on CPUs where it is fast (a notable exception is
+// qemu's TCG backend used in CI)
+// - uses SSE stores otherwise
+// - performs quick branchless stores for sizes < 64 bytes where REP STOSB would have
+// a large overhead
+
+.intel_syntax noprefix
+
+.global memset_sse2_erms
+.type memset_sse2_erms, @function
+.p2align 4
+
+memset_sse2_erms:
+ // Fill all bytes of esi and xmm0 with the given character.
+ movzx esi, sil
+ imul esi, 0x01010101
+ movd xmm0, esi
+ pshufd xmm0, xmm0, 0
+
+ // Store the original address for the return value.
+ mov rax, rdi
+
+ cmp rdx, 64
+ jb .Lunder_64
+
+ // Limit taken from the article. Could be lower (256 or 512) if we want to
+ // tune it for the latest CPUs.
+ cmp rdx, 800
+ jb .Lbig
+
+.Lerms:
+ // We're going to align the pointer to 64 bytes, and then use REP STOSB.
+
+ // Fill the first 64 bytes of the memory using SSE stores.
+ movups [rdi], xmm0
+ movups [rdi + 16], xmm0
+ movups [rdi + 32], xmm0
+ movups [rdi + 48], xmm0
+
+ // Store the address of the last byte in r8.
+ lea r8, [rdi + rdx]
+
+ // Align the start pointer to 64 bytes.
+ add rdi, 63
+ and rdi, ~63
+
+ // Calculate the number of remaining bytes to store.
+ mov rcx, r8
+ sub rcx, rdi
+
+ // Use REP STOSB to fill the rest. This is implemented in microcode on
+ // recent Intel and AMD CPUs, and can automatically use the widest stores
+ // available in the CPU, so it's strictly faster than SSE for sizes of more
+ // than a couple hundred bytes.
+ xchg rax, rsi // REP STOSB stores AL, so swap the fill pattern into rax and stash the return pointer in rsi.
+ rep stosb
+ mov rax, rsi
+
+ ret
+
+.global memset_sse2
+.type memset_sse2, @function
+.p2align 4
+
+memset_sse2:
+ // Fill all bytes of esi and xmm0 with the given character.
+ movzx esi, sil
+ imul rsi, 0x01010101
+ movd xmm0, esi
+ pshufd xmm0, xmm0, 0
+
+ // Store the original address for the return value.
+ mov rax, rdi
+
+ cmp rdx, 64
+ jb .Lunder_64
+
+.Lbig:
+ // We're going to align the pointer to 16 bytes, fill 4*16 bytes in a hot
+ // loop, and then fill the last 48-64 bytes separately to take care of any
+ // trailing bytes.
+
+ // Fill the first 16 bytes, which might be unaligned.
+ movups [rdi], xmm0
+
+ // Calculate the first 16 byte aligned address for the SSE stores.
+ lea rsi, [rdi + 16]
+ and rsi, ~15
+
+ // Calculate the number of remaining bytes.
+ sub rdi, rsi
+ add rdx, rdi
+
+ // Calculate the last aligned address for trailing stores such that
+ // 48-64 bytes are left.
+ lea rcx, [rsi + rdx - 48]
+ and rcx, ~15
+
+ // Calculate the address 16 bytes from the end.
+ lea r8, [rsi + rdx - 16]
+
+ cmp rdx, 64
+ jb .Ltrailing
+
+.Lbig_loop:
+ // Fill 4*16 bytes in a loop.
+ movaps [rsi], xmm0
+ movaps [rsi + 16], xmm0
+ movaps [rsi + 32], xmm0
+ movaps [rsi + 48], xmm0
+
+ add rsi, 64
+ cmp rsi, rcx
+ jb .Lbig_loop
+
+.Ltrailing:
+ // We have 48-64 bytes left. Fill the first 48 and the last 16 bytes.
+ movaps [rcx], xmm0
+ movaps [rcx + 16], xmm0
+ movaps [rcx + 32], xmm0
+ movups [r8], xmm0
+
+ ret
+
+.Lunder_64:
+ cmp rdx, 16
+ jb .Lunder_16
+
+ // We're going to fill 16-63 bytes using variable-sized branchless stores.
+ // Although this means that we might set the same byte up to 4 times, we
+ // can avoid branching which is expensive compared to straight-line code.
+
+ // Calculate the address of the last SSE store.
+ lea r8, [rdi + rdx - 16]
+
+ // Set rdx to 32 if there are >= 32 bytes, otherwise let its value be 0.
+ and rdx, 32
+
+ // Fill the first 16 bytes.
+ movups [rdi], xmm0
+
+ // Set rdx to 16 if there are >= 32 bytes, otherwise let its value be 0.
+ shr rdx, 1
+
+ // Fill the last 16 bytes.
+ movups [r8], xmm0
+
+ // Fill bytes 16-31 if there are at least 32 bytes, otherwise fill the first 16 again.
+ movups [rdi + rdx], xmm0
+
+ // Fill bytes (n-32)-(n-17) if n >= 32, otherwise fill the last 16 again.
+ neg rdx
+ movups [r8 + rdx], xmm0
+
+ ret
+
+.Lunder_16:
+ cmp rdx, 4
+ jb .Lunder_4
+
+ // We're going to fill 4-15 bytes using variable sized branchless stores like above.
+ lea r8, [rdi + rdx - 4]
+ and rdx, 8
+ mov [rdi], esi
+ shr rdx, 1
+ mov [r8], esi
+ mov [rdi + rdx], esi
+ neg rdx
+ mov [r8 + rdx], esi
+ ret
+
+.Lunder_4:
+ cmp rdx, 1
+ jb .Lend
+
+ // Fill the first byte.
+ mov [rdi], sil
+
+ jbe .Lend // Stop if the size is exactly 1; the flags from the CMP above survive the MOV.
+
+ // The size is 2 or 3 bytes. Fill the second and the last one.
+ mov [rdi + 1], sil
+ mov [rdi + rdx - 1], sil
+
+.Lend:
+ ret
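As a plain C++ reference for the .Lunder_64 path above, here is a sketch (not part of the commit) of the overlapping-store trick for 16-63 bytes; `pattern` stands in for xmm0 holding the replicated fill byte, and memcpy stands in for the 16-byte SSE stores.

    #include <cstddef>
    #include <cstring>

    // Sketch only: fill 16-63 bytes with four possibly overlapping 16-byte
    // stores instead of branching on the exact length, mirroring .Lunder_64.
    static void fill_16_to_63(unsigned char* dest, unsigned char const pattern[16], size_t n)
    {
        unsigned char* last = dest + n - 16; // where the final 16-byte store lands
        size_t mid = (n & 32) >> 1;          // 16 if n >= 32, otherwise 0

        memcpy(dest, pattern, 16);           // bytes 0-15
        memcpy(last, pattern, 16);           // bytes (n-16)-(n-1)
        memcpy(dest + mid, pattern, 16);     // bytes 16-31 when n >= 32, else repeats the first store
        memcpy(last - mid, pattern, 16);     // bytes (n-32)-(n-17) when n >= 32, else repeats the last store
    }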
diff --git a/Userland/Libraries/LibC/arch/x86_64/memset.cpp b/Userland/Libraries/LibC/arch/x86_64/memset.cpp
new file mode 100644
index 0000000000..a5e58a2d5b
--- /dev/null
+++ b/Userland/Libraries/LibC/arch/x86_64/memset.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2022, Daniel Bertalan <dani@danielbertalan.dev>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <AK/Types.h>
+#include <cpuid.h>
+#include <string.h>
+
+extern "C" {
+
+extern void* memset_sse2(void*, int, size_t);
+extern void* memset_sse2_erms(void*, int, size_t);
+
+constexpr u32 tcg_signature_ebx = 0x54474354;
+constexpr u32 tcg_signature_ecx = 0x43544743;
+constexpr u32 tcg_signature_edx = 0x47435447;
+
+// Bit 9 of ebx in cpuid[eax = 7] indicates support for "Enhanced REP MOVSB/STOSB"
+constexpr u32 cpuid_7_ebx_bit_erms = 1 << 9;
+
+namespace {
+[[gnu::used]] decltype(&memset) resolve_memset()
+{
+ u32 eax, ebx, ecx, edx;
+
+ __cpuid(0x40000000, eax, ebx, ecx, edx);
+ bool is_tcg = ebx == tcg_signature_ebx && ecx == tcg_signature_ecx && edx == tcg_signature_edx;
+
+ // Although TCG reports ERMS support, testing shows that rep stosb performs strictly worse than
+ // SSE stores on all data sizes except <= 4 bytes.
+ if (is_tcg)
+ return memset_sse2;
+
+ __cpuid_count(7, 0, eax, ebx, ecx, edx);
+ if (ebx & cpuid_7_ebx_bit_erms)
+ return memset_sse2_erms;
+
+ return memset_sse2;
+}
+}
+
+#if !defined(__clang__) && !defined(_DYNAMIC_LOADER)
+[[gnu::ifunc("resolve_memset")]] void* memset(void*, int, size_t);
+#else
+// DynamicLoader can't self-relocate IFUNCs.
+// FIXME: There's a circular dependency between LibC and libunwind when built with Clang,
+// so the IFUNC resolver could be called before LibC has been relocated, returning bogus addresses.
+void* memset(void* dest_ptr, int c, size_t n)
+{
+ static decltype(&memset) s_impl = nullptr;
+ if (s_impl == nullptr)
+ s_impl = resolve_memset();
+
+ return s_impl(dest_ptr, c, n);
+}
+#endif
+}
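Not part of the commit, but one way to sanity-check the new routine is to sweep across the size classes the assembly branches on (4, 16, 64 and the 800-byte ERMS threshold) and compare against a byte-by-byte reference:

    #include <cassert>
    #include <cstddef>
    #include <cstring>

    int main()
    {
        unsigned char actual[1024];
        unsigned char expected[1024];

        // Sizes 0-900 exercise every branch: < 4, < 16, < 64, the SSE loop,
        // and the ERMS path that kicks in at 800 bytes.
        for (size_t n = 0; n <= 900; ++n) {
            for (size_t i = 0; i < sizeof(actual); ++i)
                actual[i] = expected[i] = 0xcd;
            memset(actual, 0x5a, n);
            for (size_t i = 0; i < n; ++i)
                expected[i] = 0x5a;
            assert(memcmp(actual, expected, sizeof(actual)) == 0);
        }
        return 0;
    }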
diff --git a/Userland/Libraries/LibC/string.cpp b/Userland/Libraries/LibC/string.cpp
index 90847bf63c..054553d8cf 100644
--- a/Userland/Libraries/LibC/string.cpp
+++ b/Userland/Libraries/LibC/string.cpp
@@ -137,7 +137,10 @@ void* memcpy(void* dest_ptr, void const* src_ptr, size_t n)
return original_dest;
}
+#if ARCH(I386)
// https://pubs.opengroup.org/onlinepubs/9699919799/functions/memset.html
+//
+// For x86-64, an optimized ASM implementation is found in ./arch/x86_64/memset.S
void* memset(void* dest_ptr, int c, size_t n)
{
size_t dest = (size_t)dest_ptr;
@@ -145,19 +148,11 @@ void* memset(void* dest_ptr, int c, size_t n)
if (!(dest & 0x3) && n >= 12) {
size_t size_ts = n / sizeof(size_t);
size_t expanded_c = explode_byte((u8)c);
-#if ARCH(I386)
asm volatile(
"rep stosl\n"
: "=D"(dest)
: "D"(dest), "c"(size_ts), "a"(expanded_c)
: "memory");
-#else
- asm volatile(
- "rep stosq\n"
- : "=D"(dest)
- : "D"(dest), "c"(size_ts), "a"(expanded_c)
- : "memory");
-#endif
n -= size_ts * sizeof(size_t);
if (n == 0)
return dest_ptr;
@@ -169,6 +164,7 @@ void* memset(void* dest_ptr, int c, size_t n)
: "memory");
return dest_ptr;
}
+#endif
// https://pubs.opengroup.org/onlinepubs/9699919799/functions/memmove.html
void* memmove(void* dest, void const* src, size_t n)
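For reference, the byte replication that both the remaining i686 path (explode_byte) and the new assembly (imul esi, 0x01010101) rely on looks like this in plain C++. This is a sketch based on how the value is used here, not AK's actual helper:

    #include <cstdint>

    // Sketch: replicate a fill byte into every byte of a 32-bit word, the same
    // pattern the assembly builds before broadcasting it into xmm0.
    constexpr uint32_t replicate_byte(uint8_t b)
    {
        return static_cast<uint32_t>(b) * 0x01010101u;
    }

    static_assert(replicate_byte(0xAB) == 0xABABABABu);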