@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2022, Daniel Bertalan <dani@danielbertalan.dev>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+// Optimized x86-64 memset routine based on the following post from the MSRC blog:
+// https://msrc-blog.microsoft.com/2021/01/11/building-faster-amd64-memset-routines
+//
+// This algorithm
+// - makes use of REP STOSB on CPUs where it is fast (a notable exception is
+//   qemu's TCG backend used in CI)
+// - uses SSE stores otherwise
+// - performs quick branchless stores for sizes < 64 bytes, where REP STOSB would
+//   have a large overhead
+
+.intel_syntax noprefix
+
+.global memset_sse2_erms
+.type memset_sse2_erms, @function
+.p2align 4
+
+memset_sse2_erms:
+    // Fill all bytes of esi and xmm0 with the given character.
+    movzx esi, sil
+    imul esi, 0x01010101
+    movd xmm0, esi
+    pshufd xmm0, xmm0, 0
+
+    // Store the original address for the return value.
+    mov rax, rdi
+
+    cmp rdx, 64
+    jb .Lunder_64
+
+    // Limit taken from the article. Could be lower (256 or 512) if we want to
+    // tune it for the latest CPUs.
+    cmp rdx, 800
+    jb .Lbig
+
+.Lerms:
+    // We're going to align the pointer to 64 bytes, and then use REP STOSB.
+
+    // Fill the first 64 bytes of the memory using SSE stores.
+    movups [rdi], xmm0
+    movups [rdi + 16], xmm0
+    movups [rdi + 32], xmm0
+    movups [rdi + 48], xmm0
+
+    // Store the end address (one past the last byte) in r8.
+    lea r8, [rdi + rdx]
+
+    // Align the start pointer to 64 bytes.
+    add rdi, 63
+    and rdi, ~63
+
+    // Calculate the number of remaining bytes to store.
+    mov rcx, r8
+    sub rcx, rdi
+
+    // Use REP STOSB to fill the rest. This is implemented in microcode on
+    // recent Intel and AMD CPUs, and can automatically use the widest stores
+    // available in the CPU, so it's strictly faster than SSE for sizes of more
+    // than a couple hundred bytes.
+    xchg rax, rsi
+    rep stosb
+    mov rax, rsi
+
+    ret
+
+.global memset_sse2
+.type memset_sse2, @function
+.p2align 4
+
+memset_sse2:
+    // Fill all bytes of esi and xmm0 with the given character.
+    movzx esi, sil
+    imul esi, 0x01010101
+    movd xmm0, esi
+    pshufd xmm0, xmm0, 0
+
+    // Store the original address for the return value.
+    mov rax, rdi
+
+    cmp rdx, 64
+    jb .Lunder_64
+
+.Lbig:
+    // We're going to align the pointer to 16 bytes, fill 4*16 bytes in a hot
+    // loop, and then fill the last 48-64 bytes separately to take care of any
+    // trailing bytes.
+
+    // Fill the first 16 bytes, which might be unaligned.
+    movups [rdi], xmm0
+
+    // Calculate the first 16-byte aligned address for the SSE stores.
+    lea rsi, [rdi + 16]
+    and rsi, ~15
+
+    // Calculate the number of remaining bytes.
+    sub rdi, rsi
+    add rdx, rdi
+
+    // Calculate the last aligned address for trailing stores such that
+    // 48-64 bytes are left.
+    lea rcx, [rsi + rdx - 48]
+    and rcx, ~15
+
+    // Calculate the address 16 bytes from the end.
+    lea r8, [rsi + rdx - 16]
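+
+    // If fewer than 64 bytes are left past the aligned pointer, rcx is equal
+    // to rsi and a single 4*16-byte iteration would overrun the buffer, so go
+    // straight to the trailing stores.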
+
+    cmp rdx, 64
+    jb .Ltrailing
+
+.Lbig_loop:
+    // Fill 4*16 bytes in a loop.
+    movaps [rsi], xmm0
+    movaps [rsi + 16], xmm0
+    movaps [rsi + 32], xmm0
+    movaps [rsi + 48], xmm0
+
+    add rsi, 64
+    cmp rsi, rcx
+    jb .Lbig_loop
+
+.Ltrailing:
+    // We have 48-64 bytes left. Fill the first 48 and the last 16 bytes; the
+    // final unaligned store may overlap the aligned ones, which is harmless.
+    movaps [rcx], xmm0
+    movaps [rcx + 16], xmm0
+    movaps [rcx + 32], xmm0
+    movups [r8], xmm0
+
+    ret
+
+.Lunder_64:
+    cmp rdx, 16
+    jb .Lunder_16
+
+    // We're going to fill 16-63 bytes using variable sized branchless stores.
+    // Although this means that we might set the same byte up to 4 times, we
+    // can avoid branching which is expensive compared to straight-line code.
+
+    // Calculate the address of the last SSE store.
+    lea r8, [rdi + rdx - 16]
+
+    // Set rdx to 32 if there are >= 32 bytes, otherwise let its value be 0.
+    and rdx, 32
+
+    // Fill the first 16 bytes.
+    movups [rdi], xmm0
+
+    // Set rdx to 16 if there are >= 32 bytes, otherwise let its value be 0.
+    shr rdx, 1
+
+    // Fill the last 16 bytes.
+    movups [r8], xmm0
+
+    // Fill bytes 16 - 31 if there are >= 32 bytes, otherwise fill the first 16 again.
+    movups [rdi + rdx], xmm0
+
+    // Fill bytes (n-32) - (n-17) if there are n >= 32 bytes, otherwise fill the last 16 again.
+    neg rdx
+    movups [r8 + rdx], xmm0
+
+    ret
+
+.Lunder_16:
+    cmp rdx, 4
+    jb .Lunder_4
+
+    // We're going to fill 4-15 bytes using variable sized branchless stores like
+    // above, but with 4-byte stores of the pattern in esi.
+    lea r8, [rdi + rdx - 4]
+    and rdx, 8
+    mov [rdi], esi
+    shr rdx, 1
+    mov [r8], esi
+    mov [rdi + rdx], esi
+    neg rdx
+    mov [r8 + rdx], esi
+    ret
+
+.Lunder_4:
+    cmp rdx, 1
+    jb .Lend
+
+    // Fill the first byte.
+    mov [rdi], sil
+
+    jbe .Lend
+
+    // The size is 2 or 3 bytes. Fill the second and the last one.
+    mov [rdi + 1], sil
+    mov [rdi + rdx - 1], sil
+
+.Lend:
+    ret