From 1f159eaab0ad6791f35842eb3e6f3c746f3101aa Mon Sep 17 00:00:00 2001
From: Andreas Kling
Date: Thu, 7 Feb 2019 08:46:52 +0100
Subject: [PATCH] Add a fast memcpy() using MMX when we're moving >= 1KB.

This is a nice speedup for WindowServer. I'll eventually have to do this
with SSE but the kernel doesn't support SSE yet so this is it for now.
---
 AK/StdLibExtras.cpp | 60 +++++++++++++++++++++++++++++++++++++++++++++
 AK/StdLibExtras.h   |  6 +++++
 Kernel/Makefile     |  3 ++-
 Kernel/StdLib.cpp   |  8 +++++-
 LibC/Makefile       |  1 +
 LibC/string.cpp     |  5 +++-
 6 files changed, 80 insertions(+), 3 deletions(-)
 create mode 100644 AK/StdLibExtras.cpp

diff --git a/AK/StdLibExtras.cpp b/AK/StdLibExtras.cpp
new file mode 100644
index 00000000000..5db3444ee84
--- /dev/null
+++ b/AK/StdLibExtras.cpp
@@ -0,0 +1,60 @@
+#include
+#include
+#include
+#include
+
+// Copy len bytes from src to dest using 64-bit MMX moves; returns dest.
+// Only meant for large copies: asserts that len >= 1024.
+void* mmx_memcpy(void* dest, const void* src, size_t len)
+{
+    ASSERT(len >= 1024);
+
+    auto* dest_ptr = (byte*)dest;
+    auto* src_ptr = (const byte*)src;
+
+    // Byte-copy up to the next 8-byte boundary of dest so that the MMX stores
+    // below are 8-byte aligned.
+    if ((dword)dest_ptr & 7) {
+        dword prologue = 8 - ((dword)dest_ptr & 7);
+        // Subtract first: "rep movsb" counts ecx (prologue) down to zero.
+        len -= prologue;
+        // esi/edi/ecx are read-write operands so that src_ptr and dest_ptr
+        // come back advanced past the bytes copied here.
+        asm volatile(
+            "rep movsb\n"
+            : "+S"(src_ptr), "+D"(dest_ptr), "+c"(prologue)
+            :
+            : "memory"
+        );
+    }
+    // Move 64 bytes per iteration through the eight MMX registers.
+    for (dword i = len / 64; i; --i) {
+        asm volatile(
+            "movq (%0), %%mm0\n"
+            "movq 8(%0), %%mm1\n"
+            "movq 16(%0), %%mm2\n"
+            "movq 24(%0), %%mm3\n"
+            "movq 32(%0), %%mm4\n"
+            "movq 40(%0), %%mm5\n"
+            "movq 48(%0), %%mm6\n"
+            "movq 56(%0), %%mm7\n"
+            "movq %%mm0, (%1)\n"
+            "movq %%mm1, 8(%1)\n"
+            "movq %%mm2, 16(%1)\n"
+            "movq %%mm3, 24(%1)\n"
+            "movq %%mm4, 32(%1)\n"
+            "movq %%mm5, 40(%1)\n"
+            "movq %%mm6, 48(%1)\n"
+            "movq %%mm7, 56(%1)\n"
+            :: "r" (src_ptr), "r" (dest_ptr) : "memory");
+        src_ptr += 64;
+        dest_ptr += 64;
+    }
+    // EMMS marks the x87 tag word empty again after the MMX register use above.
+    asm volatile("emms":::"memory");
+    // Whatever remains we'll have to memcpy.
+    len %= 64;
+    if (len)
+        memcpy(dest_ptr, src_ptr, len);
+    return dest;
+}
diff --git a/AK/StdLibExtras.h b/AK/StdLibExtras.h
index 7d68c5d0023..b5e7515d291 100644
--- a/AK/StdLibExtras.h
+++ b/AK/StdLibExtras.h
@@ -9,8 +9,14 @@
 
 #include
 
+void* mmx_memcpy(void* to, const void* from, size_t);
+
 ALWAYS_INLINE void fast_dword_copy(dword* dest, const dword* src, size_t count)
 {
+    if (count >= 256) {
+        mmx_memcpy(dest, src, count * sizeof(dword));
+        return;
+    }
     asm volatile(
         "rep movsl\n"
         : "=S"(src), "=D"(dest), "=c"(count)
diff --git a/Kernel/Makefile b/Kernel/Makefile
index 6f7e858de8d..d92e06325a6 100644
--- a/Kernel/Makefile
+++ b/Kernel/Makefile
@@ -69,7 +69,8 @@ AK_OBJS = \
     ../AK/String.o \
     ../AK/StringImpl.o \
     ../AK/StringBuilder.o \
-    ../AK/FileSystemPath.o
+    ../AK/FileSystemPath.o \
+    ../AK/StdLibExtras.o
 
 OBJS = $(KERNEL_OBJS) $(VFS_OBJS) $(AK_OBJS) $(WINDOWSERVER_OBJS) $(SHAREDGRAPHICS_OBJS)
 
diff --git a/Kernel/StdLib.cpp b/Kernel/StdLib.cpp
index 59673d7ba01..55ed583d5f7 100644
--- a/Kernel/StdLib.cpp
+++ b/Kernel/StdLib.cpp
@@ -1,12 +1,18 @@
 #include "types.h"
 #include "Assertions.h"
 #include "kmalloc.h"
+#include
 #include
 
 extern "C" {
 
-void memcpy(void *dest_ptr, const void *src_ptr, dword n)
+void memcpy(void* dest_ptr, const void* src_ptr, dword n)
 {
+    if (n >= 1024) {
+        mmx_memcpy(dest_ptr, src_ptr, n);
+        return;
+    }
+
     dword dest = (dword)dest_ptr;
     dword src = (dword)src_ptr;
     // FIXME: Support starting at an unaligned address.
diff --git a/LibC/Makefile b/LibC/Makefile index fb07e474c61..e44257aacbf 100644 --- a/LibC/Makefile +++ b/LibC/Makefile @@ -3,6 +3,7 @@ AK_OBJS = \ ../AK/String.o \ ../AK/StringBuilder.o \ ../AK/FileSystemPath.o \ + ../AK/StdLibExtras.o \ ../AK/kmalloc.o SHAREDGRAPHICS_OBJS = \ diff --git a/LibC/string.cpp b/LibC/string.cpp index 47ddf45e3fe..2c0ea877536 100644 --- a/LibC/string.cpp +++ b/LibC/string.cpp @@ -102,8 +102,11 @@ int memcmp(const void* v1, const void* v2, size_t n) return 0; } -void* memcpy(void *dest_ptr, const void *src_ptr, dword n) +void* memcpy(void* dest_ptr, const void* src_ptr, dword n) { + if (n >= 1024) + return mmx_memcpy(dest_ptr, src_ptr, n); + dword dest = (dword)dest_ptr; dword src = (dword)src_ptr; // FIXME: Support starting at an unaligned address.