@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2022, Daniel Bertalan <dani@danielbertalan.dev>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+// Optimized x86-64 memset routine based on the following post from the MSRC blog:
+// https://msrc-blog.microsoft.com/2021/01/11/building-faster-amd64-memset-routines
+//
+// This algorithm
+// - makes use of REP STOSB on CPUs where it is fast (a notable exception is
+//   qemu's TCG backend used in CI)
+// - uses SSE stores otherwise
+// - performs quick branchless stores for sizes < 64 bytes, where REP STOSB would
+//   have a large overhead
+
+.intel_syntax noprefix
+
+.global memset_sse2_erms
+.type memset_sse2_erms, @function
+.p2align 4
+
+memset_sse2_erms:
+    // Fill all bytes of esi and xmm0 with the given character.
+    movzx esi, sil
+    imul esi, 0x01010101
+    movd xmm0, esi
+    pshufd xmm0, xmm0, 0
+
+    // Store the original address for the return value.
+    mov rax, rdi
+
+    cmp rdx, 64
+    jb .Lunder_64
+
+    // Limit taken from the article. Could be lower (256 or 512) if we want to
+    // tune it for the latest CPUs.
+    cmp rdx, 800
+    jb .Lbig
+
+.Lerms:
+    // We're going to align the pointer to 64 bytes, and then use REP STOSB.
+
+    // Fill the first 64 bytes of the memory using SSE stores.
+    movups [rdi], xmm0
+    movups [rdi + 16], xmm0
+    movups [rdi + 32], xmm0
+    movups [rdi + 48], xmm0
+
+    // Store the end address (one past the last byte) in r8.
+    lea r8, [rdi + rdx]
+
+    // Align the start pointer to 64 bytes.
+    add rdi, 63
+    and rdi, ~63
+
+    // Calculate the number of remaining bytes to store.
+    mov rcx, r8
+    sub rcx, rdi
+
+    // Use REP STOSB to fill the rest. This is implemented in microcode on
+    // recent Intel and AMD CPUs, and can automatically use the widest stores
+    // available in the CPU, so it's strictly faster than SSE for sizes of more
+    // than a couple hundred bytes.
+    xchg rax, rsi
+    rep stosb
+    mov rax, rsi
+
+    ret
+
+.global memset_sse2
+.type memset_sse2, @function
+.p2align 4
+
+memset_sse2:
+    // Fill all bytes of esi and xmm0 with the given character.
+    movzx esi, sil
+    imul esi, 0x01010101
+    movd xmm0, esi
+    pshufd xmm0, xmm0, 0
+
+    // Store the original address for the return value.
+    mov rax, rdi
+
+    cmp rdx, 64
+    jb .Lunder_64
+
+.Lbig:
+    // We're going to align the pointer to 16 bytes, fill 4*16 bytes in a hot
+    // loop, and then fill the last 48-64 bytes separately to take care of any
+    // trailing bytes.
+
+    // Fill the first 16 bytes, which might be unaligned.
+    movups [rdi], xmm0
+
+    // Calculate the first 16-byte aligned address for the SSE stores.
+    lea rsi, [rdi + 16]
+    and rsi, ~15
+
+    // Calculate the number of remaining bytes.
+    sub rdi, rsi
+    add rdx, rdi
+
+    // Calculate the last aligned address for trailing stores such that
+    // 48-64 bytes are left.
+    lea rcx, [rsi + rdx - 48]
+    and rcx, ~15
+
+    // Calculate the address 16 bytes from the end.
+    lea r8, [rsi + rdx - 16]
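+
+    // If fewer than 64 bytes are left past the aligned pointer, rcx is equal
+    // to rsi and a single 4*16-byte iteration would overrun the buffer, so go
+    // straight to the trailing stores.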
+
+    cmp rdx, 64
+    jb .Ltrailing
+
+.Lbig_loop:
+    // Fill 4*16 bytes in a loop.
+    movaps [rsi], xmm0
+    movaps [rsi + 16], xmm0
+    movaps [rsi + 32], xmm0
+    movaps [rsi + 48], xmm0
+
+    add rsi, 64
+    cmp rsi, rcx
+    jb .Lbig_loop
+
+.Ltrailing:
+    // We have 48-64 bytes left. Fill the first 48 and the last 16 bytes; the
+    // final unaligned store may overlap the aligned ones, which is harmless.
+    movaps [rcx], xmm0
+    movaps [rcx + 16], xmm0
+    movaps [rcx + 32], xmm0
+    movups [r8], xmm0
+
+    ret
+
+.Lunder_64:
+    cmp rdx, 16
+    jb .Lunder_16
+
+    // We're going to fill 16-63 bytes using variable sized branchless stores.
+    // Although this means that we might set the same byte up to 4 times, we
+    // can avoid branching which is expensive compared to straight-line code.
+
+    // Calculate the address of the last SSE store.
+    lea r8, [rdi + rdx - 16]
+
+    // Set rdx to 32 if there are >= 32 bytes, otherwise let its value be 0.
+    and rdx, 32
+
+    // Fill the first 16 bytes.
+    movups [rdi], xmm0
+
+    // Set rdx to 16 if there are >= 32 bytes, otherwise let its value be 0.
+    shr rdx, 1
+
+    // Fill the last 16 bytes.
+    movups [r8], xmm0
+
+    // Fill bytes 16 - 31 if there are >= 32 bytes, otherwise fill the first 16 again.
+    movups [rdi + rdx], xmm0
+
+    // Fill bytes (n-32) - (n-17) if there are n >= 32 bytes, otherwise fill the last 16 again.
+    neg rdx
+    movups [r8 + rdx], xmm0
+
+    ret
+
+.Lunder_16:
+    cmp rdx, 4
+    jb .Lunder_4
+
+    // We're going to fill 4-15 bytes using variable sized branchless stores like
+    // above, but with 4-byte stores of the pattern in esi.
+    lea r8, [rdi + rdx - 4]
+    and rdx, 8
+    mov [rdi], esi
+    shr rdx, 1
+    mov [r8], esi
+    mov [rdi + rdx], esi
+    neg rdx
+    mov [r8 + rdx], esi
+    ret
+
+.Lunder_4:
+    cmp rdx, 1
+    jb .Lend
+
+    // Fill the first byte.
+    mov [rdi], sil
+
+    jbe .Lend
+
+    // The size is 2 or 3 bytes. Fill the second and the last one.
+    mov [rdi + 1], sil
+    mov [rdi + rdx - 1], sil
+
+.Lend:
+    ret