Kernel: Implement and use the syscall/sysret instruction pair on x86_64

Owen Smith 4 years ago
parent
commit
e6df1c9988

+ 35 - 0
Kernel/API/Syscall.h

@@ -483,10 +483,17 @@ int sync();
 inline uintptr_t invoke(Function function)
 {
     uintptr_t result;
+#        if ARCH(I386)
     asm volatile("int $0x82"
                  : "=a"(result)
                  : "a"(function)
                  : "memory");
+#        else
+    asm volatile("syscall"
+                 : "=a"(result)
+                 : "a"(function)
+                 : "rcx", "r11", "memory");
+#        endif
     return result;
 }
 
@@ -494,10 +501,17 @@ template<typename T1>
 inline uintptr_t invoke(Function function, T1 arg1)
 {
     uintptr_t result;
+#        if ARCH(I386)
     asm volatile("int $0x82"
                  : "=a"(result)
                  : "a"(function), "d"((uintptr_t)arg1)
                  : "memory");
+#        else
+    asm volatile("syscall"
+                 : "=a"(result)
+                 : "a"(function), "d"((uintptr_t)arg1)
+                 : "rcx", "r11", "memory");
+#        endif
     return result;
 }
 
@@ -505,10 +519,17 @@ template<typename T1, typename T2>
 inline uintptr_t invoke(Function function, T1 arg1, T2 arg2)
 {
     uintptr_t result;
+#        if ARCH(I386)
     asm volatile("int $0x82"
                  : "=a"(result)
                  : "a"(function), "d"((uintptr_t)arg1), "c"((uintptr_t)arg2)
                  : "memory");
+#        else
+    asm volatile("syscall"
+                 : "=a"(result)
+                 : "a"(function), "d"((uintptr_t)arg1), "D"((uintptr_t)arg2)
+                 : "rcx", "r11", "memory");
+#        endif
     return result;
 }
 
@@ -516,10 +537,17 @@ template<typename T1, typename T2, typename T3>
 inline uintptr_t invoke(Function function, T1 arg1, T2 arg2, T3 arg3)
 {
     uintptr_t result;
+#        if ARCH(I386)
     asm volatile("int $0x82"
                  : "=a"(result)
                  : "a"(function), "d"((uintptr_t)arg1), "c"((uintptr_t)arg2), "b"((uintptr_t)arg3)
                  : "memory");
+#        else
+    asm volatile("syscall"
+                 : "=a"(result)
+                 : "a"(function), "d"((uintptr_t)arg1), "D"((uintptr_t)arg2), "b"((uintptr_t)arg3)
+                 : "rcx", "r11", "memory");
+#        endif
     return result;
 }
 
@@ -527,10 +555,17 @@ template<typename T1, typename T2, typename T3, typename T4>
 inline uintptr_t invoke(Function function, T1 arg1, T2 arg2, T3 arg3, T4 arg4)
 {
     uintptr_t result;
+#        if ARCH(I386)
     asm volatile("int $0x82"
                  : "=a"(result)
                  : "a"(function), "d"((uintptr_t)arg1), "c"((uintptr_t)arg2), "b"((uintptr_t)arg3), "S"((uintptr_t)arg4)
                  : "memory");
+#        else
+    asm volatile("syscall"
+                 : "=a"(result)
+                 : "a"(function), "d"((uintptr_t)arg1), "D"((uintptr_t)arg2), "b"((uintptr_t)arg3), "S"((uintptr_t)arg4)
+                 : "memory");
+#        endif
     return result;
 }
 #    endif
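
Callers are unchanged by this commit: the same invoke() wrappers compile to int $0x82 on i686 and to syscall on x86_64. A minimal usage sketch (Syscall::SC_getpid is assumed here purely for illustration):

    #include <Kernel/API/Syscall.h>

    uintptr_t my_getpid()
    {
        // On x86_64 this expands to the "syscall" variant above; the wrapper
        // lists rcx and r11 as clobbers because the instruction overwrites them.
        return Syscall::invoke(Syscall::SC_getpid);
    }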

+ 20 - 0
Kernel/Arch/x86/Processor.h

@@ -30,6 +30,10 @@ struct ProcessorMessage;
 struct ProcessorMessageEntry;
 
 #if ARCH(X86_64)
+#    define MSR_EFER 0xc0000080
+#    define MSR_STAR 0xc0000081
+#    define MSR_LSTAR 0xc0000082
+#    define MSR_SFMASK 0xc0000084
 #    define MSR_FS_BASE 0xc0000100
 #    define MSR_GS_BASE 0xc0000101
 #endif
@@ -58,6 +62,11 @@ class Processor {
 
     Processor* m_self;
 
+#if ARCH(X86_64)
+    // Saved user stack for the syscall instruction.
+    void* m_user_stack;
+#endif
+
     DescriptorTablePointer m_gdtr;
     Descriptor m_gdt[256];
     u32 m_gdt_length;
@@ -205,6 +214,17 @@ public:
 
     static bool is_smp_enabled();
 
+#if ARCH(X86_64)
+    static constexpr u64 user_stack_offset()
+    {
+        return __builtin_offsetof(Processor, m_user_stack);
+    }
+    static constexpr u64 kernel_stack_offset()
+    {
+        return __builtin_offsetof(Processor, m_tss) + __builtin_offsetof(TSS, rsp0l);
+    }
+#endif
+
     ALWAYS_INLINE static Processor& current()
     {
         return *(Processor*)read_gs_ptr(__builtin_offsetof(Processor, m_self));
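
The offset accessors are constexpr so the syscall entry stub (SyscallEntry.cpp below) can embed them as immediate displacements for gs-relative addressing. A minimal sketch of the pattern, using a hypothetical helper name:

    // Sketch only: %gs points at the per-CPU Processor, so a compile-time
    // displacement ("i" constraint, printed bare via %c) selects m_user_stack.
    void* saved_user_stack()
    {
        void* rsp;
        asm volatile("movq %%gs:%c[off], %0"
                     : "=r"(rsp)
                     : [off] "i"(Kernel::Processor::user_stack_offset()));
        return rsp;
    }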

+ 2 - 1
Kernel/Arch/x86/RegisterState.h

@@ -110,9 +110,10 @@ struct [[gnu::packed]] RegisterState {
         arg3 = ebx;
         arg4 = esi;
 #else
+        // The syscall instruction clobbers rcx, so we must use a different calling convention to 32-bit.
         function = rax;
         arg1 = rdx;
-        arg2 = rcx;
+        arg2 = rdi;
         arg3 = rbx;
         arg4 = rsi;
 #endif
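
This mirrors the userspace change in Syscall.h above; both sides must agree on where arg2 travels. The resulting x86_64 convention, summarized from the two diffs:

    // function -> rax, arg1 -> rdx, arg2 -> rdi, arg3 -> rbx, arg4 -> rsi
    // rcx and r11 cannot carry arguments: the syscall instruction itself
    // overwrites them with the return rip and rflags, hence rdi replaces rcx.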

+ 23 - 0
Kernel/Arch/x86/common/Processor.cpp

@@ -45,6 +45,7 @@ Atomic<u32> Processor::s_idle_cpu_mask { 0 };
 extern "C" void context_first_init(Thread* from_thread, Thread* to_thread, TrapFrame* trap) __attribute__((used));
 extern "C" void enter_thread_context(Thread* from_thread, Thread* to_thread) __attribute__((used));
 extern "C" FlatPtr do_init_context(Thread* thread, u32 flags) __attribute__((used));
+extern "C" void syscall_entry();
 
 bool Processor::is_smp_enabled()
 {
@@ -220,6 +221,28 @@ UNMAP_AFTER_INIT void Processor::cpu_setup()
             write_xcr0(read_xcr0() | 0x7);
         }
     }
+
+#if ARCH(X86_64)
+    // x86_64 processors must have the syscall feature.
+    VERIFY(has_feature(CPUFeature::SYSCALL));
+    MSR efer_msr(MSR_EFER);
+    efer_msr.set(efer_msr.get() | 1u);
+
+    // Write code and stack selectors to the STAR MSR. The first value stored in bits 63:48 controls the sysret CS (value + 0x10) and SS (value + 0x8),
+    // and the value stored in bits 47:32 controls the syscall CS (value) and SS (value + 0x8).
+    u64 star = 0;
+    star |= 0x13ul << 48u;
+    star |= 0x08ul << 32u;
+    MSR star_msr(MSR_STAR);
+    star_msr.set(star);
+
+    // Write the syscall entry point to the LSTAR MSR, and write the SFMASK MSR to clear rflags upon entry.
+    // The userspace rflags will be preserved in r11.
+    MSR lstar_msr(MSR_LSTAR);
+    MSR sfmask_msr(MSR_SFMASK);
+    lstar_msr.set(reinterpret_cast<u64>(&syscall_entry));
+    sfmask_msr.set(~0x2);
+#endif
 }
 
 String Processor::features_string() const
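
The selector arithmetic encoded into STAR can be checked against the GDT selectors the entry stub pushes below (0x23 for user CS, 0x1b for user SS). A worked sketch:

    // sysret base = STAR[63:48] = 0x13:
    static_assert(0x13 + 0x10 == 0x23); // user code selector
    static_assert(0x13 + 0x08 == 0x1b); // user data (stack) selector
    // syscall base = STAR[47:32] = 0x08:
    static_assert(0x08 + 0x00 == 0x08); // kernel code selector
    static_assert(0x08 + 0x08 == 0x10); // kernel data (stack) selector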

+ 82 - 0
Kernel/Arch/x86/x86_64/SyscallEntry.cpp

@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2021, Owen Smith <yeeetari@gmail.com>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <Kernel/Arch/x86/DescriptorTable.h>
+#include <Kernel/Arch/x86/Processor.h>
+#include <Kernel/Arch/x86/TrapFrame.h>
+
+extern "C" void syscall_entry();
+extern "C" [[gnu::naked]] void syscall_entry()
+{
+    // clang-format off
+    asm(
+        // Store the user stack, then switch to the kernel stack.
+        "    movq %%rsp, %%gs:%c[user_stack] \n"
+        "    movq %%gs:%c[kernel_stack], %%rsp \n"
+
+        // Build RegisterState.
+        "    pushq $0x1b \n" // User ss
+        "    pushq %%gs:%c[user_stack] \n" // User rsp
+        "    sti \n" // It's now safe to enable interrupts, but we can't index into gs after this point
+        "    pushq %%r11 \n" // The CPU preserves the user rflags in r11
+        "    pushq $0x23 \n" // User cs
+        "    pushq %%rcx \n" // The CPU preserves the user IP in rcx
+        "    pushq $0 \n"
+        "    pushq %%r15 \n"
+        "    pushq %%r14 \n"
+        "    pushq %%r13 \n"
+        "    pushq %%r12 \n"
+        "    pushq %%r11 \n"
+        "    pushq %%r10 \n"
+        "    pushq %%r9 \n"
+        "    pushq %%r8 \n"
+        "    pushq %%rax \n"
+        "    pushq %%rcx \n"
+        "    pushq %%rdx \n"
+        "    pushq %%rbx \n"
+        "    pushq %%rsp \n"
+        "    pushq %%rbp \n"
+        "    pushq %%rsi \n"
+        "    pushq %%rdi \n"
+
+        "    pushq %%rsp \n" // TrapFrame::regs
+        "    subq $" __STRINGIFY(TRAP_FRAME_SIZE - 8) ", %%rsp \n"
+        "    movq %%rsp, %%rdi \n"
+        "    call enter_trap_no_irq \n"
+        "    movq %%rsp, %%rdi \n"
+        "    call syscall_handler \n"
+        "    movq %%rsp, %%rdi \n"
+        "    call exit_trap \n"
+        "    addq $" __STRINGIFY(TRAP_FRAME_SIZE) ", %%rsp \n" // Pop TrapFrame
+
+        "    popq %%rdi \n"
+        "    popq %%rsi \n"
+        "    popq %%rbp \n"
+        "    addq $8, %%rsp \n" // Skip restoring kernel rsp
+        "    popq %%rbx \n"
+        "    popq %%rdx \n"
+        "    popq %%rcx \n"
+        "    popq %%rax \n"
+        "    popq %%r8 \n"
+        "    popq %%r9 \n"
+        "    popq %%r10 \n"
+        "    popq %%r11 \n"
+        "    popq %%r12 \n"
+        "    popq %%r13 \n"
+        "    popq %%r14 \n"
+        "    popq %%r15 \n"
+        "    addq $8, %%rsp \n"
+        "    popq %%rcx \n"
+        "    addq $16, %%rsp \n"
+
+        // Disable interrupts before we restore the user stack pointer. sysret will re-enable interrupts when it restores
+        // rflags.
+        "    cli \n"
+        "    popq %%rsp \n"
+        "    sysretq \n"
+    :: [user_stack] "i"(Kernel::Processor::user_stack_offset()), [kernel_stack] "i"(Kernel::Processor::kernel_stack_offset()));
+    // clang-format on
+}
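
For reference, the architectural behavior of sysretq that this epilogue relies on (per the AMD64 manual; pseudocode, not SerenityOS code):

    // rip    <- rcx                (user return address, popped just above)
    // rflags <- r11                (user rflags; this is what re-enables
    //                               interrupts, hence the preceding cli)
    // cs     <- STAR[63:48] + 0x10 (0x23, ring 3 code)
    // ss     <- STAR[63:48] + 0x08 (0x1b, ring 3 data)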

+ 7 - 0
Kernel/CMakeLists.txt

@@ -311,6 +311,13 @@ if ("${SERENITY_ARCH}" STREQUAL "i686" OR "${SERENITY_ARCH}" STREQUAL "x86_64")
         ${CMAKE_CURRENT_SOURCE_DIR}/Arch/x86/common/SafeMem.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/Arch/x86/common/TrapFrame.cpp
     )
+
+    if("${SERENITY_ARCH}" STREQUAL "x86_64")
+        set(KERNEL_SOURCES
+            ${KERNEL_SOURCES}
+            ${CMAKE_CURRENT_SOURCE_DIR}/Arch/x86/${KERNEL_ARCH}/SyscallEntry.cpp
+        )
+    endif()
 endif()
 
 set(AK_SOURCES