From 766db673c18e4618e29ec192179503ebbb004517 Mon Sep 17 00:00:00 2001
From: Tom
Date: Wed, 2 Dec 2020 15:03:07 -0700
Subject: [PATCH] Kernel: Flush TLBs concurrently

Instead of flushing the TLB on the current processor first and then
notifying the other processors to do the same, notify the others first,
and flush our own TLB while waiting on them.
---
 Kernel/Arch/i386/CPU.cpp | 63 +++++++++++++++++++++++++---------------
 Kernel/Arch/i386/CPU.h   |  3 +-
 2 files changed, 41 insertions(+), 25 deletions(-)

diff --git a/Kernel/Arch/i386/CPU.cpp b/Kernel/Arch/i386/CPU.cpp
index eff69f96589..84ae27b59a0 100644
--- a/Kernel/Arch/i386/CPU.cpp
+++ b/Kernel/Arch/i386/CPU.cpp
@@ -1711,9 +1711,10 @@ void Processor::flush_tlb_local(VirtualAddress vaddr, size_t page_count)
 
 void Processor::flush_tlb(VirtualAddress vaddr, size_t page_count)
 {
-    flush_tlb_local(vaddr, page_count);
     if (s_smp_enabled)
         smp_broadcast_flush_tlb(vaddr, page_count);
+    else
+        flush_tlb_local(vaddr, page_count);
 }
 
 static volatile ProcessorMessage* s_message_pool;
@@ -1871,61 +1872,70 @@ bool Processor::smp_queue_message(ProcessorMessage& msg)
     return next == nullptr;
 }
 
-void Processor::smp_broadcast_message(ProcessorMessage& msg, bool async)
+void Processor::smp_broadcast_message(ProcessorMessage& msg)
 {
     auto& cur_proc = Processor::current();
-    msg.async = async;
 #ifdef SMP_DEBUG
     dbg() << "SMP[" << cur_proc.id() << "]: Broadcast message " << VirtualAddress(&msg) << " to cpus: " << (count()) << " proc: " << VirtualAddress(&cur_proc);
 #endif
     atomic_store(&msg.refs, count() - 1, AK::MemoryOrder::memory_order_release);
     ASSERT(msg.refs > 0);
+    bool need_broadcast = false;
     for_each(
         [&](Processor& proc) -> IterationDecision {
             if (&proc != &cur_proc) {
-                if (proc.smp_queue_message(msg)) {
-                    // TODO: only send IPI to that CPU if we queued the first
-                }
+                if (proc.smp_queue_message(msg))
+                    need_broadcast = true;
             }
             return IterationDecision::Continue;
         });
 
-    // Now trigger an IPI on all other APs
-    APIC::the().broadcast_ipi();
+    // Now trigger an IPI on all other APs (unless all targets already had messages queued)
+    if (need_broadcast)
+        APIC::the().broadcast_ipi();
+}
 
-    if (!async) {
-        // If synchronous then we must cleanup and return the message back
-        // to the pool. Otherwise, the last processor to complete it will return it
-        while (atomic_load(&msg.refs, AK::MemoryOrder::memory_order_consume) != 0) {
-            // TODO: pause for a bit?
+void Processor::smp_broadcast_wait_sync(ProcessorMessage& msg)
+{
+    auto& cur_proc = Processor::current();
+    ASSERT(!msg.async);
+    // If synchronous then we must cleanup and return the message back
+    // to the pool. Otherwise, the last processor to complete it will return it
+    while (atomic_load(&msg.refs, AK::MemoryOrder::memory_order_consume) != 0) {
+        // TODO: pause for a bit?
 
-            // We need to process any messages that may have been sent to
-            // us while we're waiting. This also checks if another processor
-            // may have requested us to halt.
-            cur_proc.smp_process_pending_messages();
-        }
+        // We need to process any messages that may have been sent to
+        // us while we're waiting. This also checks if another processor
+        // may have requested us to halt.
+        cur_proc.smp_process_pending_messages();
+    }
 
-        smp_cleanup_message(msg);
-        smp_return_to_pool(msg);
-    }
+    smp_cleanup_message(msg);
+    smp_return_to_pool(msg);
 }
 
 void Processor::smp_broadcast(void (*callback)(void*), void* data, void (*free_data)(void*), bool async)
 {
     auto& msg = smp_get_from_pool();
+    msg.async = async;
     msg.type = ProcessorMessage::CallbackWithData;
     msg.callback_with_data.handler = callback;
     msg.callback_with_data.data = data;
     msg.callback_with_data.free = free_data;
-    smp_broadcast_message(msg, async);
+    smp_broadcast_message(msg);
+    if (!async)
+        smp_broadcast_wait_sync(msg);
 }
 
 void Processor::smp_broadcast(void (*callback)(), bool async)
 {
     auto& msg = smp_get_from_pool();
+    msg.async = async;
     msg.type = ProcessorMessage::Callback;
     msg.callback.handler = callback;
-    smp_broadcast_message(msg, async);
+    smp_broadcast_message(msg);
+    if (!async)
+        smp_broadcast_wait_sync(msg);
 }
 
 void Processor::smp_unicast_message(u32 cpu, ProcessorMessage& msg, bool async)
@@ -1980,10 +1990,15 @@ void Processor::smp_unicast(u32 cpu, void (*callback)(), bool async)
 void Processor::smp_broadcast_flush_tlb(VirtualAddress vaddr, size_t page_count)
 {
     auto& msg = smp_get_from_pool();
+    msg.async = false;
     msg.type = ProcessorMessage::FlushTlb;
     msg.flush_tlb.ptr = vaddr.as_ptr();
     msg.flush_tlb.page_count = page_count;
-    smp_broadcast_message(msg, false);
+    smp_broadcast_message(msg);
+    // While the other processors handle this request, we'll flush ours
+    flush_tlb_local(vaddr, page_count);
+    // Now wait until everybody is done as well
+    smp_broadcast_wait_sync(msg);
 }
 
 void Processor::smp_broadcast_halt()
diff --git a/Kernel/Arch/i386/CPU.h b/Kernel/Arch/i386/CPU.h
index 9f6a6b9d1c5..ace5e0fc688 100644
--- a/Kernel/Arch/i386/CPU.h
+++ b/Kernel/Arch/i386/CPU.h
@@ -740,7 +740,8 @@ class Processor {
     static void smp_cleanup_message(ProcessorMessage& msg);
     bool smp_queue_message(ProcessorMessage& msg);
     static void smp_unicast_message(u32 cpu, ProcessorMessage& msg, bool async);
-    static void smp_broadcast_message(ProcessorMessage& msg, bool async);
+    static void smp_broadcast_message(ProcessorMessage& msg);
+    static void smp_broadcast_wait_sync(ProcessorMessage& msg);
     static void smp_broadcast_halt();
 
     void deferred_call_pool_init();
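
Note on the pattern (not part of the patch): the point of splitting smp_broadcast_message() from
smp_broadcast_wait_sync() is that the initiating processor's own flush_tlb_local() now overlaps
with the other processors handling the IPI, instead of running before the broadcast. Below is a
minimal userspace sketch of that overlap, assuming only the C++ standard library; std::atomic<int>
stands in for ProcessorMessage::refs, std::thread for the other processors, and do_local_work() is
a hypothetical stand-in for flush_tlb_local().

// overlap_sketch.cpp - illustrative only, not kernel code.
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

// Stand-in for ProcessorMessage::refs: how many peers still have work pending.
static std::atomic<int> s_refs { 0 };

// Hypothetical stand-in for flush_tlb_local().
static void do_local_work(int id)
{
    std::printf("worker %d: flushing local TLB analogue\n", id);
}

int main()
{
    constexpr int other_cpus = 3;

    // Analogue of smp_broadcast_message(): arm the refcount, then wake the peers.
    s_refs.store(other_cpus, std::memory_order_release);
    std::vector<std::thread> peers;
    for (int i = 1; i <= other_cpus; ++i) {
        peers.emplace_back([i] {
            do_local_work(i);
            // Analogue of a peer finishing its message: drop one reference.
            s_refs.fetch_sub(1, std::memory_order_acq_rel);
        });
    }

    // Analogue of the new flush_tlb_local() call in smp_broadcast_flush_tlb():
    // do our own share while the peers are still working...
    do_local_work(0);

    // ...then the analogue of smp_broadcast_wait_sync(): spin until everyone
    // else is done as well. (The kernel version also processes incoming
    // messages while spinning; here we just yield.)
    while (s_refs.load(std::memory_order_acquire) != 0)
        std::this_thread::yield();

    for (auto& t : peers)
        t.join();
    std::printf("all flushes complete\n");
    return 0;
}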