Processor.cpp 49 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415
  1. /*
  2. * Copyright (c) 2018-2021, Andreas Kling <kling@serenityos.org>
  3. * Copyright (c) 2022, Linus Groh <linusg@serenityos.org>
  4. * Copyright (c) 2022, the SerenityOS developers.
  5. *
  6. * SPDX-License-Identifier: BSD-2-Clause
  7. */
  8. #include <AK/BuiltinWrappers.h>
  9. #include <AK/Format.h>
  10. #include <AK/StdLibExtras.h>
  11. #include <AK/StringBuilder.h>
  12. #include <AK/Types.h>
  13. #include <Kernel/Interrupts/APIC.h>
  14. #include <Kernel/Process.h>
  15. #include <Kernel/Scheduler.h>
  16. #include <Kernel/Sections.h>
  17. #include <Kernel/StdLib.h>
  18. #include <Kernel/Thread.h>
  19. #include <Kernel/Arch/Processor.h>
  20. #include <Kernel/Arch/ScopedCritical.h>
  21. #include <Kernel/Arch/x86/CPUID.h>
  22. #include <Kernel/Arch/x86/InterruptDisabler.h>
  23. #include <Kernel/Arch/x86/Interrupts.h>
  24. #include <Kernel/Arch/x86/MSR.h>
  25. #include <Kernel/Arch/x86/ProcessorInfo.h>
  26. #include <Kernel/Arch/x86/SafeMem.h>
  27. #include <Kernel/Arch/x86/TrapFrame.h>
  28. #include <Kernel/Memory/PageDirectory.h>
  29. #include <Kernel/Memory/ScopedAddressSpaceSwitcher.h>
  30. namespace Kernel {
  // Reference FPU register image. Processor::initialize() fills it on CPU 0
  // (fninit followed by fxsave or fnsave).
  READONLY_AFTER_INIT FPUState Processor::s_clean_fpu_state;
  // Table of Processor pointers indexed by CPU id; each CPU stores itself here in
  // Processor::initialize().
  READONLY_AFTER_INIT static ProcessorContainer s_processors {};
  // Count of processors that went through early_initialize(): CPU 0 stores 1,
  // every other CPU adds 1.
  READONLY_AFTER_INIT Atomic<u32> Processor::g_total_processors;
  // Value returned by Processor::is_smp_enabled(); CPU 0 resets it to false in
  // early_initialize().
  READONLY_AFTER_INIT static volatile bool s_smp_enabled;
  // Head of the linked list of pooled ProcessorMessage objects
  // (pushed by smp_return_to_pool(), read by smp_get_from_pool()).
  static Atomic<ProcessorMessage*> s_message_pool;
  // NOTE(review): presumably one bit per idle CPU - not read or written in this
  // part of the file, confirm against its users.
  Atomic<u32> Processor::s_idle_cpu_mask { 0 };
  // The compiler can't see the calls to these functions inside assembly.
  // Declare them, to avoid dead code warnings.
  extern "C" void context_first_init(Thread* from_thread, Thread* to_thread, TrapFrame* trap) __attribute__((used));
  extern "C" void enter_thread_context(Thread* from_thread, Thread* to_thread) __attribute__((used));
  extern "C" FlatPtr do_init_context(Thread* thread, u32 flags) __attribute__((used));
  // Its address is written to the LSTAR MSR by cpu_setup() on x86_64.
  extern "C" void syscall_entry();
  43. bool Processor::is_smp_enabled()
  44. {
  45. return s_smp_enabled;
  46. }
  47. UNMAP_AFTER_INIT static void sse_init()
  48. {
  49. write_cr0((read_cr0() & 0xfffffffbu) | 0x2);
  50. write_cr4(read_cr4() | 0x600);
  51. }
  52. void exit_kernel_thread(void)
  53. {
  54. Thread::current()->exit();
  55. }
  56. UNMAP_AFTER_INIT void Processor::cpu_detect()
  57. {
  58. // NOTE: This is called during Processor::early_initialize, we cannot
  59. // safely log at this point because we don't have kmalloc
  60. // initialized yet!
  61. m_features = CPUFeature::Type(0u);
  62. CPUID processor_info(0x1);
  63. auto handle_edx_bit_11_feature = [&] {
  64. u32 stepping = processor_info.eax() & 0xf;
  65. u32 model = (processor_info.eax() >> 4) & 0xf;
  66. u32 family = (processor_info.eax() >> 8) & 0xf;
  67. // FIXME: I have no clue what these mean or where it's from (the Intel manual I've seen just says EDX[11] is SEP).
  68. // If you do, please convert them to constants or add comments!
  69. if (!(family == 6 && model < 3 && stepping < 3))
  70. m_features |= CPUFeature::SEP;
  71. if ((family == 6 && model >= 3) || (family == 0xf && model >= 0xe))
  72. m_features |= CPUFeature::CONSTANT_TSC;
  73. };
  74. if (processor_info.ecx() & (1 << 0))
  75. m_features |= CPUFeature::SSE3;
  76. if (processor_info.ecx() & (1 << 9))
  77. m_features |= CPUFeature::SSSE3;
  78. if (processor_info.ecx() & (1 << 19))
  79. m_features |= CPUFeature::SSE4_1;
  80. if (processor_info.ecx() & (1 << 20))
  81. m_features |= CPUFeature::SSE4_2;
  82. if (processor_info.ecx() & (1 << 26))
  83. m_features |= CPUFeature::XSAVE;
  84. if (processor_info.ecx() & (1 << 28))
  85. m_features |= CPUFeature::AVX;
  86. if (processor_info.ecx() & (1 << 30))
  87. m_features |= CPUFeature::RDRAND;
  88. if (processor_info.ecx() & (1 << 31))
  89. m_features |= CPUFeature::HYPERVISOR;
  90. if (processor_info.edx() & (1 << 4))
  91. m_features |= CPUFeature::TSC;
  92. if (processor_info.edx() & (1 << 6))
  93. m_features |= CPUFeature::PAE;
  94. if (processor_info.edx() & (1 << 13))
  95. m_features |= CPUFeature::PGE;
  96. if (processor_info.edx() & (1 << 11))
  97. handle_edx_bit_11_feature();
  98. if (processor_info.edx() & (1 << 16))
  99. m_features |= CPUFeature::PAT;
  100. if (processor_info.edx() & (1 << 23))
  101. m_features |= CPUFeature::MMX;
  102. if (processor_info.edx() & (1 << 24))
  103. m_features |= CPUFeature::FXSR;
  104. if (processor_info.edx() & (1 << 25))
  105. m_features |= CPUFeature::SSE;
  106. if (processor_info.edx() & (1 << 26))
  107. m_features |= CPUFeature::SSE2;
  108. CPUID extended_features(0x7);
  109. if (extended_features.ebx() & (1 << 7))
  110. m_features |= CPUFeature::SMEP;
  111. if (extended_features.ebx() & (1 << 18))
  112. m_features |= CPUFeature::RDSEED;
  113. if (extended_features.ebx() & (1 << 20))
  114. m_features |= CPUFeature::SMAP;
  115. if (extended_features.ecx() & (1 << 2))
  116. m_features |= CPUFeature::UMIP;
  117. u32 max_extended_leaf = CPUID(0x80000000).eax();
  118. if (max_extended_leaf >= 0x80000001) {
  119. CPUID extended_processor_info(0x80000001);
  120. if (extended_processor_info.edx() & (1 << 11))
  121. m_features |= CPUFeature::SYSCALL; // Only available in 64 bit mode
  122. if (extended_processor_info.edx() & (1 << 20))
  123. m_features |= CPUFeature::NX;
  124. if (extended_processor_info.edx() & (1 << 27))
  125. m_features |= CPUFeature::RDTSCP;
  126. if (extended_processor_info.edx() & (1 << 29))
  127. m_features |= CPUFeature::LM;
  128. }
  129. if (max_extended_leaf >= 0x80000007) {
  130. CPUID cpuid(0x80000007);
  131. if (cpuid.edx() & (1 << 8)) {
  132. m_features |= CPUFeature::CONSTANT_TSC;
  133. m_features |= CPUFeature::NONSTOP_TSC;
  134. }
  135. }
  136. #if ARCH(X86_64)
  137. m_has_qemu_hvf_quirk = false;
  138. #endif
  139. if (max_extended_leaf >= 0x80000008) {
  140. // CPUID.80000008H:EAX[7:0] reports the physical-address width supported by the processor.
  141. CPUID cpuid(0x80000008);
  142. m_physical_address_bit_width = cpuid.eax() & 0xff;
  143. // CPUID.80000008H:EAX[15:8] reports the linear-address width supported by the processor.
  144. m_virtual_address_bit_width = (cpuid.eax() >> 8) & 0xff;
  145. } else {
  146. // For processors that do not support CPUID function 80000008H, the width is generally 36 if CPUID.01H:EDX.PAE [bit 6] = 1 and 32 otherwise.
  147. m_physical_address_bit_width = has_feature(CPUFeature::PAE) ? 36 : 32;
  148. // Processors that do not support CPUID function 80000008H, support a linear-address width of 32.
  149. m_virtual_address_bit_width = 32;
  150. #if ARCH(X86_64)
  151. // Workaround QEMU hypervisor.framework bug
  152. // https://gitlab.com/qemu-project/qemu/-/issues/664
  153. //
  154. // We detect this as follows:
  155. // * We're in a hypervisor
  156. // * hypervisor_leaf_range is null under Hypervisor.framework
  157. // * m_physical_address_bit_width is 36 bits
  158. if (has_feature(CPUFeature::HYPERVISOR)) {
  159. CPUID hypervisor_leaf_range(0x40000000);
  160. if (!hypervisor_leaf_range.ebx() && m_physical_address_bit_width == 36) {
  161. m_has_qemu_hvf_quirk = true;
  162. m_virtual_address_bit_width = 48;
  163. }
  164. }
  165. #endif
  166. }
  167. }
  168. UNMAP_AFTER_INIT void Processor::cpu_setup()
  169. {
  170. // NOTE: This is called during Processor::early_initialize, we cannot
  171. // safely log at this point because we don't have kmalloc
  172. // initialized yet!
  173. cpu_detect();
  174. if (has_feature(CPUFeature::SSE)) {
  175. // enter_thread_context() assumes that if a x86 CPU supports SSE then it also supports FXSR.
  176. // SSE support without FXSR is an extremely unlikely scenario, so let's be pragmatic about it.
  177. VERIFY(has_feature(CPUFeature::FXSR));
  178. sse_init();
  179. }
  180. write_cr0(read_cr0() | 0x00010000);
  181. if (has_feature(CPUFeature::PGE)) {
  182. // Turn on CR4.PGE so the CPU will respect the G bit in page tables.
  183. write_cr4(read_cr4() | 0x80);
  184. }
  185. if (has_feature(CPUFeature::NX)) {
  186. // Turn on IA32_EFER.NXE
  187. MSR ia32_efer(MSR_IA32_EFER);
  188. ia32_efer.set(ia32_efer.get() | 0x800);
  189. }
  190. if (has_feature(CPUFeature::PAT)) {
  191. MSR ia32_pat(MSR_IA32_PAT);
  192. // Set PA4 to Write Comine. This allows us to
  193. // use this mode by only setting the bit in the PTE
  194. // and leaving all other bits in the upper levels unset,
  195. // which maps to setting bit 3 of the index, resulting
  196. // in the index value 0 or 4.
  197. u64 pat = ia32_pat.get() & ~(0x7ull << 32);
  198. pat |= 0x1ull << 32; // set WC mode for PA4
  199. ia32_pat.set(pat);
  200. }
  201. if (has_feature(CPUFeature::SMEP)) {
  202. // Turn on CR4.SMEP
  203. write_cr4(read_cr4() | 0x100000);
  204. }
  205. if (has_feature(CPUFeature::SMAP)) {
  206. // Turn on CR4.SMAP
  207. write_cr4(read_cr4() | 0x200000);
  208. }
  209. if (has_feature(CPUFeature::UMIP)) {
  210. write_cr4(read_cr4() | 0x800);
  211. }
  212. if (has_feature(CPUFeature::TSC)) {
  213. write_cr4(read_cr4() | 0x4);
  214. }
  215. if (has_feature(CPUFeature::XSAVE)) {
  216. // Turn on CR4.OSXSAVE
  217. write_cr4(read_cr4() | 0x40000);
  218. // According to the Intel manual: "After reset, all bits (except bit 0) in XCR0 are cleared to zero; XCR0[0] is set to 1."
  219. // Sadly we can't trust this, for example VirtualBox starts with bits 0-4 set, so let's do it ourselves.
  220. write_xcr0(0x1);
  221. if (has_feature(CPUFeature::AVX)) {
  222. // Turn on SSE, AVX and x87 flags
  223. write_xcr0(read_xcr0() | 0x7);
  224. }
  225. }
  226. #if ARCH(X86_64)
  227. // x86_64 processors must support the syscall feature.
  228. VERIFY(has_feature(CPUFeature::SYSCALL));
  229. MSR efer_msr(MSR_EFER);
  230. efer_msr.set(efer_msr.get() | 1u);
  231. // Write code and stack selectors to the STAR MSR. The first value stored in bits 63:48 controls the sysret CS (value + 0x10) and SS (value + 0x8),
  232. // and the value stored in bits 47:32 controls the syscall CS (value) and SS (value + 0x8).
  233. u64 star = 0;
  234. star |= 0x13ul << 48u;
  235. star |= 0x08ul << 32u;
  236. MSR star_msr(MSR_STAR);
  237. star_msr.set(star);
  238. // Write the syscall entry point to the LSTAR MSR.
  239. MSR lstar_msr(MSR_LSTAR);
  240. lstar_msr.set(reinterpret_cast<u64>(&syscall_entry));
  241. // Write the SFMASK MSR. This MSR controls which bits of rflags are masked when a syscall instruction is executed -
  242. // if a bit is set in sfmask, the corresponding bit in rflags is cleared. The value set here clears most of rflags,
  243. // but keeps the reserved and virtualization bits intact. The userspace rflags value is saved in r11 by syscall.
  244. constexpr u64 rflags_mask = 0x257fd5u;
  245. MSR sfmask_msr(MSR_SFMASK);
  246. sfmask_msr.set(rflags_mask);
  247. #endif
  248. }
  249. NonnullOwnPtr<KString> Processor::features_string() const
  250. {
  251. StringBuilder builder;
  252. auto feature_to_str = [](CPUFeature::Type const& feature) -> StringView {
  253. if (feature == CPUFeature::NX)
  254. return "nx"sv;
  255. if (feature == CPUFeature::PAE)
  256. return "pae"sv;
  257. if (feature == CPUFeature::PGE)
  258. return "pge"sv;
  259. if (feature == CPUFeature::RDRAND)
  260. return "rdrand"sv;
  261. if (feature == CPUFeature::RDSEED)
  262. return "rdseed"sv;
  263. if (feature == CPUFeature::SMAP)
  264. return "smap"sv;
  265. if (feature == CPUFeature::SMEP)
  266. return "smep"sv;
  267. if (feature == CPUFeature::SSE)
  268. return "sse"sv;
  269. if (feature == CPUFeature::TSC)
  270. return "tsc"sv;
  271. if (feature == CPUFeature::RDTSCP)
  272. return "rdtscp"sv;
  273. if (feature == CPUFeature::CONSTANT_TSC)
  274. return "constant_tsc"sv;
  275. if (feature == CPUFeature::NONSTOP_TSC)
  276. return "nonstop_tsc"sv;
  277. if (feature == CPUFeature::UMIP)
  278. return "umip"sv;
  279. if (feature == CPUFeature::SEP)
  280. return "sep"sv;
  281. if (feature == CPUFeature::SYSCALL)
  282. return "syscall"sv;
  283. if (feature == CPUFeature::MMX)
  284. return "mmx"sv;
  285. if (feature == CPUFeature::FXSR)
  286. return "fxsr"sv;
  287. if (feature == CPUFeature::SSE2)
  288. return "sse2"sv;
  289. if (feature == CPUFeature::SSE3)
  290. return "sse3"sv;
  291. if (feature == CPUFeature::SSSE3)
  292. return "ssse3"sv;
  293. if (feature == CPUFeature::SSE4_1)
  294. return "sse4.1"sv;
  295. if (feature == CPUFeature::SSE4_2)
  296. return "sse4.2"sv;
  297. if (feature == CPUFeature::XSAVE)
  298. return "xsave"sv;
  299. if (feature == CPUFeature::AVX)
  300. return "avx"sv;
  301. if (feature == CPUFeature::LM)
  302. return "lm"sv;
  303. if (feature == CPUFeature::HYPERVISOR)
  304. return "hypervisor"sv;
  305. if (feature == CPUFeature::PAT)
  306. return "pat"sv;
  307. VERIFY_NOT_REACHED();
  308. };
  309. bool first = true;
  310. for (auto feature = CPUFeature::Type(1u); feature != CPUFeature::__End; feature <<= 1u) {
  311. if (has_feature(feature)) {
  312. if (first)
  313. first = false;
  314. else
  315. MUST(builder.try_append(' '));
  316. auto str = feature_to_str(feature);
  317. MUST(builder.try_append(str));
  318. }
  319. }
  320. return KString::must_create(builder.string_view());
  321. }
  322. UNMAP_AFTER_INIT void Processor::early_initialize(u32 cpu)
  323. {
  324. m_self = this;
  325. m_cpu = cpu;
  326. m_in_irq = 0;
  327. m_in_critical = 0;
  328. m_invoke_scheduler_async = false;
  329. m_scheduler_initialized = false;
  330. m_in_scheduler = true;
  331. m_message_queue = nullptr;
  332. m_idle_thread = nullptr;
  333. m_current_thread = nullptr;
  334. m_info = nullptr;
  335. m_halt_requested = false;
  336. if (cpu == 0) {
  337. s_smp_enabled = false;
  338. g_total_processors.store(1u, AK::MemoryOrder::memory_order_release);
  339. } else {
  340. g_total_processors.fetch_add(1u, AK::MemoryOrder::memory_order_acq_rel);
  341. }
  342. deferred_call_pool_init();
  343. cpu_setup();
  344. gdt_init();
  345. VERIFY(is_initialized()); // sanity check
  346. VERIFY(&current() == this); // sanity check
  347. }
  348. UNMAP_AFTER_INIT void Processor::initialize(u32 cpu)
  349. {
  350. VERIFY(m_self == this);
  351. VERIFY(&current() == this); // sanity check
  352. dmesgln("CPU[{}]: Supported features: {}", current_id(), features_string());
  353. if (!has_feature(CPUFeature::RDRAND))
  354. dmesgln("CPU[{}]: No RDRAND support detected, randomness will be poor", current_id());
  355. dmesgln("CPU[{}]: Physical address bit width: {}", current_id(), m_physical_address_bit_width);
  356. dmesgln("CPU[{}]: Virtual address bit width: {}", current_id(), m_virtual_address_bit_width);
  357. #if ARCH(X86_64)
  358. if (m_has_qemu_hvf_quirk)
  359. dmesgln("CPU[{}]: Applied correction for QEMU Hypervisor.framework quirk", current_id());
  360. #endif
  361. if (cpu == 0)
  362. idt_init();
  363. else
  364. flush_idt();
  365. if (cpu == 0) {
  366. VERIFY((FlatPtr(&s_clean_fpu_state) & 0xF) == 0);
  367. asm volatile("fninit");
  368. if (has_feature(CPUFeature::FXSR))
  369. asm volatile("fxsave %0"
  370. : "=m"(s_clean_fpu_state));
  371. else
  372. asm volatile("fnsave %0"
  373. : "=m"(s_clean_fpu_state));
  374. if (has_feature(CPUFeature::HYPERVISOR))
  375. detect_hypervisor();
  376. }
  377. m_info = new ProcessorInfo(*this);
  378. {
  379. // We need to prevent races between APs starting up at the same time
  380. VERIFY(cpu < s_processors.size());
  381. s_processors[cpu] = this;
  382. }
  383. }
  384. UNMAP_AFTER_INIT void Processor::detect_hypervisor()
  385. {
  386. CPUID hypervisor_leaf_range(0x40000000);
  387. // Get signature of hypervisor.
  388. alignas(sizeof(u32)) char hypervisor_signature_buffer[13];
  389. *reinterpret_cast<u32*>(hypervisor_signature_buffer) = hypervisor_leaf_range.ebx();
  390. *reinterpret_cast<u32*>(hypervisor_signature_buffer + 4) = hypervisor_leaf_range.ecx();
  391. *reinterpret_cast<u32*>(hypervisor_signature_buffer + 8) = hypervisor_leaf_range.edx();
  392. hypervisor_signature_buffer[12] = '\0';
  393. StringView hypervisor_signature(hypervisor_signature_buffer);
  394. dmesgln("CPU[{}]: CPUID hypervisor signature '{}' ({:#x} {:#x} {:#x}), max leaf {:#x}", current_id(), hypervisor_signature, hypervisor_leaf_range.ebx(), hypervisor_leaf_range.ecx(), hypervisor_leaf_range.edx(), hypervisor_leaf_range.eax());
  395. if (hypervisor_signature == "Microsoft Hv"sv)
  396. detect_hypervisor_hyperv(hypervisor_leaf_range);
  397. }
  398. UNMAP_AFTER_INIT void Processor::detect_hypervisor_hyperv(CPUID const& hypervisor_leaf_range)
  399. {
  400. if (hypervisor_leaf_range.eax() < 0x40000001)
  401. return;
  402. CPUID hypervisor_interface(0x40000001);
  403. // Get signature of hypervisor interface.
  404. alignas(sizeof(u32)) char interface_signature_buffer[5];
  405. *reinterpret_cast<u32*>(interface_signature_buffer) = hypervisor_interface.eax();
  406. interface_signature_buffer[4] = '\0';
  407. StringView hyperv_interface_signature(interface_signature_buffer);
  408. dmesgln("CPU[{}]: Hyper-V interface signature '{}' ({:#x})", current_id(), hyperv_interface_signature, hypervisor_interface.eax());
  409. if (hypervisor_leaf_range.eax() < 0x40000001)
  410. return;
  411. CPUID hypervisor_sysid(0x40000002);
  412. dmesgln("CPU[{}]: Hyper-V system identity {}.{}, build number {}", current_id(), hypervisor_sysid.ebx() >> 16, hypervisor_sysid.ebx() & 0xFFFF, hypervisor_sysid.eax());
  413. if (hypervisor_leaf_range.eax() < 0x40000005 || hyperv_interface_signature != "Hv#1"sv)
  414. return;
  415. dmesgln("CPU[{}]: Hyper-V hypervisor detected", current_id());
  416. // TODO: Actually do something with Hyper-V.
  417. }
  418. void Processor::write_raw_gdt_entry(u16 selector, u32 low, u32 high)
  419. {
  420. u16 i = (selector & 0xfffc) >> 3;
  421. u32 prev_gdt_length = m_gdt_length;
  422. if (i >= m_gdt_length) {
  423. m_gdt_length = i + 1;
  424. VERIFY(m_gdt_length <= sizeof(m_gdt) / sizeof(m_gdt[0]));
  425. m_gdtr.limit = (m_gdt_length + 1) * 8 - 1;
  426. }
  427. m_gdt[i].low = low;
  428. m_gdt[i].high = high;
  429. // clear selectors we may have skipped
  430. for (auto j = prev_gdt_length; j < i; ++j) {
  431. m_gdt[j].low = 0;
  432. m_gdt[j].high = 0;
  433. }
  434. }
  435. void Processor::write_gdt_entry(u16 selector, Descriptor& descriptor)
  436. {
  437. write_raw_gdt_entry(selector, descriptor.low, descriptor.high);
  438. }
  439. Descriptor& Processor::get_gdt_entry(u16 selector)
  440. {
  441. u16 i = (selector & 0xfffc) >> 3;
  442. return *(Descriptor*)(&m_gdt[i]);
  443. }
  444. void Processor::flush_gdt()
  445. {
  446. m_gdtr.address = m_gdt;
  447. m_gdtr.limit = (m_gdt_length * 8) - 1;
  448. asm volatile("lgdt %0" ::"m"(m_gdtr)
  449. : "memory");
  450. }
  451. const DescriptorTablePointer& Processor::get_gdtr()
  452. {
  453. return m_gdtr;
  454. }
  // Collects a stack trace for `thread` by following its chain of saved frame pointers.
  //
  // The first entry of the result is the instruction pointer the walk started from;
  // each following entry is a return address read from the stack. When `max_frames`
  // is non-zero the result holds at most `max_frames` entries; in every case the walk
  // stops once 4096 entries have been collected, when a frame pointer or return
  // address is zero, or when a stack read fails.
  //
  // Returns the collected addresses, or the error from a failed Vector append.
  ErrorOr<Vector<FlatPtr, 32>> Processor::capture_stack_trace(Thread& thread, size_t max_frames)
  {
      FlatPtr frame_ptr = 0, ip = 0;
      Vector<FlatPtr, 32> stack_trace;

      // Walks the frame chain starting at `stack_ptr`, appending `ip` first and then
      // every return address found. Reads [stack_ptr + 1 word] as the return address
      // and [stack_ptr] as the next frame pointer.
      auto walk_stack = [&](FlatPtr stack_ptr) -> ErrorOr<void> {
          constexpr size_t max_stack_frames = 4096;
          bool is_walking_userspace_stack = false;
          TRY(stack_trace.try_append(ip));
          // `ip` already counts as the first frame.
          size_t count = 1;
          while (stack_ptr && stack_trace.size() < max_stack_frames) {
              FlatPtr retaddr;

              count++;
              if (max_frames != 0 && count > max_frames)
                  break;

              // Once the walk has reached a userspace frame it must never go back
              // to a kernel address.
              if (!Memory::is_user_address(VirtualAddress { stack_ptr })) {
                  if (is_walking_userspace_stack) {
                      dbgln("SHENANIGANS! Userspace stack points back into kernel memory");
                      break;
                  }
              } else {
                  is_walking_userspace_stack = true;
              }

              // Userspace frames are read with copy_from_user, everything else with
              // safe_memcpy; either way a failed read ends the walk.
              if (Memory::is_user_range(VirtualAddress(stack_ptr), sizeof(FlatPtr) * 2)) {
                  if (copy_from_user(&retaddr, &((FlatPtr*)stack_ptr)[1]).is_error() || !retaddr)
                      break;
                  TRY(stack_trace.try_append(retaddr));
                  if (copy_from_user(&stack_ptr, (FlatPtr*)stack_ptr).is_error())
                      break;
              } else {
                  void* fault_at;
                  if (!safe_memcpy(&retaddr, &((FlatPtr*)stack_ptr)[1], sizeof(FlatPtr), fault_at) || !retaddr)
                      break;
                  TRY(stack_trace.try_append(retaddr));
                  if (!safe_memcpy(&stack_ptr, (FlatPtr*)stack_ptr, sizeof(FlatPtr), fault_at))
                      break;
              }
          }
          return {};
      };
      // Starts the walk from this lambda's own frame address and return address.
      auto capture_current_thread = [&]() {
          frame_ptr = (FlatPtr)__builtin_frame_address(0);
          ip = (FlatPtr)__builtin_return_address(0);

          return walk_stack(frame_ptr);
      };

      // Since the thread may be running on another processor, there
      // is a chance a context switch may happen while we're trying
      // to get it. It also won't be entirely accurate and merely
      // reflect the status at the last context switch.
      SpinlockLocker lock(g_scheduler_lock);
      if (&thread == Processor::current_thread()) {
          VERIFY(thread.state() == Thread::State::Running);
          // Leave the scheduler lock. If we trigger page faults we may
          // need to be preempted. Since this is our own thread it won't
          // cause any problems as the stack won't change below this frame.
          lock.unlock();
          TRY(capture_current_thread());
      } else if (thread.is_active()) {
          VERIFY(thread.cpu() != Processor::current_id());
          // If this is the case, the thread is currently running
          // on another processor. We can't trust the kernel stack as
          // it may be changing at any time. We need to probably send
          // an IPI to that processor, have it walk the stack and wait
          // until it returns the data back to us
          auto& proc = Processor::current();
          // Written by the callback below; read after smp_unicast() returns.
          ErrorOr<void> result;
          smp_unicast(
              thread.cpu(),
              [&]() {
                  dbgln("CPU[{}] getting stack for cpu #{}", Processor::current_id(), proc.id());
                  ScopedAddressSpaceSwitcher switcher(thread.process());
                  VERIFY(&Processor::current() != &proc);
                  VERIFY(&thread == Processor::current_thread());
                  // NOTE: Because the other processor is still holding the
                  // scheduler lock while waiting for this callback to finish,
                  // the current thread on the target processor cannot change

                  // TODO: What to do about page faults here? We might deadlock
                  //       because the other processor is still holding the
                  //       scheduler lock...
                  result = capture_current_thread();
              },
              false);
          TRY(result);
      } else {
          switch (thread.state()) {
          case Thread::State::Running:
              VERIFY_NOT_REACHED(); // should have been handled above
          case Thread::State::Runnable:
          case Thread::State::Stopped:
          case Thread::State::Blocked:
          case Thread::State::Dying:
          case Thread::State::Dead: {
              // We need to retrieve ebp from what was last pushed to the kernel
              // stack. Before switching out of that thread, switch_context
              // pushed the callee-saved registers, and the last of them happens
              // to be ebp.
              ScopedAddressSpaceSwitcher switcher(thread.process());
              auto& regs = thread.regs();
              auto* stack_top = reinterpret_cast<FlatPtr*>(regs.sp());
              // A failed read leaves frame_ptr at 0, so only `ip` ends up in the trace.
              if (Memory::is_user_range(VirtualAddress(stack_top), sizeof(FlatPtr))) {
                  if (copy_from_user(&frame_ptr, &((FlatPtr*)stack_top)[0]).is_error())
                      frame_ptr = 0;
              } else {
                  void* fault_at;
                  if (!safe_memcpy(&frame_ptr, &((FlatPtr*)stack_top)[0], sizeof(FlatPtr), fault_at))
                      frame_ptr = 0;
              }

              ip = regs.ip();

              // TODO: We need to leave the scheduler lock here, but we also
              //       need to prevent the target thread from being run while
              //       we walk the stack
              lock.unlock();
              TRY(walk_stack(frame_ptr));
              break;
          }
          default:
              dbgln("Cannot capture stack trace for thread {} in state {}", thread, thread.state_string());
              break;
          }
      }
      return stack_trace;
  }
  576. ProcessorContainer& Processor::processors()
  577. {
  578. return s_processors;
  579. }
  580. Processor& Processor::by_id(u32 id)
  581. {
  582. return *s_processors[id];
  583. }
  584. void Processor::enter_trap(TrapFrame& trap, bool raise_irq)
  585. {
  586. VERIFY_INTERRUPTS_DISABLED();
  587. VERIFY(&Processor::current() == this);
  588. trap.prev_irq_level = m_in_irq;
  589. if (raise_irq)
  590. m_in_irq++;
  591. auto* current_thread = Processor::current_thread();
  592. if (current_thread) {
  593. auto& current_trap = current_thread->current_trap();
  594. trap.next_trap = current_trap;
  595. current_trap = &trap;
  596. // The cs register of this trap tells us where we will return back to
  597. auto new_previous_mode = ((trap.regs->cs & 3) != 0) ? Thread::PreviousMode::UserMode : Thread::PreviousMode::KernelMode;
  598. if (current_thread->set_previous_mode(new_previous_mode) && trap.prev_irq_level == 0) {
  599. current_thread->update_time_scheduled(Scheduler::current_time(), new_previous_mode == Thread::PreviousMode::KernelMode, false);
  600. }
  601. } else {
  602. trap.next_trap = nullptr;
  603. }
  604. }
  605. void Processor::exit_trap(TrapFrame& trap)
  606. {
  607. VERIFY_INTERRUPTS_DISABLED();
  608. VERIFY(&Processor::current() == this);
  609. // Temporarily enter a critical section. This is to prevent critical
  610. // sections entered and left within e.g. smp_process_pending_messages
  611. // to trigger a context switch while we're executing this function
  612. // See the comment at the end of the function why we don't use
  613. // ScopedCritical here.
  614. m_in_critical = m_in_critical + 1;
  615. VERIFY(m_in_irq >= trap.prev_irq_level);
  616. m_in_irq = trap.prev_irq_level;
  617. if (s_smp_enabled)
  618. smp_process_pending_messages();
  619. // Process the deferred call queue. Among other things, this ensures
  620. // that any pending thread unblocks happen before we enter the scheduler.
  621. deferred_call_execute_pending();
  622. auto* current_thread = Processor::current_thread();
  623. if (current_thread) {
  624. auto& current_trap = current_thread->current_trap();
  625. current_trap = trap.next_trap;
  626. Thread::PreviousMode new_previous_mode;
  627. if (current_trap) {
  628. VERIFY(current_trap->regs);
  629. // If we have another higher level trap then we probably returned
  630. // from an interrupt or irq handler. The cs register of the
  631. // new/higher level trap tells us what the mode prior to it was
  632. new_previous_mode = ((current_trap->regs->cs & 3) != 0) ? Thread::PreviousMode::UserMode : Thread::PreviousMode::KernelMode;
  633. } else {
  634. // If we don't have a higher level trap then we're back in user mode.
  635. // Which means that the previous mode prior to being back in user mode was kernel mode
  636. new_previous_mode = Thread::PreviousMode::KernelMode;
  637. }
  638. if (current_thread->set_previous_mode(new_previous_mode))
  639. current_thread->update_time_scheduled(Scheduler::current_time(), true, false);
  640. }
  641. VERIFY_INTERRUPTS_DISABLED();
  642. // Leave the critical section without actually enabling interrupts.
  643. // We don't want context switches to happen until we're explicitly
  644. // triggering a switch in check_invoke_scheduler.
  645. m_in_critical = m_in_critical - 1;
  646. if (!m_in_irq && !m_in_critical)
  647. check_invoke_scheduler();
  648. }
  649. void Processor::check_invoke_scheduler()
  650. {
  651. InterruptDisabler disabler;
  652. VERIFY(!m_in_irq);
  653. VERIFY(!m_in_critical);
  654. VERIFY(&Processor::current() == this);
  655. if (m_invoke_scheduler_async && m_scheduler_initialized) {
  656. m_invoke_scheduler_async = false;
  657. Scheduler::invoke_async();
  658. }
  659. }
  660. void Processor::flush_tlb_local(VirtualAddress vaddr, size_t page_count)
  661. {
  662. auto ptr = vaddr.as_ptr();
  663. while (page_count > 0) {
  664. // clang-format off
  665. asm volatile("invlpg %0"
  666. :
  667. : "m"(*ptr)
  668. : "memory");
  669. // clang-format on
  670. ptr += PAGE_SIZE;
  671. page_count--;
  672. }
  673. }
  674. void Processor::flush_tlb(Memory::PageDirectory const* page_directory, VirtualAddress vaddr, size_t page_count)
  675. {
  676. if (s_smp_enabled && (!Memory::is_user_address(vaddr) || Process::current().thread_count() > 1))
  677. smp_broadcast_flush_tlb(page_directory, vaddr, page_count);
  678. else
  679. flush_tlb_local(vaddr, page_count);
  680. }
  681. void Processor::smp_return_to_pool(ProcessorMessage& msg)
  682. {
  683. ProcessorMessage* next = nullptr;
  684. for (;;) {
  685. msg.next = next;
  686. if (s_message_pool.compare_exchange_strong(next, &msg, AK::MemoryOrder::memory_order_acq_rel))
  687. break;
  688. Processor::pause();
  689. }
  690. }
  691. ProcessorMessage& Processor::smp_get_from_pool()
  692. {
  693. ProcessorMessage* msg;
  694. // The assumption is that messages are never removed from the pool!
  695. for (;;) {
  696. msg = s_message_pool.load(AK::MemoryOrder::memory_order_consume);
  697. if (!msg) {
  698. if (!Processor::current().smp_process_pending_messages()) {
  699. Processor::pause();
  700. }
  701. continue;
  702. }
  703. // If another processor were to use this message in the meanwhile,
  704. // "msg" is still valid (because it never gets freed). We'd detect
  705. // this because the expected value "msg" and pool would
  706. // no longer match, and the compare_exchange will fail. But accessing
  707. // "msg->next" is always safe here.
  708. if (s_message_pool.compare_exchange_strong(msg, msg->next, AK::MemoryOrder::memory_order_acq_rel)) {
  709. // We successfully "popped" this available message
  710. break;
  711. }
  712. }
  713. VERIFY(msg != nullptr);
  714. return *msg;
  715. }
  716. u32 Processor::smp_wake_n_idle_processors(u32 wake_count)
  717. {
  718. VERIFY_INTERRUPTS_DISABLED();
  719. VERIFY(wake_count > 0);
  720. if (!s_smp_enabled)
  721. return 0;
  722. // Wake at most N - 1 processors
  723. if (wake_count >= Processor::count()) {
  724. wake_count = Processor::count() - 1;
  725. VERIFY(wake_count > 0);
  726. }
  727. u32 current_id = Processor::current_id();
  728. u32 did_wake_count = 0;
  729. auto& apic = APIC::the();
  730. while (did_wake_count < wake_count) {
  731. // Try to get a set of idle CPUs and flip them to busy
  732. u32 idle_mask = s_idle_cpu_mask.load(AK::MemoryOrder::memory_order_relaxed) & ~(1u << current_id);
  733. u32 idle_count = popcount(idle_mask);
  734. if (idle_count == 0)
  735. break; // No (more) idle processor available
  736. u32 found_mask = 0;
  737. for (u32 i = 0; i < idle_count; i++) {
  738. u32 cpu = bit_scan_forward(idle_mask) - 1;
  739. idle_mask &= ~(1u << cpu);
  740. found_mask |= 1u << cpu;
  741. }
  742. idle_mask = s_idle_cpu_mask.fetch_and(~found_mask, AK::MemoryOrder::memory_order_acq_rel) & found_mask;
  743. if (idle_mask == 0)
  744. continue; // All of them were flipped to busy, try again
  745. idle_count = popcount(idle_mask);
  746. for (u32 i = 0; i < idle_count; i++) {
  747. u32 cpu = bit_scan_forward(idle_mask) - 1;
  748. idle_mask &= ~(1u << cpu);
  749. // Send an IPI to that CPU to wake it up. There is a possibility
  750. // someone else woke it up as well, or that it woke up due to
  751. // a timer interrupt. But we tried hard to avoid this...
  752. apic.send_ipi(cpu);
  753. did_wake_count++;
  754. }
  755. }
  756. return did_wake_count;
  757. }
  758. UNMAP_AFTER_INIT void Processor::smp_enable()
  759. {
  760. size_t msg_pool_size = Processor::count() * 100u;
  761. size_t msg_entries_cnt = Processor::count();
  762. auto msgs = new ProcessorMessage[msg_pool_size];
  763. auto msg_entries = new ProcessorMessageEntry[msg_pool_size * msg_entries_cnt];
  764. size_t msg_entry_i = 0;
  765. for (size_t i = 0; i < msg_pool_size; i++, msg_entry_i += msg_entries_cnt) {
  766. auto& msg = msgs[i];
  767. msg.next = i < msg_pool_size - 1 ? &msgs[i + 1] : nullptr;
  768. msg.per_proc_entries = &msg_entries[msg_entry_i];
  769. for (size_t k = 0; k < msg_entries_cnt; k++)
  770. msg_entries[msg_entry_i + k].msg = &msg;
  771. }
  772. s_message_pool.store(&msgs[0], AK::MemoryOrder::memory_order_release);
  773. // Start sending IPI messages
  774. s_smp_enabled = true;
  775. }
  776. void Processor::smp_cleanup_message(ProcessorMessage& msg)
  777. {
  778. switch (msg.type) {
  779. case ProcessorMessage::Callback:
  780. msg.callback_value().~Function();
  781. break;
  782. default:
  783. break;
  784. }
  785. }
  786. bool Processor::smp_process_pending_messages()
  787. {
  788. VERIFY(s_smp_enabled);
  789. bool did_process = false;
  790. enter_critical();
  791. if (auto pending_msgs = m_message_queue.exchange(nullptr, AK::MemoryOrder::memory_order_acq_rel)) {
  792. // We pulled the stack of pending messages in LIFO order, so we need to reverse the list first
  793. auto reverse_list =
  794. [](ProcessorMessageEntry* list) -> ProcessorMessageEntry* {
  795. ProcessorMessageEntry* rev_list = nullptr;
  796. while (list) {
  797. auto next = list->next;
  798. list->next = rev_list;
  799. rev_list = list;
  800. list = next;
  801. }
  802. return rev_list;
  803. };
  804. pending_msgs = reverse_list(pending_msgs);
  805. // now process in the right order
  806. ProcessorMessageEntry* next_msg;
  807. for (auto cur_msg = pending_msgs; cur_msg; cur_msg = next_msg) {
  808. next_msg = cur_msg->next;
  809. auto msg = cur_msg->msg;
  810. dbgln_if(SMP_DEBUG, "SMP[{}]: Processing message {}", current_id(), VirtualAddress(msg));
  811. switch (msg->type) {
  812. case ProcessorMessage::Callback:
  813. msg->invoke_callback();
  814. break;
  815. case ProcessorMessage::FlushTlb:
  816. if (Memory::is_user_address(VirtualAddress(msg->flush_tlb.ptr))) {
  817. // We assume that we don't cross into kernel land!
  818. VERIFY(Memory::is_user_range(VirtualAddress(msg->flush_tlb.ptr), msg->flush_tlb.page_count * PAGE_SIZE));
  819. if (read_cr3() != msg->flush_tlb.page_directory->cr3()) {
  820. // This processor isn't using this page directory right now, we can ignore this request
  821. dbgln_if(SMP_DEBUG, "SMP[{}]: No need to flush {} pages at {}", current_id(), msg->flush_tlb.page_count, VirtualAddress(msg->flush_tlb.ptr));
  822. break;
  823. }
  824. }
  825. flush_tlb_local(VirtualAddress(msg->flush_tlb.ptr), msg->flush_tlb.page_count);
  826. break;
  827. }
  828. bool is_async = msg->async; // Need to cache this value *before* dropping the ref count!
  829. auto prev_refs = msg->refs.fetch_sub(1u, AK::MemoryOrder::memory_order_acq_rel);
  830. VERIFY(prev_refs != 0);
  831. if (prev_refs == 1) {
  832. // All processors handled this. If this is an async message,
  833. // we need to clean it up and return it to the pool
  834. if (is_async) {
  835. smp_cleanup_message(*msg);
  836. smp_return_to_pool(*msg);
  837. }
  838. }
  839. if (m_halt_requested.load(AK::MemoryOrder::memory_order_relaxed))
  840. halt_this();
  841. }
  842. did_process = true;
  843. } else if (m_halt_requested.load(AK::MemoryOrder::memory_order_relaxed)) {
  844. halt_this();
  845. }
  846. leave_critical();
  847. return did_process;
  848. }
  849. bool Processor::smp_enqueue_message(ProcessorMessage& msg)
  850. {
  851. // Note that it's quite possible that the other processor may pop
  852. // the queue at any given time. We rely on the fact that the messages
  853. // are pooled and never get freed!
  854. auto& msg_entry = msg.per_proc_entries[id()];
  855. VERIFY(msg_entry.msg == &msg);
  856. ProcessorMessageEntry* next = nullptr;
  857. for (;;) {
  858. msg_entry.next = next;
  859. if (m_message_queue.compare_exchange_strong(next, &msg_entry, AK::MemoryOrder::memory_order_acq_rel))
  860. break;
  861. Processor::pause();
  862. }
  863. // If the enqueued message was the only message in the queue when posted,
  864. // we return true. This is used by callers when deciding whether to generate an IPI.
  865. return next == nullptr;
  866. }
  867. void Processor::smp_broadcast_message(ProcessorMessage& msg)
  868. {
  869. auto& current_processor = Processor::current();
  870. dbgln_if(SMP_DEBUG, "SMP[{}]: Broadcast message {} to cpus: {} processor: {}", current_processor.id(), VirtualAddress(&msg), count(), VirtualAddress(&current_processor));
  871. msg.refs.store(count() - 1, AK::MemoryOrder::memory_order_release);
  872. VERIFY(msg.refs > 0);
  873. bool need_broadcast = false;
  874. for_each(
  875. [&](Processor& proc) {
  876. if (&proc != &current_processor) {
  877. if (proc.smp_enqueue_message(msg))
  878. need_broadcast = true;
  879. }
  880. });
  881. // Now trigger an IPI on all other APs (unless all targets already had messages queued)
  882. if (need_broadcast)
  883. APIC::the().broadcast_ipi();
  884. }
  885. void Processor::smp_broadcast_wait_sync(ProcessorMessage& msg)
  886. {
  887. auto& cur_proc = Processor::current();
  888. VERIFY(!msg.async);
  889. // If synchronous then we must cleanup and return the message back
  890. // to the pool. Otherwise, the last processor to complete it will return it
  891. while (msg.refs.load(AK::MemoryOrder::memory_order_consume) != 0) {
  892. Processor::pause();
  893. // We need to process any messages that may have been sent to
  894. // us while we're waiting. This also checks if another processor
  895. // may have requested us to halt.
  896. cur_proc.smp_process_pending_messages();
  897. }
  898. smp_cleanup_message(msg);
  899. smp_return_to_pool(msg);
  900. }
  901. void Processor::smp_unicast_message(u32 cpu, ProcessorMessage& msg, bool async)
  902. {
  903. auto& current_processor = Processor::current();
  904. VERIFY(cpu != current_processor.id());
  905. auto& target_processor = processors()[cpu];
  906. msg.async = async;
  907. dbgln_if(SMP_DEBUG, "SMP[{}]: Send message {} to cpu #{} processor: {}", current_processor.id(), VirtualAddress(&msg), cpu, VirtualAddress(&target_processor));
  908. msg.refs.store(1u, AK::MemoryOrder::memory_order_release);
  909. if (target_processor->smp_enqueue_message(msg)) {
  910. APIC::the().send_ipi(cpu);
  911. }
  912. if (!async) {
  913. // If synchronous then we must cleanup and return the message back
  914. // to the pool. Otherwise, the last processor to complete it will return it
  915. while (msg.refs.load(AK::MemoryOrder::memory_order_consume) != 0) {
  916. Processor::pause();
  917. // We need to process any messages that may have been sent to
  918. // us while we're waiting. This also checks if another processor
  919. // may have requested us to halt.
  920. current_processor.smp_process_pending_messages();
  921. }
  922. smp_cleanup_message(msg);
  923. smp_return_to_pool(msg);
  924. }
  925. }
  926. void Processor::smp_unicast(u32 cpu, Function<void()> callback, bool async)
  927. {
  928. auto& msg = smp_get_from_pool();
  929. msg.type = ProcessorMessage::Callback;
  930. new (msg.callback_storage) ProcessorMessage::CallbackFunction(move(callback));
  931. smp_unicast_message(cpu, msg, async);
  932. }
  933. void Processor::smp_broadcast_flush_tlb(Memory::PageDirectory const* page_directory, VirtualAddress vaddr, size_t page_count)
  934. {
  935. auto& msg = smp_get_from_pool();
  936. msg.async = false;
  937. msg.type = ProcessorMessage::FlushTlb;
  938. msg.flush_tlb.page_directory = page_directory;
  939. msg.flush_tlb.ptr = vaddr.as_ptr();
  940. msg.flush_tlb.page_count = page_count;
  941. smp_broadcast_message(msg);
  942. // While the other processors handle this request, we'll flush ours
  943. flush_tlb_local(vaddr, page_count);
  944. // Now wait until everybody is done as well
  945. smp_broadcast_wait_sync(msg);
  946. }
  947. void Processor::smp_broadcast_halt()
  948. {
  949. // We don't want to use a message, because this could have been triggered
  950. // by being out of memory and we might not be able to get a message
  951. for_each(
  952. [&](Processor& proc) {
  953. proc.m_halt_requested.store(true, AK::MemoryOrder::memory_order_release);
  954. });
  955. // Now trigger an IPI on all other APs
  956. APIC::the().broadcast_ipi();
  957. }
  958. void Processor::Processor::halt()
  959. {
  960. if (s_smp_enabled)
  961. smp_broadcast_halt();
  962. halt_this();
  963. }
  964. UNMAP_AFTER_INIT void Processor::deferred_call_pool_init()
  965. {
  966. size_t pool_count = sizeof(m_deferred_call_pool) / sizeof(m_deferred_call_pool[0]);
  967. for (size_t i = 0; i < pool_count; i++) {
  968. auto& entry = m_deferred_call_pool[i];
  969. entry.next = i < pool_count - 1 ? &m_deferred_call_pool[i + 1] : nullptr;
  970. new (entry.handler_storage) DeferredCallEntry::HandlerFunction;
  971. entry.was_allocated = false;
  972. }
  973. m_pending_deferred_calls = nullptr;
  974. m_free_deferred_call_pool_entry = &m_deferred_call_pool[0];
  975. }
  976. void Processor::deferred_call_return_to_pool(DeferredCallEntry* entry)
  977. {
  978. VERIFY(m_in_critical);
  979. VERIFY(!entry->was_allocated);
  980. entry->handler_value() = {};
  981. entry->next = m_free_deferred_call_pool_entry;
  982. m_free_deferred_call_pool_entry = entry;
  983. }
  984. DeferredCallEntry* Processor::deferred_call_get_free()
  985. {
  986. VERIFY(m_in_critical);
  987. if (m_free_deferred_call_pool_entry) {
  988. // Fast path, we have an entry in our pool
  989. auto* entry = m_free_deferred_call_pool_entry;
  990. m_free_deferred_call_pool_entry = entry->next;
  991. VERIFY(!entry->was_allocated);
  992. return entry;
  993. }
  994. auto* entry = new DeferredCallEntry;
  995. new (entry->handler_storage) DeferredCallEntry::HandlerFunction;
  996. entry->was_allocated = true;
  997. return entry;
  998. }
  999. void Processor::deferred_call_execute_pending()
  1000. {
  1001. VERIFY(m_in_critical);
  1002. if (!m_pending_deferred_calls)
  1003. return;
  1004. auto* pending_list = m_pending_deferred_calls;
  1005. m_pending_deferred_calls = nullptr;
  1006. // We pulled the stack of pending deferred calls in LIFO order, so we need to reverse the list first
  1007. auto reverse_list =
  1008. [](DeferredCallEntry* list) -> DeferredCallEntry* {
  1009. DeferredCallEntry* rev_list = nullptr;
  1010. while (list) {
  1011. auto next = list->next;
  1012. list->next = rev_list;
  1013. rev_list = list;
  1014. list = next;
  1015. }
  1016. return rev_list;
  1017. };
  1018. pending_list = reverse_list(pending_list);
  1019. do {
  1020. pending_list->invoke_handler();
  1021. // Return the entry back to the pool, or free it
  1022. auto* next = pending_list->next;
  1023. if (pending_list->was_allocated) {
  1024. pending_list->handler_value().~Function();
  1025. delete pending_list;
  1026. } else
  1027. deferred_call_return_to_pool(pending_list);
  1028. pending_list = next;
  1029. } while (pending_list);
  1030. }
  1031. void Processor::deferred_call_queue_entry(DeferredCallEntry* entry)
  1032. {
  1033. VERIFY(m_in_critical);
  1034. entry->next = m_pending_deferred_calls;
  1035. m_pending_deferred_calls = entry;
  1036. }
  1037. void Processor::deferred_call_queue(Function<void()> callback)
  1038. {
  1039. // NOTE: If we are called outside of a critical section and outside
  1040. // of an irq handler, the function will be executed before we return!
  1041. ScopedCritical critical;
  1042. auto& cur_proc = Processor::current();
  1043. auto* entry = cur_proc.deferred_call_get_free();
  1044. entry->handler_value() = move(callback);
  1045. cur_proc.deferred_call_queue_entry(entry);
  1046. }
  1047. UNMAP_AFTER_INIT void Processor::gdt_init()
  1048. {
  1049. m_gdt_length = 0;
  1050. m_gdtr.address = nullptr;
  1051. m_gdtr.limit = 0;
  1052. write_raw_gdt_entry(0x0000, 0x00000000, 0x00000000);
  1053. #if ARCH(I386)
  1054. write_raw_gdt_entry(GDT_SELECTOR_CODE0, 0x0000ffff, 0x00cf9a00); // code0
  1055. write_raw_gdt_entry(GDT_SELECTOR_DATA0, 0x0000ffff, 0x00cf9200); // data0
  1056. write_raw_gdt_entry(GDT_SELECTOR_CODE3, 0x0000ffff, 0x00cffa00); // code3
  1057. write_raw_gdt_entry(GDT_SELECTOR_DATA3, 0x0000ffff, 0x00cff200); // data3
  1058. #else
  1059. write_raw_gdt_entry(GDT_SELECTOR_CODE0, 0x0000ffff, 0x00af9a00); // code0
  1060. write_raw_gdt_entry(GDT_SELECTOR_DATA0, 0x0000ffff, 0x00af9200); // data0
  1061. write_raw_gdt_entry(GDT_SELECTOR_DATA3, 0x0000ffff, 0x008ff200); // data3
  1062. write_raw_gdt_entry(GDT_SELECTOR_CODE3, 0x0000ffff, 0x00affa00); // code3
  1063. #endif
  1064. #if ARCH(I386)
  1065. Descriptor tls_descriptor {};
  1066. tls_descriptor.low = tls_descriptor.high = 0;
  1067. tls_descriptor.dpl = 3;
  1068. tls_descriptor.segment_present = 1;
  1069. tls_descriptor.granularity = 0;
  1070. tls_descriptor.operation_size64 = 0;
  1071. tls_descriptor.operation_size32 = 1;
  1072. tls_descriptor.descriptor_type = 1;
  1073. tls_descriptor.type = 2;
  1074. write_gdt_entry(GDT_SELECTOR_TLS, tls_descriptor); // tls3
  1075. Descriptor gs_descriptor {};
  1076. gs_descriptor.set_base(VirtualAddress { this });
  1077. gs_descriptor.set_limit(sizeof(Processor) - 1);
  1078. gs_descriptor.dpl = 0;
  1079. gs_descriptor.segment_present = 1;
  1080. gs_descriptor.granularity = 0;
  1081. gs_descriptor.operation_size64 = 0;
  1082. gs_descriptor.operation_size32 = 1;
  1083. gs_descriptor.descriptor_type = 1;
  1084. gs_descriptor.type = 2;
  1085. write_gdt_entry(GDT_SELECTOR_PROC, gs_descriptor); // gs0
  1086. #endif
  1087. Descriptor tss_descriptor {};
  1088. tss_descriptor.set_base(VirtualAddress { (size_t)&m_tss & 0xffffffff });
  1089. tss_descriptor.set_limit(sizeof(TSS) - 1);
  1090. tss_descriptor.dpl = 0;
  1091. tss_descriptor.segment_present = 1;
  1092. tss_descriptor.granularity = 0;
  1093. tss_descriptor.operation_size64 = 0;
  1094. tss_descriptor.operation_size32 = 1;
  1095. tss_descriptor.descriptor_type = 0;
  1096. tss_descriptor.type = Descriptor::SystemType::AvailableTSS;
  1097. write_gdt_entry(GDT_SELECTOR_TSS, tss_descriptor); // tss
  1098. #if ARCH(X86_64)
  1099. Descriptor tss_descriptor_part2 {};
  1100. tss_descriptor_part2.low = (size_t)&m_tss >> 32;
  1101. write_gdt_entry(GDT_SELECTOR_TSS_PART2, tss_descriptor_part2);
  1102. #endif
  1103. flush_gdt();
  1104. load_task_register(GDT_SELECTOR_TSS);
  1105. #if ARCH(X86_64)
  1106. MSR gs_base(MSR_GS_BASE);
  1107. gs_base.set((u64)this);
  1108. #else
  1109. asm volatile(
  1110. "mov %%ax, %%ds\n"
  1111. "mov %%ax, %%es\n"
  1112. "mov %%ax, %%fs\n"
  1113. "mov %%ax, %%ss\n" ::"a"(GDT_SELECTOR_DATA0)
  1114. : "memory");
  1115. set_gs(GDT_SELECTOR_PROC);
  1116. #endif
  1117. #if ARCH(I386)
  1118. // Make sure CS points to the kernel code descriptor.
  1119. // clang-format off
  1120. asm volatile(
  1121. "ljmpl $" __STRINGIFY(GDT_SELECTOR_CODE0) ", $sanity\n"
  1122. "sanity:\n");
  1123. // clang-format on
  1124. #endif
  1125. }
  1126. extern "C" void context_first_init([[maybe_unused]] Thread* from_thread, [[maybe_unused]] Thread* to_thread, [[maybe_unused]] TrapFrame* trap)
  1127. {
  1128. VERIFY(!are_interrupts_enabled());
  1129. VERIFY(is_kernel_mode());
  1130. dbgln_if(CONTEXT_SWITCH_DEBUG, "switch_context <-- from {} {} to {} {} (context_first_init)", VirtualAddress(from_thread), *from_thread, VirtualAddress(to_thread), *to_thread);
  1131. VERIFY(to_thread == Thread::current());
  1132. Scheduler::enter_current(*from_thread);
  1133. auto in_critical = to_thread->saved_critical();
  1134. VERIFY(in_critical > 0);
  1135. Processor::restore_in_critical(in_critical);
  1136. // Since we got here and don't have Scheduler::context_switch in the
  1137. // call stack (because this is the first time we switched into this
  1138. // context), we need to notify the scheduler so that it can release
  1139. // the scheduler lock. We don't want to enable interrupts at this point
  1140. // as we're still in the middle of a context switch. Doing so could
  1141. // trigger a context switch within a context switch, leading to a crash.
  1142. FlatPtr flags = trap->regs->flags();
  1143. Scheduler::leave_on_first_switch(flags & ~0x200);
  1144. }
  1145. extern "C" void enter_thread_context(Thread* from_thread, Thread* to_thread)
  1146. {
  1147. VERIFY(from_thread == to_thread || from_thread->state() != Thread::State::Running);
  1148. VERIFY(to_thread->state() == Thread::State::Running);
  1149. bool has_fxsr = Processor::current().has_feature(CPUFeature::FXSR);
  1150. Processor::set_current_thread(*to_thread);
  1151. auto& from_regs = from_thread->regs();
  1152. auto& to_regs = to_thread->regs();
  1153. // NOTE: IOPL should never be non-zero in any situation, so let's panic immediately
  1154. // instead of carrying on with elevated I/O privileges.
  1155. VERIFY(get_iopl_from_eflags(to_regs.flags()) == 0);
  1156. if (has_fxsr)
  1157. asm volatile("fxsave %0"
  1158. : "=m"(from_thread->fpu_state()));
  1159. else
  1160. asm volatile("fnsave %0"
  1161. : "=m"(from_thread->fpu_state()));
  1162. #if ARCH(I386)
  1163. from_regs.fs = get_fs();
  1164. from_regs.gs = get_gs();
  1165. set_fs(to_regs.fs);
  1166. set_gs(to_regs.gs);
  1167. #endif
  1168. if (from_thread->process().is_traced())
  1169. read_debug_registers_into(from_thread->debug_register_state());
  1170. if (to_thread->process().is_traced()) {
  1171. write_debug_registers_from(to_thread->debug_register_state());
  1172. } else {
  1173. clear_debug_registers();
  1174. }
  1175. auto& processor = Processor::current();
  1176. #if ARCH(I386)
  1177. auto& tls_descriptor = processor.get_gdt_entry(GDT_SELECTOR_TLS);
  1178. tls_descriptor.set_base(to_thread->thread_specific_data());
  1179. tls_descriptor.set_limit(to_thread->thread_specific_region_size());
  1180. #else
  1181. MSR fs_base_msr(MSR_FS_BASE);
  1182. fs_base_msr.set(to_thread->thread_specific_data().get());
  1183. #endif
  1184. if (from_regs.cr3 != to_regs.cr3)
  1185. write_cr3(to_regs.cr3);
  1186. to_thread->set_cpu(processor.id());
  1187. auto in_critical = to_thread->saved_critical();
  1188. VERIFY(in_critical > 0);
  1189. Processor::restore_in_critical(in_critical);
  1190. if (has_fxsr)
  1191. asm volatile("fxrstor %0" ::"m"(to_thread->fpu_state()));
  1192. else
  1193. asm volatile("frstor %0" ::"m"(to_thread->fpu_state()));
  1194. }
  1195. extern "C" FlatPtr do_init_context(Thread* thread, u32 flags)
  1196. {
  1197. VERIFY_INTERRUPTS_DISABLED();
  1198. thread->regs().set_flags(flags);
  1199. return Processor::current().init_context(*thread, true);
  1200. }
  1201. void Processor::assume_context(Thread& thread, FlatPtr flags)
  1202. {
  1203. dbgln_if(CONTEXT_SWITCH_DEBUG, "Assume context for thread {} {}", VirtualAddress(&thread), thread);
  1204. VERIFY_INTERRUPTS_DISABLED();
  1205. Scheduler::prepare_after_exec();
  1206. // in_critical() should be 2 here. The critical section in Process::exec
  1207. // and then the scheduler lock
  1208. VERIFY(Processor::in_critical() == 2);
  1209. do_assume_context(&thread, flags);
  1210. VERIFY_NOT_REACHED();
  1211. }
  1212. u64 Processor::time_spent_idle() const
  1213. {
  1214. return m_idle_thread->time_in_user() + m_idle_thread->time_in_kernel();
  1215. }
  1216. }