/*
 * Copyright (c) 2018-2021, Andreas Kling <kling@serenityos.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <AK/Format.h>
#include <AK/StdLibExtras.h>
#include <AK/String.h>
#include <AK/Types.h>

#include <Kernel/Interrupts/APIC.h>
#include <Kernel/Process.h>
#include <Kernel/Random.h>
#include <Kernel/Sections.h>
#include <Kernel/StdLib.h>
#include <Kernel/Thread.h>
#include <Kernel/VM/ProcessPagingScope.h>

#include <Kernel/Arch/x86/CPUID.h>
#include <Kernel/Arch/x86/Interrupts.h>
#include <Kernel/Arch/x86/Processor.h>
#include <Kernel/Arch/x86/ProcessorInfo.h>
#include <Kernel/Arch/x86/SafeMem.h>
#include <Kernel/Arch/x86/ScopedCritical.h>
#include <Kernel/Arch/x86/TrapFrame.h>

namespace Kernel {

READONLY_AFTER_INIT FPUState Processor::s_clean_fpu_state;
READONLY_AFTER_INIT static ProcessorContainer s_processors {};
READONLY_AFTER_INIT volatile u32 Processor::g_total_processors;
static volatile bool s_smp_enabled;

static volatile ProcessorMessage* s_message_pool;
Atomic<u32> Processor::s_idle_cpu_mask { 0 };

extern "C" void thread_context_first_enter(void);
extern "C" void exit_kernel_thread(void);
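
// sse_init() enables SSE support: clearing CR0.EM (bit 2, via the 0xfffffffb mask) and
// setting CR0.MP (bit 1) tells the CPU to handle FPU/SSE instructions natively, while
// CR4 |= 0x600 sets OSFXSR (bit 9) and OSXMMEXCPT (bit 10) to advertise OS support for
// FXSAVE/FXRSTOR and unmasked SIMD floating-point exceptions.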
UNMAP_AFTER_INIT static void sse_init()
{
    write_cr0((read_cr0() & 0xfffffffbu) | 0x2);
    write_cr4(read_cr4() | 0x600);
}

void exit_kernel_thread(void)
{
    Thread::current()->exit();
}

UNMAP_AFTER_INIT void Processor::cpu_detect()
{
    // NOTE: This is called during Processor::early_initialize, we cannot
    //       safely log at this point because we don't have kmalloc
    //       initialized yet!
    auto set_feature =
        [&](CPUFeature f) {
            m_features = static_cast<CPUFeature>(static_cast<u32>(m_features) | static_cast<u32>(f));
        };
    m_features = static_cast<CPUFeature>(0);

    CPUID processor_info(0x1);
    if (processor_info.edx() & (1 << 4))
        set_feature(CPUFeature::TSC);
    if (processor_info.edx() & (1 << 6))
        set_feature(CPUFeature::PAE);
    if (processor_info.edx() & (1 << 13))
        set_feature(CPUFeature::PGE);
    if (processor_info.edx() & (1 << 23))
        set_feature(CPUFeature::MMX);
    if (processor_info.edx() & (1 << 24))
        set_feature(CPUFeature::FXSR);
    if (processor_info.edx() & (1 << 25))
        set_feature(CPUFeature::SSE);
    if (processor_info.edx() & (1 << 26))
        set_feature(CPUFeature::SSE2);
    if (processor_info.ecx() & (1 << 0))
        set_feature(CPUFeature::SSE3);
    if (processor_info.ecx() & (1 << 9))
        set_feature(CPUFeature::SSSE3);
    if (processor_info.ecx() & (1 << 19))
        set_feature(CPUFeature::SSE4_1);
    if (processor_info.ecx() & (1 << 20))
        set_feature(CPUFeature::SSE4_2);
    if (processor_info.ecx() & (1 << 26))
        set_feature(CPUFeature::XSAVE);
    if (processor_info.ecx() & (1 << 28))
        set_feature(CPUFeature::AVX);
    if (processor_info.ecx() & (1 << 30))
        set_feature(CPUFeature::RDRAND);
    if (processor_info.edx() & (1 << 11)) {
        u32 stepping = processor_info.eax() & 0xf;
        u32 model = (processor_info.eax() >> 4) & 0xf;
        u32 family = (processor_info.eax() >> 8) & 0xf;
        if (!(family == 6 && model < 3 && stepping < 3))
            set_feature(CPUFeature::SEP);
        if ((family == 6 && model >= 3) || (family == 0xf && model >= 0xe))
            set_feature(CPUFeature::CONSTANT_TSC);
    }

    u32 max_extended_leaf = CPUID(0x80000000).eax();

    if (max_extended_leaf >= 0x80000001) {
        CPUID extended_processor_info(0x80000001);
        if (extended_processor_info.edx() & (1 << 20))
            set_feature(CPUFeature::NX);
        if (extended_processor_info.edx() & (1 << 27))
            set_feature(CPUFeature::RDTSCP);
        if (extended_processor_info.edx() & (1 << 11)) {
            // Only available in 64 bit mode
            set_feature(CPUFeature::SYSCALL);
        }
    }

    if (max_extended_leaf >= 0x80000007) {
        CPUID cpuid(0x80000007);
        if (cpuid.edx() & (1 << 8)) {
            set_feature(CPUFeature::CONSTANT_TSC);
            set_feature(CPUFeature::NONSTOP_TSC);
        }
    }

    if (max_extended_leaf >= 0x80000008) {
        // CPUID.80000008H:EAX[7:0] reports the physical-address width supported by the processor.
        CPUID cpuid(0x80000008);
        m_physical_address_bit_width = cpuid.eax() & 0xff;
    } else {
        // For processors that do not support CPUID function 80000008H, the width is generally 36 if CPUID.01H:EDX.PAE [bit 6] = 1 and 32 otherwise.
        m_physical_address_bit_width = has_feature(CPUFeature::PAE) ? 36 : 32;
    }

    CPUID extended_features(0x7);
    if (extended_features.ebx() & (1 << 20))
        set_feature(CPUFeature::SMAP);
    if (extended_features.ebx() & (1 << 7))
        set_feature(CPUFeature::SMEP);
    if (extended_features.ecx() & (1 << 2))
        set_feature(CPUFeature::UMIP);
    if (extended_features.ebx() & (1 << 18))
        set_feature(CPUFeature::RDSEED);
}
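
// cpu_setup() flips the control register bits for the features detected above. For
// reference, the magic constants below are: CR0.WP = bit 16 (0x00010000),
// CR4.PGE = bit 7 (0x80), CR4.TSD = bit 2 (0x4), CR4.UMIP = bit 11 (0x800),
// CR4.OSXSAVE = bit 18 (0x40000), CR4.SMEP = bit 20 (0x100000), CR4.SMAP = bit 21
// (0x200000), and IA32_EFER is MSR 0xc0000080 with NXE at bit 11 (0x800).
// Setting CR4.TSD presumably restricts RDTSC to ring 0.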
UNMAP_AFTER_INIT void Processor::cpu_setup()
{
    // NOTE: This is called during Processor::early_initialize, we cannot
    //       safely log at this point because we don't have kmalloc
    //       initialized yet!
    cpu_detect();

    if (has_feature(CPUFeature::SSE)) {
        // enter_thread_context() assumes that if an x86 CPU supports SSE then it also supports FXSR.
        // SSE support without FXSR is an extremely unlikely scenario, so let's be pragmatic about it.
        VERIFY(has_feature(CPUFeature::FXSR));
        sse_init();
    }

    write_cr0(read_cr0() | 0x00010000);

    if (has_feature(CPUFeature::PGE)) {
        // Turn on CR4.PGE so the CPU will respect the G bit in page tables.
        write_cr4(read_cr4() | 0x80);
    }

    if (has_feature(CPUFeature::NX)) {
        // Turn on IA32_EFER.NXE
        asm volatile(
            "movl $0xc0000080, %ecx\n"
            "rdmsr\n"
            "orl $0x800, %eax\n"
            "wrmsr\n");
    }

    if (has_feature(CPUFeature::SMEP)) {
        // Turn on CR4.SMEP
        write_cr4(read_cr4() | 0x100000);
    }

    if (has_feature(CPUFeature::SMAP)) {
        // Turn on CR4.SMAP
        write_cr4(read_cr4() | 0x200000);
    }

    if (has_feature(CPUFeature::UMIP)) {
        write_cr4(read_cr4() | 0x800);
    }

    if (has_feature(CPUFeature::TSC)) {
        write_cr4(read_cr4() | 0x4);
    }

    if (has_feature(CPUFeature::XSAVE)) {
        // Turn on CR4.OSXSAVE
        write_cr4(read_cr4() | 0x40000);

        // According to the Intel manual: "After reset, all bits (except bit 0) in XCR0 are cleared to zero; XCR0[0] is set to 1."
        // Sadly we can't trust this, for example VirtualBox starts with bits 0-4 set, so let's do it ourselves.
        write_xcr0(0x1);

        if (has_feature(CPUFeature::AVX)) {
            // Turn on SSE, AVX and x87 flags
            write_xcr0(read_xcr0() | 0x7);
        }
    }
}

String Processor::features_string() const
{
    StringBuilder builder;
    auto feature_to_str =
        [](CPUFeature f) -> const char* {
            switch (f) {
            case CPUFeature::NX:
                return "nx";
            case CPUFeature::PAE:
                return "pae";
            case CPUFeature::PGE:
                return "pge";
            case CPUFeature::RDRAND:
                return "rdrand";
            case CPUFeature::RDSEED:
                return "rdseed";
            case CPUFeature::SMAP:
                return "smap";
            case CPUFeature::SMEP:
                return "smep";
            case CPUFeature::SSE:
                return "sse";
            case CPUFeature::TSC:
                return "tsc";
            case CPUFeature::RDTSCP:
                return "rdtscp";
            case CPUFeature::CONSTANT_TSC:
                return "constant_tsc";
            case CPUFeature::NONSTOP_TSC:
                return "nonstop_tsc";
            case CPUFeature::UMIP:
                return "umip";
            case CPUFeature::SEP:
                return "sep";
            case CPUFeature::SYSCALL:
                return "syscall";
            case CPUFeature::MMX:
                return "mmx";
            case CPUFeature::FXSR:
                return "fxsr";
            case CPUFeature::SSE2:
                return "sse2";
            case CPUFeature::SSE3:
                return "sse3";
            case CPUFeature::SSSE3:
                return "ssse3";
            case CPUFeature::SSE4_1:
                return "sse4.1";
            case CPUFeature::SSE4_2:
                return "sse4.2";
            case CPUFeature::XSAVE:
                return "xsave";
            case CPUFeature::AVX:
                return "avx";
                // Intentionally no default case here, so that we get a compiler
                // warning if we forget to add a new feature to this switch.
            }
            // Shouldn't ever happen
            return "???";
        };
    bool first = true;
    for (u32 flag = 1; flag != 0; flag <<= 1) {
        if ((static_cast<u32>(m_features) & flag) != 0) {
            if (first)
                first = false;
            else
                builder.append(' ');
            auto str = feature_to_str(static_cast<CPUFeature>(flag));
            builder.append(str, strlen(str));
        }
    }
    return builder.build();
}
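
// early_initialize() runs very early on each CPU, before kmalloc and the scheduler are
// available: it resets all per-processor state, keeps a running count of processors in
// g_total_processors, and sets up the deferred call pool, CPU features and the GDT.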
UNMAP_AFTER_INIT void Processor::early_initialize(u32 cpu)
{
    m_self = this;

    m_cpu = cpu;
    m_in_irq = 0;
    m_in_critical = 0;

    m_invoke_scheduler_async = false;
    m_scheduler_initialized = false;

    m_message_queue = nullptr;
    m_idle_thread = nullptr;
    m_current_thread = nullptr;
    m_scheduler_data = nullptr;
    m_mm_data = nullptr;
    m_info = nullptr;

    m_halt_requested = false;
    if (cpu == 0) {
        s_smp_enabled = false;
        atomic_store(&g_total_processors, 1u, AK::MemoryOrder::memory_order_release);
    } else {
        atomic_fetch_add(&g_total_processors, 1u, AK::MemoryOrder::memory_order_acq_rel);
    }

    deferred_call_pool_init();

    cpu_setup();
    gdt_init();

    VERIFY(is_initialized());   // sanity check
    VERIFY(&current() == this); // sanity check
}

UNMAP_AFTER_INIT void Processor::initialize(u32 cpu)
{
    VERIFY(m_self == this);
    VERIFY(&current() == this); // sanity check

    dmesgln("CPU[{}]: Supported features: {}", id(), features_string());
    if (!has_feature(CPUFeature::RDRAND))
        dmesgln("CPU[{}]: No RDRAND support detected, randomness will be poor", id());
    dmesgln("CPU[{}]: Physical address bit width: {}", id(), m_physical_address_bit_width);

    if (cpu == 0)
        idt_init();
    else
        flush_idt();

    if (cpu == 0) {
        VERIFY((FlatPtr(&s_clean_fpu_state) & 0xF) == 0);
        asm volatile("fninit");
        if (has_feature(CPUFeature::FXSR))
            asm volatile("fxsave %0"
                         : "=m"(s_clean_fpu_state));
        else
            asm volatile("fnsave %0"
                         : "=m"(s_clean_fpu_state));
    }

    m_info = new ProcessorInfo(*this);

    {
        // We need to prevent races between APs starting up at the same time
        VERIFY(cpu < s_processors.size());
        s_processors[cpu] = this;
    }
}
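
// A segment selector is (index << 3) | TI | RPL, so masking off the low bits and
// shifting right by 3 turns it into an index into the table of 8-byte descriptors.
// The GDTR limit is the size of the table in bytes minus one, which is why it gets
// recomputed whenever a new highest entry is written.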
void Processor::write_raw_gdt_entry(u16 selector, u32 low, u32 high)
{
    u16 i = (selector & 0xfffc) >> 3;
    u32 prev_gdt_length = m_gdt_length;

    if (i >= m_gdt_length) {
        m_gdt_length = i + 1;
        VERIFY(m_gdt_length <= sizeof(m_gdt) / sizeof(m_gdt[0]));
        m_gdtr.limit = (m_gdt_length + 1) * 8 - 1;
    }
    m_gdt[i].low = low;
    m_gdt[i].high = high;

    // Clear any entries between the previous end of the table and the entry we just
    // wrote, so that skipped selectors don't contain stale descriptors.
    for (auto j = prev_gdt_length; j < i; ++j) {
        m_gdt[j].low = 0;
        m_gdt[j].high = 0;
    }
}

void Processor::write_gdt_entry(u16 selector, Descriptor& descriptor)
{
    write_raw_gdt_entry(selector, descriptor.low, descriptor.high);
}

Descriptor& Processor::get_gdt_entry(u16 selector)
{
    u16 i = (selector & 0xfffc) >> 3;
    return *(Descriptor*)(&m_gdt[i]);
}

void Processor::flush_gdt()
{
    m_gdtr.address = m_gdt;
    m_gdtr.limit = (m_gdt_length * 8) - 1;
    asm volatile("lgdt %0" ::"m"(m_gdtr)
                 : "memory");
}

const DescriptorTablePointer& Processor::get_gdtr()
{
    return m_gdtr;
}
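
// capture_stack_trace() walks the saved-ebp chain: every frame starts with the caller's
// frame pointer followed by the return address, so walk_stack() reads two FlatPtrs per
// frame (via copy_from_user() or safe_memcpy(), depending on whether the pointer is a
// user or kernel address). Three cases are handled below: the calling thread itself,
// a thread currently running on another processor (captured over a synchronous SMP
// message), and a thread that isn't running (frame pointer taken from its saved TSS).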
Vector<FlatPtr> Processor::capture_stack_trace(Thread& thread, size_t max_frames)
{
    FlatPtr frame_ptr = 0, eip = 0;
    Vector<FlatPtr, 32> stack_trace;

    auto walk_stack = [&](FlatPtr stack_ptr) {
        static constexpr size_t max_stack_frames = 4096;
        stack_trace.append(eip);
        size_t count = 1;
        while (stack_ptr && stack_trace.size() < max_stack_frames) {
            FlatPtr retaddr;

            count++;
            if (max_frames != 0 && count > max_frames)
                break;

            if (is_user_range(VirtualAddress(stack_ptr), sizeof(FlatPtr) * 2)) {
                if (!copy_from_user(&retaddr, &((FlatPtr*)stack_ptr)[1]) || !retaddr)
                    break;
                stack_trace.append(retaddr);
                if (!copy_from_user(&stack_ptr, (FlatPtr*)stack_ptr))
                    break;
            } else {
                void* fault_at;
                if (!safe_memcpy(&retaddr, &((FlatPtr*)stack_ptr)[1], sizeof(FlatPtr), fault_at) || !retaddr)
                    break;
                stack_trace.append(retaddr);
                if (!safe_memcpy(&stack_ptr, (FlatPtr*)stack_ptr, sizeof(FlatPtr), fault_at))
                    break;
            }
        }
    };
    auto capture_current_thread = [&]() {
        frame_ptr = (FlatPtr)__builtin_frame_address(0);
        eip = (FlatPtr)__builtin_return_address(0);

        walk_stack(frame_ptr);
    };

    // Since the thread may be running on another processor, there is a chance
    // a context switch happens while we're trying to capture it. The trace also
    // won't be entirely accurate: it merely reflects the state of the thread at
    // its last context switch.
    ScopedSpinLock lock(g_scheduler_lock);
    if (&thread == Processor::current_thread()) {
        VERIFY(thread.state() == Thread::Running);
        // Leave the scheduler lock. If we trigger page faults we may
        // need to be preempted. Since this is our own thread it won't
        // cause any problems as the stack won't change below this frame.
        lock.unlock();
        capture_current_thread();
    } else if (thread.is_active()) {
        VERIFY(thread.cpu() != Processor::id());
        // If this is the case, the thread is currently running
        // on another processor. We can't trust the kernel stack as
        // it may be changing at any time. We need to send an IPI
        // to that processor, have it walk its own stack, and wait
        // until it returns the data back to us.
        auto& proc = Processor::current();
        smp_unicast(
            thread.cpu(),
            [&]() {
                dbgln("CPU[{}] getting stack for cpu #{}", Processor::id(), proc.get_id());
                ProcessPagingScope paging_scope(thread.process());
                VERIFY(&Processor::current() != &proc);
                VERIFY(&thread == Processor::current_thread());
                // NOTE: Because the other processor is still holding the
                //       scheduler lock while waiting for this callback to finish,
                //       the current thread on the target processor cannot change.

                // TODO: What to do about page faults here? We might deadlock
                //       because the other processor is still holding the
                //       scheduler lock...
                capture_current_thread();
            },
            false);
    } else {
        switch (thread.state()) {
        case Thread::Running:
            VERIFY_NOT_REACHED(); // should have been handled above
        case Thread::Runnable:
        case Thread::Stopped:
        case Thread::Blocked:
        case Thread::Dying:
        case Thread::Dead: {
            // We need to retrieve ebp from what was last pushed to the kernel
            // stack. Before switching out of that thread, switch_context
            // pushed the callee-saved registers, and the last of them happens
            // to be ebp.
            ProcessPagingScope paging_scope(thread.process());
            auto& tss = thread.tss();
            u32* stack_top;
#if ARCH(I386)
            stack_top = reinterpret_cast<u32*>(tss.esp);
#else
            (void)tss;
            TODO();
#endif
            if (is_user_range(VirtualAddress(stack_top), sizeof(FlatPtr))) {
                if (!copy_from_user(&frame_ptr, &((FlatPtr*)stack_top)[0]))
                    frame_ptr = 0;
            } else {
                void* fault_at;
                if (!safe_memcpy(&frame_ptr, &((FlatPtr*)stack_top)[0], sizeof(FlatPtr), fault_at))
                    frame_ptr = 0;
            }
#if ARCH(I386)
            eip = tss.eip;
#else
            TODO();
#endif
            // TODO: We need to leave the scheduler lock here, but we also
            //       need to prevent the target thread from being run while
            //       we walk the stack.
            lock.unlock();
            walk_stack(frame_ptr);
            break;
        }
        default:
            dbgln("Cannot capture stack trace for thread {} in state {}", thread, thread.state_string());
            break;
        }
    }
    return stack_trace;
}

ProcessorContainer& Processor::processors()
{
    return s_processors;
}

Processor& Processor::by_id(u32 cpu)
{
    // s_processors does not need to be protected by a lock of any kind.
    // It is populated early in the boot process, and the BSP is waiting
    // for all APs to finish, after which this array never gets modified
    // again, so it's safe to not protect access to it here
    auto& procs = processors();
    VERIFY(procs.size() > cpu);
    VERIFY(procs[cpu] != nullptr);
    return *procs[cpu];
}
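
// Traps are linked into a per-thread chain (trap.next_trap) so that nested interrupts
// can be unwound in exit_trap(). The low two bits of the saved CS are the requested
// privilege level, which is how we tell whether the trap interrupted user mode (RPL 3)
// or kernel mode (RPL 0).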
void Processor::enter_trap(TrapFrame& trap, bool raise_irq)
{
    VERIFY_INTERRUPTS_DISABLED();
    VERIFY(&Processor::current() == this);
    trap.prev_irq_level = m_in_irq;
    if (raise_irq)
        m_in_irq++;
    auto* current_thread = Processor::current_thread();
    if (current_thread) {
        auto& current_trap = current_thread->current_trap();
        trap.next_trap = current_trap;
        current_trap = &trap;
        // The cs register of this trap tells us where we will return back to
        current_thread->set_previous_mode(((trap.regs->cs & 3) != 0) ? Thread::PreviousMode::UserMode : Thread::PreviousMode::KernelMode);
    } else {
        trap.next_trap = nullptr;
    }
}

void Processor::exit_trap(TrapFrame& trap)
{
    VERIFY_INTERRUPTS_DISABLED();
    VERIFY(&Processor::current() == this);
    VERIFY(m_in_irq >= trap.prev_irq_level);
    m_in_irq = trap.prev_irq_level;

    smp_process_pending_messages();

    if (!m_in_irq && !m_in_critical)
        check_invoke_scheduler();

    auto* current_thread = Processor::current_thread();
    if (current_thread) {
        auto& current_trap = current_thread->current_trap();
        current_trap = trap.next_trap;
        if (current_trap) {
            VERIFY(current_trap->regs);
            // If we have another higher level trap then we probably returned
            // from an interrupt or irq handler. The cs register of the
            // new/higher level trap tells us what the mode prior to it was
            current_thread->set_previous_mode(((current_trap->regs->cs & 3) != 0) ? Thread::PreviousMode::UserMode : Thread::PreviousMode::KernelMode);
        } else {
            // If we don't have a higher level trap then we're back in user mode.
            // Unless we're a kernel process, in which case we're always in kernel mode
            current_thread->set_previous_mode(current_thread->process().is_kernel_process() ? Thread::PreviousMode::KernelMode : Thread::PreviousMode::UserMode);
        }
    }
}

void Processor::check_invoke_scheduler()
{
    VERIFY(!m_in_irq);
    VERIFY(!m_in_critical);
    if (m_invoke_scheduler_async && m_scheduler_initialized) {
        m_invoke_scheduler_async = false;
        Scheduler::invoke_async();
    }
}
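
// INVLPG invalidates the TLB entry for a single page, so flushing a range means issuing
// one INVLPG per page. flush_tlb() below only broadcasts to other processors when SMP is
// enabled and the mapping could be visible to them (kernel addresses, or a process with
// more than one thread); otherwise a local flush is enough.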
void Processor::flush_tlb_local(VirtualAddress vaddr, size_t page_count)
{
    auto ptr = vaddr.as_ptr();
    while (page_count > 0) {
        // clang-format off
        asm volatile("invlpg %0"
             :
             : "m"(*ptr)
             : "memory");
        // clang-format on
        ptr += PAGE_SIZE;
        page_count--;
    }
}

void Processor::flush_tlb(const PageDirectory* page_directory, VirtualAddress vaddr, size_t page_count)
{
    if (s_smp_enabled && (!is_user_address(vaddr) || Process::current()->thread_count() > 1))
        smp_broadcast_flush_tlb(page_directory, vaddr, page_count);
    else
        flush_tlb_local(vaddr, page_count);
}
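
// s_message_pool is a lock-free LIFO free list of ProcessorMessage objects, pushed and
// popped with compare-and-swap on the list head. Messages are never actually freed,
// which is what makes the pop in smp_get_from_pool() safe: even if another processor
// grabs the same message concurrently, reading msg->next can't touch freed memory, and
// the failed compare-exchange simply retries.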
void Processor::smp_return_to_pool(ProcessorMessage& msg)
{
    ProcessorMessage* next = nullptr;
    do {
        msg.next = next;
    } while (!atomic_compare_exchange_strong(&s_message_pool, next, &msg, AK::MemoryOrder::memory_order_acq_rel));
}

ProcessorMessage& Processor::smp_get_from_pool()
{
    ProcessorMessage* msg;

    // The assumption is that messages are never removed from the pool!
    for (;;) {
        msg = atomic_load(&s_message_pool, AK::MemoryOrder::memory_order_consume);
        if (!msg) {
            if (!Processor::current().smp_process_pending_messages()) {
                // TODO: pause for a bit?
            }
            continue;
        }
        // If another processor were to use this message in the meantime,
        // "msg" would still be valid (because it never gets freed). We'd detect
        // the race because the expected value "msg" would no longer match the
        // pool head, and the compare_exchange would fail. But accessing
        // "msg->next" is always safe here.
        if (atomic_compare_exchange_strong(&s_message_pool, msg, msg->next, AK::MemoryOrder::memory_order_acq_rel)) {
            // We successfully "popped" this available message
            break;
        }
    }

    VERIFY(msg != nullptr);
    return *msg;
}
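
// Idle processors advertise themselves with one bit each in s_idle_cpu_mask. To wake
// some of them we pick candidate bits, atomically clear them with fetch_and, and only
// the bits that were still set at that moment are CPUs we actually "claimed"; those are
// the ones that get an IPI. Anything already cleared by someone else is skipped and we
// try again.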
u32 Processor::smp_wake_n_idle_processors(u32 wake_count)
{
    VERIFY(Processor::current().in_critical());
    VERIFY(wake_count > 0);
    if (!s_smp_enabled)
        return 0;

    // Wake at most N - 1 processors
    if (wake_count >= Processor::count()) {
        wake_count = Processor::count() - 1;
        VERIFY(wake_count > 0);
    }

    u32 current_id = Processor::current().id();

    u32 did_wake_count = 0;
    auto& apic = APIC::the();
    while (did_wake_count < wake_count) {
        // Try to get a set of idle CPUs and flip them to busy
        u32 idle_mask = s_idle_cpu_mask.load(AK::MemoryOrder::memory_order_relaxed) & ~(1u << current_id);
        u32 idle_count = __builtin_popcountl(idle_mask);
        if (idle_count == 0)
            break; // No (more) idle processor available

        u32 found_mask = 0;
        for (u32 i = 0; i < idle_count; i++) {
            u32 cpu = __builtin_ffsl(idle_mask) - 1;
            idle_mask &= ~(1u << cpu);
            found_mask |= 1u << cpu;
        }

        idle_mask = s_idle_cpu_mask.fetch_and(~found_mask, AK::MemoryOrder::memory_order_acq_rel) & found_mask;
        if (idle_mask == 0)
            continue; // All of them were flipped to busy, try again
        idle_count = __builtin_popcountl(idle_mask);
        for (u32 i = 0; i < idle_count; i++) {
            u32 cpu = __builtin_ffsl(idle_mask) - 1;
            idle_mask &= ~(1u << cpu);

            // Send an IPI to that CPU to wake it up. There is a possibility
            // someone else woke it up as well, or that it woke up due to
            // a timer interrupt. But we tried hard to avoid this...
            apic.send_ipi(cpu);
            did_wake_count++;
        }
    }
    return did_wake_count;
}
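
// The message pool is sized at 100 messages per processor, and every message carries
// one ProcessorMessageEntry per processor so that a single broadcast message can sit on
// every CPU's incoming queue at the same time without any further allocation.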
UNMAP_AFTER_INIT void Processor::smp_enable()
{
    size_t msg_pool_size = Processor::count() * 100u;
    size_t msg_entries_cnt = Processor::count();

    auto msgs = new ProcessorMessage[msg_pool_size];
    auto msg_entries = new ProcessorMessageEntry[msg_pool_size * msg_entries_cnt];

    size_t msg_entry_i = 0;
    for (size_t i = 0; i < msg_pool_size; i++, msg_entry_i += msg_entries_cnt) {
        auto& msg = msgs[i];
        msg.next = i < msg_pool_size - 1 ? &msgs[i + 1] : nullptr;
        msg.per_proc_entries = &msg_entries[msg_entry_i];
        for (size_t k = 0; k < msg_entries_cnt; k++)
            msg_entries[msg_entry_i + k].msg = &msg;
    }

    atomic_store(&s_message_pool, &msgs[0], AK::MemoryOrder::memory_order_release);

    // Start sending IPI messages
    s_smp_enabled = true;
}

void Processor::smp_cleanup_message(ProcessorMessage& msg)
{
    switch (msg.type) {
    case ProcessorMessage::Callback:
        msg.callback_value().~Function();
        break;
    default:
        break;
    }
}
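
// Pending messages are drained by atomically exchanging the queue head with nullptr.
// Because senders push onto the head, the list comes out in LIFO order and is reversed
// before processing. Each message carries a count of processors that still have to
// handle it; the last one to finish an async message cleans it up and returns it to the
// pool, while synchronous messages are cleaned up by the sender once refs reaches zero.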
bool Processor::smp_process_pending_messages()
{
    bool did_process = false;
    u32 prev_flags;
    enter_critical(prev_flags);

    if (auto pending_msgs = atomic_exchange(&m_message_queue, nullptr, AK::MemoryOrder::memory_order_acq_rel)) {
        // We pulled the stack of pending messages in LIFO order, so we need to reverse the list first
        auto reverse_list =
            [](ProcessorMessageEntry* list) -> ProcessorMessageEntry* {
                ProcessorMessageEntry* rev_list = nullptr;
                while (list) {
                    auto next = list->next;
                    list->next = rev_list;
                    rev_list = list;
                    list = next;
                }
                return rev_list;
            };

        pending_msgs = reverse_list(pending_msgs);

        // now process in the right order
        ProcessorMessageEntry* next_msg;
        for (auto cur_msg = pending_msgs; cur_msg; cur_msg = next_msg) {
            next_msg = cur_msg->next;
            auto msg = cur_msg->msg;

            dbgln_if(SMP_DEBUG, "SMP[{}]: Processing message {}", id(), VirtualAddress(msg));

            switch (msg->type) {
            case ProcessorMessage::Callback:
                msg->invoke_callback();
                break;
            case ProcessorMessage::FlushTlb:
                if (is_user_address(VirtualAddress(msg->flush_tlb.ptr))) {
                    // We assume that we don't cross into kernel land!
                    VERIFY(is_user_range(VirtualAddress(msg->flush_tlb.ptr), msg->flush_tlb.page_count * PAGE_SIZE));
                    if (read_cr3() != msg->flush_tlb.page_directory->cr3()) {
                        // This processor isn't using this page directory right now, we can ignore this request
                        dbgln_if(SMP_DEBUG, "SMP[{}]: No need to flush {} pages at {}", id(), msg->flush_tlb.page_count, VirtualAddress(msg->flush_tlb.ptr));
                        break;
                    }
                }
                flush_tlb_local(VirtualAddress(msg->flush_tlb.ptr), msg->flush_tlb.page_count);
                break;
            }

            bool is_async = msg->async; // Need to cache this value *before* dropping the ref count!
            auto prev_refs = atomic_fetch_sub(&msg->refs, 1u, AK::MemoryOrder::memory_order_acq_rel);
            VERIFY(prev_refs != 0);
            if (prev_refs == 1) {
                // All processors handled this. If this is an async message,
                // we need to clean it up and return it to the pool
                if (is_async) {
                    smp_cleanup_message(*msg);
                    smp_return_to_pool(*msg);
                }
            }

            if (m_halt_requested.load(AK::MemoryOrder::memory_order_relaxed))
                halt_this();
        }
        did_process = true;
    } else if (m_halt_requested.load(AK::MemoryOrder::memory_order_relaxed)) {
        halt_this();
    }

    leave_critical(prev_flags);
    return did_process;
}
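
// smp_queue_message() returns true when the target CPU's queue was previously empty.
// Callers use that to decide whether an IPI is actually needed, presumably because a
// non-empty queue means the target has already been signaled or is already draining
// its messages, so sending another IPI would be redundant.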
bool Processor::smp_queue_message(ProcessorMessage& msg)
{
    // Note that it's quite possible that the other processor may pop
    // the queue at any given time. We rely on the fact that the messages
    // are pooled and never get freed!
    auto& msg_entry = msg.per_proc_entries[id()];
    VERIFY(msg_entry.msg == &msg);
    ProcessorMessageEntry* next = nullptr;
    do {
        msg_entry.next = next;
    } while (!atomic_compare_exchange_strong(&m_message_queue, next, &msg_entry, AK::MemoryOrder::memory_order_acq_rel));
    return next == nullptr;
}

void Processor::smp_broadcast_message(ProcessorMessage& msg)
{
    auto& cur_proc = Processor::current();

    dbgln_if(SMP_DEBUG, "SMP[{}]: Broadcast message {} to cpus: {} proc: {}", cur_proc.get_id(), VirtualAddress(&msg), count(), VirtualAddress(&cur_proc));

    atomic_store(&msg.refs, count() - 1, AK::MemoryOrder::memory_order_release);
    VERIFY(msg.refs > 0);
    bool need_broadcast = false;
    for_each(
        [&](Processor& proc) {
            if (&proc != &cur_proc) {
                if (proc.smp_queue_message(msg))
                    need_broadcast = true;
            }
        });

    // Now trigger an IPI on all other APs (unless all targets already had messages queued)
    if (need_broadcast)
        APIC::the().broadcast_ipi();
}

void Processor::smp_broadcast_wait_sync(ProcessorMessage& msg)
{
    auto& cur_proc = Processor::current();
    VERIFY(!msg.async);
    // Because the message is synchronous, we (the sender) must clean it up and return it
    // to the pool once everybody is done with it. Otherwise, the last processor to
    // complete it would return it.
    while (atomic_load(&msg.refs, AK::MemoryOrder::memory_order_consume) != 0) {
        // TODO: pause for a bit?

        // We need to process any messages that may have been sent to
        // us while we're waiting. This also checks if another processor
        // may have requested us to halt.
        cur_proc.smp_process_pending_messages();
    }

    smp_cleanup_message(msg);
    smp_return_to_pool(msg);
}

void Processor::smp_broadcast(Function<void()> callback, bool async)
{
    auto& msg = smp_get_from_pool();
    msg.async = async;
    msg.type = ProcessorMessage::Callback;
    new (msg.callback_storage) ProcessorMessage::CallbackFunction(move(callback));
    smp_broadcast_message(msg);
    if (!async)
        smp_broadcast_wait_sync(msg);
}

void Processor::smp_unicast_message(u32 cpu, ProcessorMessage& msg, bool async)
{
    auto& cur_proc = Processor::current();
    VERIFY(cpu != cur_proc.get_id());
    auto& target_proc = processors()[cpu];
    msg.async = async;

    dbgln_if(SMP_DEBUG, "SMP[{}]: Send message {} to cpu #{} proc: {}", cur_proc.get_id(), VirtualAddress(&msg), cpu, VirtualAddress(&target_proc));

    atomic_store(&msg.refs, 1u, AK::MemoryOrder::memory_order_release);
    if (target_proc->smp_queue_message(msg)) {
        APIC::the().send_ipi(cpu);
    }

    if (!async) {
        // If the message is synchronous, we must clean it up and return it to the pool
        // ourselves. Otherwise, the last processor to complete it will return it.
        while (atomic_load(&msg.refs, AK::MemoryOrder::memory_order_consume) != 0) {
            // TODO: pause for a bit?

            // We need to process any messages that may have been sent to
            // us while we're waiting. This also checks if another processor
            // may have requested us to halt.
            cur_proc.smp_process_pending_messages();
        }

        smp_cleanup_message(msg);
        smp_return_to_pool(msg);
    }
}

void Processor::smp_unicast(u32 cpu, Function<void()> callback, bool async)
{
    auto& msg = smp_get_from_pool();
    msg.type = ProcessorMessage::Callback;
    new (msg.callback_storage) ProcessorMessage::CallbackFunction(move(callback));
    smp_unicast_message(cpu, msg, async);
}

void Processor::smp_broadcast_flush_tlb(const PageDirectory* page_directory, VirtualAddress vaddr, size_t page_count)
{
    auto& msg = smp_get_from_pool();
    msg.async = false;
    msg.type = ProcessorMessage::FlushTlb;
    msg.flush_tlb.page_directory = page_directory;
    msg.flush_tlb.ptr = vaddr.as_ptr();
    msg.flush_tlb.page_count = page_count;
    smp_broadcast_message(msg);
    // While the other processors handle this request, we'll flush ours
    flush_tlb_local(vaddr, page_count);
    // Now wait until everybody is done as well
    smp_broadcast_wait_sync(msg);
}

void Processor::smp_broadcast_halt()
{
    // We don't want to use a message, because this could have been triggered
    // by being out of memory and we might not be able to get a message
    for_each(
        [&](Processor& proc) {
            proc.m_halt_requested.store(true, AK::MemoryOrder::memory_order_release);
        });

    // Now trigger an IPI on all other APs
    APIC::the().broadcast_ipi();
}

void Processor::halt()
{
    if (s_smp_enabled)
        smp_broadcast_halt();

    halt_this();
}
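
// Deferred calls use a small fixed-size, per-processor pool of DeferredCallEntry objects
// kept on a free list. If the pool runs dry, deferred_call_get_free() falls back to heap
// allocation and marks the entry was_allocated so it gets deleted after execution
// instead of being returned to the pool.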
UNMAP_AFTER_INIT void Processor::deferred_call_pool_init()
{
    size_t pool_count = sizeof(m_deferred_call_pool) / sizeof(m_deferred_call_pool[0]);
    for (size_t i = 0; i < pool_count; i++) {
        auto& entry = m_deferred_call_pool[i];
        entry.next = i < pool_count - 1 ? &m_deferred_call_pool[i + 1] : nullptr;
        new (entry.handler_storage) DeferredCallEntry::HandlerFunction;
        entry.was_allocated = false;
    }
    m_pending_deferred_calls = nullptr;
    m_free_deferred_call_pool_entry = &m_deferred_call_pool[0];
}

void Processor::deferred_call_return_to_pool(DeferredCallEntry* entry)
{
    VERIFY(m_in_critical);
    VERIFY(!entry->was_allocated);

    entry->handler_value() = {};

    entry->next = m_free_deferred_call_pool_entry;
    m_free_deferred_call_pool_entry = entry;
}

DeferredCallEntry* Processor::deferred_call_get_free()
{
    VERIFY(m_in_critical);

    if (m_free_deferred_call_pool_entry) {
        // Fast path, we have an entry in our pool
        auto* entry = m_free_deferred_call_pool_entry;
        m_free_deferred_call_pool_entry = entry->next;
        VERIFY(!entry->was_allocated);
        return entry;
    }

    auto* entry = new DeferredCallEntry;
    new (entry->handler_storage) DeferredCallEntry::HandlerFunction;
    entry->was_allocated = true;
    return entry;
}

void Processor::deferred_call_execute_pending()
{
    VERIFY(m_in_critical);

    if (!m_pending_deferred_calls)
        return;
    auto* pending_list = m_pending_deferred_calls;
    m_pending_deferred_calls = nullptr;

    // We pulled the stack of pending deferred calls in LIFO order, so we need to reverse the list first
    auto reverse_list =
        [](DeferredCallEntry* list) -> DeferredCallEntry* {
            DeferredCallEntry* rev_list = nullptr;
            while (list) {
                auto next = list->next;
                list->next = rev_list;
                rev_list = list;
                list = next;
            }
            return rev_list;
        };
    pending_list = reverse_list(pending_list);

    do {
        pending_list->invoke_handler();

        // Return the entry back to the pool, or free it
        auto* next = pending_list->next;
        if (pending_list->was_allocated) {
            pending_list->handler_value().~Function();
            delete pending_list;
        } else
            deferred_call_return_to_pool(pending_list);
        pending_list = next;
    } while (pending_list);
}

void Processor::deferred_call_queue_entry(DeferredCallEntry* entry)
{
    VERIFY(m_in_critical);
    entry->next = m_pending_deferred_calls;
    m_pending_deferred_calls = entry;
}

void Processor::deferred_call_queue(Function<void()> callback)
{
    // NOTE: If we are called outside of a critical section and outside
    //       of an irq handler, the function will be executed before we return!
    ScopedCritical critical;
    auto& cur_proc = Processor::current();

    auto* entry = cur_proc.deferred_call_get_free();
    entry->handler_value() = move(callback);

    cur_proc.deferred_call_queue_entry(entry);
}
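
// gdt_init() lays out this processor's GDT: the mandatory null descriptor, kernel and
// user code/data segments, a TLS segment for userspace, a per-processor segment that is
// loaded into fs so Processor::current() can find its own Processor object, and finally
// the TSS descriptor that gets loaded into the task register.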
UNMAP_AFTER_INIT void Processor::gdt_init()
{
    m_gdt_length = 0;
    m_gdtr.address = nullptr;
    m_gdtr.limit = 0;

    write_raw_gdt_entry(0x0000, 0x00000000, 0x00000000);
    write_raw_gdt_entry(GDT_SELECTOR_CODE0, 0x0000ffff, 0x00cf9a00); // code0
    write_raw_gdt_entry(GDT_SELECTOR_DATA0, 0x0000ffff, 0x00cf9200); // data0
    write_raw_gdt_entry(GDT_SELECTOR_CODE3, 0x0000ffff, 0x00cffa00); // code3
    write_raw_gdt_entry(GDT_SELECTOR_DATA3, 0x0000ffff, 0x00cff200); // data3

    Descriptor tls_descriptor {};
    tls_descriptor.low = tls_descriptor.high = 0;
    tls_descriptor.dpl = 3;
    tls_descriptor.segment_present = 1;
    tls_descriptor.granularity = 0;
    tls_descriptor.operation_size64 = 0;
    tls_descriptor.operation_size32 = 1;
    tls_descriptor.descriptor_type = 1;
    tls_descriptor.type = 2;
    write_gdt_entry(GDT_SELECTOR_TLS, tls_descriptor); // tls3

    Descriptor fs_descriptor {};
    fs_descriptor.set_base(VirtualAddress { this });
    fs_descriptor.set_limit(sizeof(Processor));
    fs_descriptor.dpl = 0;
    fs_descriptor.segment_present = 1;
    fs_descriptor.granularity = 0;
    fs_descriptor.operation_size64 = 0;
    fs_descriptor.operation_size32 = 1;
    fs_descriptor.descriptor_type = 1;
    fs_descriptor.type = 2;
    write_gdt_entry(GDT_SELECTOR_PROC, fs_descriptor); // fs0

    Descriptor tss_descriptor {};
    tss_descriptor.set_base(VirtualAddress { &m_tss });
    tss_descriptor.set_limit(sizeof(TSS32));
    tss_descriptor.dpl = 0;
    tss_descriptor.segment_present = 1;
    tss_descriptor.granularity = 0;
    tss_descriptor.operation_size64 = 0;
    tss_descriptor.operation_size32 = 1;
    tss_descriptor.descriptor_type = 0;
    tss_descriptor.type = 9;
    write_gdt_entry(GDT_SELECTOR_TSS, tss_descriptor); // tss

    flush_gdt();
    load_task_register(GDT_SELECTOR_TSS);

    asm volatile(
        "mov %%ax, %%ds\n"
        "mov %%ax, %%es\n"
        "mov %%ax, %%gs\n"
        "mov %%ax, %%ss\n" ::"a"(GDT_SELECTOR_DATA0)
        : "memory");
    set_fs(GDT_SELECTOR_PROC);

#if ARCH(I386)
    // Make sure CS points to the kernel code descriptor.
    // clang-format off
    asm volatile(
        "ljmpl $" __STRINGIFY(GDT_SELECTOR_CODE0) ", $sanity\n"
        "sanity:\n");
    // clang-format on
#endif
}

}