Kernel: Enable PAE (Physical Address Extension)

Introduce one more (CPU) indirection layer in the paging code: the page
directory pointer table (PDPT). Each PageDirectory now has 4 separate
PageDirectoryEntry arrays, governing 1 GB of VM each.
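
For illustration only (not part of the patch): with PAE enabled, a 32-bit
virtual address splits 2+9+9+12 instead of the old 10+10+12, which is exactly
the decomposition the reworked ensure_pte() below performs. A minimal
standalone sketch, with illustrative names:

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        uint32_t vaddr = 0xc0123456;                // any 32-bit virtual address
        uint32_t pdpt_index = (vaddr >> 30) & 0x3;  // picks one of 4 page directories (1 GB each)
        uint32_t pd_index = (vaddr >> 21) & 0x1ff;  // picks one of 512 PDEs (2 MB each)
        uint32_t pt_index = (vaddr >> 12) & 0x1ff;  // picks one of 512 PTEs (4 KB each)
        uint32_t offset = vaddr & 0xfff;            // byte offset inside the 4 KB page
        std::printf("pdpt=%u pd=%u pt=%u offset=0x%03x\n",
                    (unsigned)pdpt_index, (unsigned)pd_index,
                    (unsigned)pt_index, (unsigned)offset);
        return 0;
    }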

A really neat side-effect of this is that we can now share the physical
page containing the >=3GB kernel-only address space metadata between
all processes, instead of lazily cloning it on page faults.
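
A minimal sketch of that sharing (helper and names are illustrative, not from
the tree; the real code is in the PageDirectory constructor below): every
process PDPT reuses the kernel's page directory for the top 1 GB, so kernel
mappings are visible everywhere by construction:

    #include <cstdint>

    struct Pdpt {
        uint64_t raw[4]; // physical address of a page directory | Present bit
    };

    // Illustrative helper: wire up a freshly created process PDPT.
    void init_process_pdpt(Pdpt& pdpt, const uint64_t user_dirs[3], uint64_t kernel_dir)
    {
        for (int i = 0; i < 3; ++i)
            pdpt.raw[i] = user_dirs[i] | 1; // 0-3 GB: per-process page directories
        pdpt.raw[3] = kernel_dir | 1;       // 3-4 GB: the kernel's directory page, shared by all
    }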

This will give us access to the NX (No eXecute) bit, allowing us to
prevent execution of memory that's not supposed to be executed.
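
Since entries are now 64 bits wide, NX is simply bit 63 of a PDE/PTE, matching
the NoExecute flag added below. A minimal sketch (note the CPU only honours
the bit once NXE is set in the EFER MSR, which this patch does not do yet):

    #include <cstdint>

    constexpr uint64_t NoExecute = 0x8000000000000000ULL; // bit 63, same value the patch adds

    // Illustrative only: mark a raw 64-bit paging entry as non-executable.
    void set_no_execute(uint64_t& entry_raw)
    {
        entry_raw |= NoExecute;
    }
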
Andreas Kling 2019-12-25 11:22:16 +01:00
parent 4883176fd8
commit 52deb09382
7 changed files with 84 additions and 81 deletions

View file

@@ -34,7 +34,7 @@ stack_top:
.section .page_tables
.align 4096
page_tables_start:
.skip 4096*3
.skip 4096*9
.section .text

View file

@@ -9,6 +9,7 @@
#define PAGE_MASK 0xfffff000
class MemoryManager;
class PageDirectory;
class PageTableEntry;
struct [[gnu::packed]] TSS32
@@ -89,12 +90,12 @@ public:
PageTableEntry* page_table_base() { return reinterpret_cast<PageTableEntry*>(m_raw & 0xfffff000u); }
void set_page_table_base(u32 value)
{
m_raw &= 0xfff;
m_raw &= 0x8000000000000fffULL;
m_raw |= value & 0xfffff000;
}
u32 raw() const { return m_raw; }
void copy_from(Badge<MemoryManager>, const PageDirectoryEntry& other) { m_raw = other.m_raw; }
u64 raw() const { return m_raw; }
void copy_from(Badge<PageDirectory>, const PageDirectoryEntry& other) { m_raw = other.m_raw; }
enum Flags {
Present = 1 << 0,
@@ -103,6 +104,7 @@ public:
WriteThrough = 1 << 3,
CacheDisabled = 1 << 4,
Global = 1 << 8,
NoExecute = 0x8000000000000000ULL,
};
bool is_present() const { return raw() & Present; }
@@ -123,7 +125,10 @@ public:
bool is_global() const { return raw() & Global; }
void set_global(bool b) { set_bit(Global, b); }
void set_bit(u32 bit, bool value)
bool is_execute_disabled() const { return raw() & NoExecute; }
void set_execute_disabled(bool b) { set_bit(NoExecute, b); }
void set_bit(u64 bit, bool value)
{
if (value)
m_raw |= bit;
@@ -132,7 +137,7 @@ public:
}
private:
u32 m_raw;
u64 m_raw;
};
class PageTableEntry {
@@ -140,11 +145,11 @@ public:
void* physical_page_base() { return reinterpret_cast<void*>(m_raw & 0xfffff000u); }
void set_physical_page_base(u32 value)
{
m_raw &= 0xfff;
m_raw &= 0x8000000000000fffULL;
m_raw |= value & 0xfffff000;
}
u32 raw() const { return m_raw; }
u64 raw() const { return (u32)m_raw; }
enum Flags {
Present = 1 << 0,
@@ -153,6 +158,7 @@ public:
WriteThrough = 1 << 3,
CacheDisabled = 1 << 4,
Global = 1 << 8,
NoExecute = 0x8000000000000000ULL,
};
bool is_present() const { return raw() & Present; }
@@ -173,7 +179,10 @@ public:
bool is_global() const { return raw() & Global; }
void set_global(bool b) { set_bit(Global, b); }
void set_bit(u32 bit, bool value)
bool is_execute_disabled() const { return raw() & NoExecute; }
void set_execute_disabled(bool b) { set_bit(NoExecute, b); }
void set_bit(u64 bit, bool value)
{
if (value)
m_raw |= bit;
@@ -182,11 +191,21 @@ public:
}
private:
u32 m_raw;
u64 m_raw;
};
static_assert(sizeof(PageDirectoryEntry) == 4);
static_assert(sizeof(PageTableEntry) == 4);
static_assert(sizeof(PageDirectoryEntry) == 8);
static_assert(sizeof(PageTableEntry) == 8);
class PageDirectoryPointerTable {
public:
PageDirectoryEntry* directory(size_t index)
{
return (PageDirectoryEntry*)(raw[index] & ~0xfffu);
}
u64 raw[4];
};
class IRQHandler;
struct RegisterDump;

View file

@@ -100,7 +100,6 @@ Thread::Thread(Process& process)
m_tss.esp0 = m_kernel_stack_top;
kprintf("Allocated ring0 stack @ %p - %p\n", m_kernel_stack_base, m_kernel_stack_top);
}
m_process.page_directory().update_kernel_mappings();
// HACK: Ring2 SS in the TSS is the current PID.
m_tss.ss2 = m_process.pid();

View file

@@ -23,8 +23,11 @@ MemoryManager& MM
MemoryManager::MemoryManager(u32 physical_address_for_kernel_page_tables)
{
m_kernel_page_directory = PageDirectory::create_at_fixed_address(PhysicalAddress(physical_address_for_kernel_page_tables));
m_page_table_zero = (PageTableEntry*)(physical_address_for_kernel_page_tables + PAGE_SIZE);
m_page_table_one = (PageTableEntry*)(physical_address_for_kernel_page_tables + PAGE_SIZE * 2);
for (size_t i = 0; i < 4; ++i) {
m_low_page_tables[i] = (PageTableEntry*)(physical_address_for_kernel_page_tables + PAGE_SIZE * (5 + i));
memset(m_low_page_tables[i], 0, PAGE_SIZE);
}
initialize_paging();
kprintf("MM initialized.\n");
@@ -34,21 +37,8 @@ MemoryManager::~MemoryManager()
{
}
void MemoryManager::populate_page_directory(PageDirectory& page_directory)
{
page_directory.m_directory_page = allocate_supervisor_physical_page();
page_directory.entries()[0].copy_from({}, kernel_page_directory().entries()[0]);
page_directory.entries()[1].copy_from({}, kernel_page_directory().entries()[1]);
// Defer to the kernel page tables for 0xC0000000-0xFFFFFFFF
for (int i = 768; i < 1024; ++i)
page_directory.entries()[i].copy_from({}, kernel_page_directory().entries()[i]);
}
void MemoryManager::initialize_paging()
{
memset(m_page_table_zero, 0, PAGE_SIZE);
memset(m_page_table_one, 0, PAGE_SIZE);
#ifdef MM_DEBUG
dbgprintf("MM: Kernel page directory @ %p\n", kernel_page_directory().cr3());
#endif
@@ -171,6 +161,12 @@ void MemoryManager::initialize_paging()
"orl $0x80, %eax\n"
"mov %eax, %cr4\n");
// Turn on CR4.PAE
asm volatile(
"mov %cr4, %eax\n"
"orl $0x20, %eax\n"
"mov %eax, %cr4\n");
asm volatile("movl %%eax, %%cr3" ::"a"(kernel_page_directory().cr3()));
asm volatile(
"movl %%cr0, %%eax\n"
@@ -186,30 +182,23 @@ void MemoryManager::initialize_paging()
PageTableEntry& MemoryManager::ensure_pte(PageDirectory& page_directory, VirtualAddress vaddr)
{
ASSERT_INTERRUPTS_DISABLED();
u32 page_directory_index = (vaddr.get() >> 22) & 0x3ff;
u32 page_table_index = (vaddr.get() >> 12) & 0x3ff;
u32 page_directory_table_index = (vaddr.get() >> 30) & 0x3;
u32 page_directory_index = (vaddr.get() >> 21) & 0x1ff;
u32 page_table_index = (vaddr.get() >> 12) & 0x1ff;
PageDirectoryEntry& pde = page_directory.entries()[page_directory_index];
PageDirectoryEntry& pde = page_directory.table().directory(page_directory_table_index)[page_directory_index];
if (!pde.is_present()) {
#ifdef MM_DEBUG
dbgprintf("MM: PDE %u not present (requested for V%p), allocating\n", page_directory_index, vaddr.get());
#endif
if (page_directory_index == 0) {
if (page_directory_table_index == 0 && page_directory_index < 4) {
ASSERT(&page_directory == m_kernel_page_directory);
pde.set_page_table_base((u32)m_page_table_zero);
pde.set_user_allowed(false);
pde.set_present(true);
pde.set_writable(true);
pde.set_global(true);
} else if (page_directory_index == 1) {
ASSERT(&page_directory == m_kernel_page_directory);
pde.set_page_table_base((u32)m_page_table_one);
pde.set_page_table_base((u32)m_low_page_tables[page_directory_index]);
pde.set_user_allowed(false);
pde.set_present(true);
pde.set_writable(true);
pde.set_global(true);
} else {
//ASSERT(&page_directory != m_kernel_page_directory.ptr());
auto page_table = allocate_supervisor_physical_page();
#ifdef MM_DEBUG
dbgprintf("MM: PD K%p (%s) at P%p allocated page table #%u (for V%p) at P%p\n",
@@ -220,7 +209,6 @@ PageTableEntry& MemoryManager::ensure_pte(PageDirectory& page_directory, VirtualAddress vaddr)
vaddr.get(),
page_table->paddr().get());
#endif
pde.set_page_table_base(page_table->paddr().get());
pde.set_user_allowed(true);
pde.set_present(true);
@@ -322,21 +310,6 @@ PageFaultResponse MemoryManager::handle_page_fault(const PageFault& fault)
dbgprintf("MM: handle_page_fault(%w) at V%p\n", fault.code(), fault.vaddr().get());
#endif
ASSERT(fault.vaddr() != m_quickmap_addr);
if (fault.type() == PageFault::Type::PageNotPresent && fault.vaddr().get() >= 0xc0000000) {
auto* current_page_directory = reinterpret_cast<PageDirectoryEntry*>(cpu_cr3());
u32 page_directory_index = (fault.vaddr().get() >> 22) & 0x3ff;
auto& kernel_pde = kernel_page_directory().entries()[page_directory_index];
auto& current_pde = current_page_directory[page_directory_index];
if (kernel_pde.is_present() && !current_pde.is_present()) {
#ifdef PAGE_FAULT_DEBUG
dbg() << "NP(kernel): Copying new kernel mapping for " << fault.vaddr() << " into current page directory";
#endif
current_pde.copy_from({}, kernel_pde);
flush_tlb(fault.vaddr().page_base());
return PageFaultResponse::Continue;
}
}
auto* region = region_from_vaddr(fault.vaddr());
if (!region) {
kprintf("NP(error) fault at invalid address V%p\n", fault.vaddr().get());
@@ -494,11 +467,6 @@ void MemoryManager::enter_process_paging_scope(Process& process)
ASSERT(current);
InterruptDisabler disabler;
// NOTE: To prevent triple-faulting here, we have to ensure that the current stack
// is accessible to the incoming page directory. We achieve this by forcing
// an update of the kernel VM mappings in the entered scope's page directory.
process.page_directory().update_kernel_mappings();
current->tss().cr3 = process.page_directory().cr3();
asm volatile("movl %%eax, %%cr3" ::"a"(process.page_directory().cr3())
: "memory");

View file

@@ -42,8 +42,6 @@ public:
PageFaultResponse handle_page_fault(const PageFault&);
void populate_page_directory(PageDirectory&);
void enter_process_paging_scope(Process&);
bool validate_user_stack(const Process&, VirtualAddress) const;
@@ -114,8 +112,7 @@ private:
PageTableEntry& ensure_pte(PageDirectory&, VirtualAddress);
RefPtr<PageDirectory> m_kernel_page_directory;
PageTableEntry* m_page_table_zero { nullptr };
PageTableEntry* m_page_table_one { nullptr };
PageTableEntry* m_low_page_tables[4] { nullptr };
VirtualAddress m_quickmap_addr;

View file

@@ -24,7 +24,17 @@ RefPtr<PageDirectory> PageDirectory::find_by_cr3(u32 cr3)
PageDirectory::PageDirectory(PhysicalAddress paddr)
: m_range_allocator(VirtualAddress(0xc0000000), 0x3f000000)
{
m_directory_page = PhysicalPage::create(paddr, true, false);
m_directory_table = PhysicalPage::create(paddr, true, false);
m_directory_pages[0] = PhysicalPage::create(paddr.offset(PAGE_SIZE * 1), true, false);
m_directory_pages[1] = PhysicalPage::create(paddr.offset(PAGE_SIZE * 2), true, false);
m_directory_pages[2] = PhysicalPage::create(paddr.offset(PAGE_SIZE * 3), true, false);
m_directory_pages[3] = PhysicalPage::create(paddr.offset(PAGE_SIZE * 4), true, false);
table().raw[0] = (u64)m_directory_pages[0]->paddr().as_ptr() | 1;
table().raw[1] = (u64)m_directory_pages[1]->paddr().as_ptr() | 1;
table().raw[2] = (u64)m_directory_pages[2]->paddr().as_ptr() | 1;
table().raw[3] = (u64)m_directory_pages[3]->paddr().as_ptr() | 1;
InterruptDisabler disabler;
cr3_map().set(cr3(), this);
}
@@ -33,7 +43,26 @@ PageDirectory::PageDirectory(Process& process, const RangeAllocator* parent_range_allocator)
: m_process(&process)
, m_range_allocator(parent_range_allocator ? RangeAllocator(*parent_range_allocator) : RangeAllocator(VirtualAddress(userspace_range_base), kernelspace_range_base - userspace_range_base))
{
MM.populate_page_directory(*this);
// Set up a userspace page directory
m_directory_table = MM.allocate_supervisor_physical_page();
m_directory_pages[0] = MM.allocate_supervisor_physical_page();
m_directory_pages[1] = MM.allocate_supervisor_physical_page();
m_directory_pages[2] = MM.allocate_supervisor_physical_page();
// Share the top 1 GB of kernel-only mappings (>=3GB or >=0xc0000000)
m_directory_pages[3] = MM.kernel_page_directory().m_directory_pages[3];
table().raw[0] = (u64)m_directory_pages[0]->paddr().as_ptr() | 1;
table().raw[1] = (u64)m_directory_pages[1]->paddr().as_ptr() | 1;
table().raw[2] = (u64)m_directory_pages[2]->paddr().as_ptr() | 1;
table().raw[3] = (u64)m_directory_pages[3]->paddr().as_ptr() | 1;
// Clone bottom 8 MB of mappings from kernel_page_directory
table().directory(0)[0].copy_from({}, MM.kernel_page_directory().table().directory(0)[0]);
table().directory(0)[1].copy_from({}, MM.kernel_page_directory().table().directory(0)[1]);
table().directory(0)[2].copy_from({}, MM.kernel_page_directory().table().directory(0)[2]);
table().directory(0)[3].copy_from({}, MM.kernel_page_directory().table().directory(0)[3]);
InterruptDisabler disabler;
cr3_map().set(cr3(), this);
}
@@ -57,11 +86,3 @@ void PageDirectory::flush(VirtualAddress vaddr)
if (this == &MM.kernel_page_directory() || &current->process().page_directory() == this)
MM.flush_tlb(vaddr);
}
void PageDirectory::update_kernel_mappings()
{
// This ensures that the kernel virtual address space is up-to-date in this page directory.
// This may be necessary to avoid triple faulting when entering a process's paging scope
// whose mappings are out-of-date.
memcpy(entries() + 768, MM.kernel_page_directory().entries() + 768, sizeof(PageDirectoryEntry) * 256);
}

View file

@@ -21,8 +21,8 @@ public:
~PageDirectory();
u32 cr3() const { return m_directory_page->paddr().get(); }
PageDirectoryEntry* entries() { return reinterpret_cast<PageDirectoryEntry*>(cr3()); }
u32 cr3() const { return m_directory_table->paddr().get(); }
PageDirectoryPointerTable& table() { return *reinterpret_cast<PageDirectoryPointerTable*>(cr3()); }
void flush(VirtualAddress);
@@ -31,14 +31,13 @@ public:
Process* process() { return m_process; }
const Process* process() const { return m_process; }
void update_kernel_mappings();
private:
PageDirectory(Process&, const RangeAllocator* parent_range_allocator);
explicit PageDirectory(PhysicalAddress);
Process* m_process { nullptr };
RangeAllocator m_range_allocator;
RefPtr<PhysicalPage> m_directory_page;
RefPtr<PhysicalPage> m_directory_table;
RefPtr<PhysicalPage> m_directory_pages[4];
HashMap<unsigned, RefPtr<PhysicalPage>> m_physical_pages;
};