Kernel: Implement lazy committed page allocation

By designating a committed page pool we can guarantee that physical
pages are available for lazy allocation in mappings. However, when
forking we will overcommit. The assumption is that, in the worst case,
it is better for the fork to die on a COW access due to insufficient
physical memory than for the parent that created the region to die.
If a fork wants to ensure that all memory is available (i.e. trigger
a commit), it can use madvise.

This also means that fork can now fail gracefully if we don't have
enough physical pages available.
Tom committed 4 years ago
commit b2a52f6208
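
The userspace-visible effect of the last point is that fork() can now fail
up front instead of the child being killed later on a COW fault: sys$fork
returns -ENOMEM (see the fork.cpp hunk below), which userspace sees as
fork() returning -1 with errno set to ENOMEM. A minimal sketch of a caller
handling that case (ordinary POSIX usage, not code from this commit):

#include <cerrno>
#include <cstdio>
#include <sys/wait.h>
#include <unistd.h>

int main()
{
    pid_t pid = fork();
    if (pid < 0) {
        // With this commit, cloning the parent's regions can fail cleanly.
        if (errno == ENOMEM)
            std::fprintf(stderr, "fork: not enough physical memory to clone the address space\n");
        else
            std::perror("fork");
        return 1;
    }
    if (pid == 0) {
        // Child: shares the parent's pages copy-on-write; writes may still
        // draw from an overcommitted pool, as the commit message explains.
        _exit(0);
    }
    waitpid(pid, nullptr, 0);
    return 0;
}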

+ 3 - 1
Kernel/Process.cpp

@@ -145,7 +145,9 @@ Region* Process::allocate_region(const Range& range, const String& name, int pro
 {
     ASSERT(range.is_valid());
     auto vmobject = PurgeableVMObject::create_with_size(range.size());
-    auto region = Region::create_user_accessible(this, range, vmobject, 0, name, prot_to_region_access_flags(prot));
+    if (!vmobject)
+        return nullptr;
+    auto region = Region::create_user_accessible(this, range, vmobject.release_nonnull(), 0, name, prot_to_region_access_flags(prot));
     if (!region->map(page_directory()))
         return nullptr;
     if (should_commit && region->can_commit() && !region->commit())

+ 3 - 3
Kernel/SharedBuffer.h

@@ -48,12 +48,12 @@ private:
     };
 
 public:
-    SharedBuffer(int id, int size)
+    SharedBuffer(int id, NonnullRefPtr<PurgeableVMObject>&& vmobject)
         : m_shbuf_id(id)
-        , m_vmobject(PurgeableVMObject::create_with_size(size))
+        , m_vmobject(move(vmobject))
     {
 #ifdef SHARED_BUFFER_DEBUG
-        dbg() << "Created shared buffer " << m_shbuf_id << " of size " << size;
+        dbg() << "Created shared buffer " << m_shbuf_id << " of size " << m_vmobject->size();
 #endif
     }
 

+ 8 - 1
Kernel/Syscalls/fork.cpp

@@ -88,7 +88,14 @@ pid_t Process::sys$fork(RegisterState& regs)
 #ifdef FORK_DEBUG
             dbg() << "fork: cloning Region{" << &region << "} '" << region.name() << "' @ " << region.vaddr();
 #endif
-            auto& child_region = child->add_region(region.clone());
+            auto region_clone = region.clone();
+            if (!region_clone) {
+                dbg() << "fork: Cannot clone region, insufficient memory";
+                // TODO: tear down new process?
+                return -ENOMEM;
+            }
+
+            auto& child_region = child->add_region(region_clone.release_nonnull());
             child_region.map(child->page_directory());
 
             if (&region == m_master_tls_region.unsafe_ptr())

+ 5 - 1
Kernel/Syscalls/shbuf.cpp

@@ -52,10 +52,14 @@ int Process::sys$shbuf_create(int size, void** buffer)
         return -EINVAL;
     size = PAGE_ROUND_UP(size);
 
+    auto vmobject = PurgeableVMObject::create_with_size(size);
+    if (!vmobject)
+        return -ENOMEM;
+
     LOCKER(shared_buffers().lock());
     static int s_next_shbuf_id;
     int shbuf_id = ++s_next_shbuf_id;
-    auto shared_buffer = make<SharedBuffer>(shbuf_id, size);
+    auto shared_buffer = make<SharedBuffer>(shbuf_id, vmobject.release_nonnull());
     shared_buffer->share_with(m_pid);
 
     void* address = shared_buffer->ref_for_process_and_get_address(*this);

+ 11 - 4
Kernel/VM/AnonymousVMObject.cpp

@@ -51,13 +51,15 @@ NonnullRefPtr<AnonymousVMObject> AnonymousVMObject::create_with_physical_page(Ph
     return vmobject;
 }
 
-AnonymousVMObject::AnonymousVMObject(size_t size)
+AnonymousVMObject::AnonymousVMObject(size_t size, bool initialize_pages)
     : VMObject(size)
 {
+    if (initialize_pages) {
 #ifndef MAP_SHARED_ZERO_PAGE_LAZILY
-    for (size_t i = 0; i < page_count(); ++i)
-        physical_pages()[i] = MM.shared_zero_page();
+        for (size_t i = 0; i < page_count(); ++i)
+            physical_pages()[i] = MM.shared_zero_page();
 #endif
+    }
 }
 
 AnonymousVMObject::AnonymousVMObject(PhysicalAddress paddr, size_t size)
@@ -77,9 +79,14 @@ AnonymousVMObject::~AnonymousVMObject()
 {
 }
 
-NonnullRefPtr<VMObject> AnonymousVMObject::clone()
+RefPtr<VMObject> AnonymousVMObject::clone()
 {
     return adopt(*new AnonymousVMObject(*this));
 }
 
+RefPtr<PhysicalPage> AnonymousVMObject::allocate_committed_page(size_t)
+{
+    return {};
+}
+
 }

+ 4 - 2
Kernel/VM/AnonymousVMObject.h

@@ -38,10 +38,12 @@ public:
     static NonnullRefPtr<AnonymousVMObject> create_with_size(size_t);
     static RefPtr<AnonymousVMObject> create_for_physical_range(PhysicalAddress, size_t);
     static NonnullRefPtr<AnonymousVMObject> create_with_physical_page(PhysicalPage&);
-    virtual NonnullRefPtr<VMObject> clone() override;
+    virtual RefPtr<VMObject> clone() override;
+
+    virtual RefPtr<PhysicalPage> allocate_committed_page(size_t);
 
 protected:
-    explicit AnonymousVMObject(size_t);
+    explicit AnonymousVMObject(size_t, bool initialize_pages = true);
     explicit AnonymousVMObject(const AnonymousVMObject&);
 
     virtual const char* class_name() const override { return "AnonymousVMObject"; }

+ 1 - 1
Kernel/VM/ContiguousVMObject.cpp

@@ -58,7 +58,7 @@ ContiguousVMObject::~ContiguousVMObject()
 {
 }
 
-NonnullRefPtr<VMObject> ContiguousVMObject::clone()
+RefPtr<VMObject> ContiguousVMObject::clone()
 {
     ASSERT_NOT_REACHED();
 }

+ 1 - 1
Kernel/VM/ContiguousVMObject.h

@@ -42,7 +42,7 @@ private:
     explicit ContiguousVMObject(const ContiguousVMObject&);
 
     virtual const char* class_name() const override { return "ContiguousVMObject"; }
-    virtual NonnullRefPtr<VMObject> clone() override;
+    virtual RefPtr<VMObject> clone() override;
 
     ContiguousVMObject& operator=(const ContiguousVMObject&) = delete;
     ContiguousVMObject& operator=(ContiguousVMObject&&) = delete;

+ 71 - 7
Kernel/VM/MemoryManager.cpp

@@ -78,7 +78,19 @@ MemoryManager::MemoryManager()
     write_cr3(kernel_page_directory().cr3());
     protect_kernel_image();
 
-    m_shared_zero_page = allocate_user_physical_page();
+    // We're temporarily "committing" to two pages that we need to allocate below
+    if (!commit_user_physical_pages(2))
+        ASSERT_NOT_REACHED();
+
+    m_shared_zero_page = allocate_committed_user_physical_page();
+
+    // We're wasting a page here, we just need a special tag (physical
+    // address) so that we know when we need to lazily allocate a page
+    // that we should be drawing this page from the committed pool rather
+    // than potentially failing if no pages are available anymore.
+    // By using a tag we don't have to query the VMObject for every page
+    // whether it was committed or not
+    m_lazy_committed_page = allocate_committed_user_physical_page();
 }
 
 MemoryManager::~MemoryManager()
@@ -192,6 +204,9 @@ void MemoryManager::parse_memory_map()
 
     ASSERT(m_super_physical_pages > 0);
     ASSERT(m_user_physical_pages > 0);
+
+    // We start out with no committed pages
+    m_user_physical_pages_uncommitted = m_user_physical_pages;
 }
 
 PageTableEntry* MemoryManager::pte(PageDirectory& page_directory, VirtualAddress vaddr)
@@ -469,6 +484,28 @@ OwnPtr<Region> MemoryManager::allocate_kernel_region_with_vmobject(VMObject& vmo
     return allocate_kernel_region_with_vmobject(range, vmobject, name, access, user_accessible, cacheable);
 }
 
+bool MemoryManager::commit_user_physical_pages(size_t page_count)
+{
+    ASSERT(page_count > 0);
+    ScopedSpinLock lock(s_mm_lock);
+    if (m_user_physical_pages_uncommitted < page_count)
+        return false;
+
+    m_user_physical_pages_uncommitted -= page_count;
+    m_user_physical_pages_committed += page_count;
+    return true;
+}
+
+void MemoryManager::uncommit_user_physical_pages(size_t page_count)
+{
+    ASSERT(page_count > 0);
+    ScopedSpinLock lock(s_mm_lock);
+    ASSERT(m_user_physical_pages_committed >= page_count);
+
+    m_user_physical_pages_uncommitted += page_count;
+    m_user_physical_pages_committed -= page_count;
+}
+
 void MemoryManager::deallocate_user_physical_page(const PhysicalPage& page)
 {
     ScopedSpinLock lock(s_mm_lock);
@@ -481,6 +518,10 @@ void MemoryManager::deallocate_user_physical_page(const PhysicalPage& page)
         region.return_page(page);
         --m_user_physical_pages_used;
 
+        // Always return pages to the uncommitted pool. Pages that were
+        // committed and allocated are only freed upon request. Once
+        // returned there is no guarantee being able to get them back.
+        ++m_user_physical_pages_uncommitted;
         return;
     }
 
@@ -488,22 +529,47 @@ void MemoryManager::deallocate_user_physical_page(const PhysicalPage& page)
     ASSERT_NOT_REACHED();
 }
 
-RefPtr<PhysicalPage> MemoryManager::find_free_user_physical_page()
+RefPtr<PhysicalPage> MemoryManager::find_free_user_physical_page(bool committed)
 {
     ASSERT(s_mm_lock.is_locked());
     RefPtr<PhysicalPage> page;
+    if (committed) {
+        // Draw from the committed pages pool. We should always have these pages available
+        ASSERT(m_user_physical_pages_committed > 0);
+        m_user_physical_pages_committed--;
+    } else {
+        // We need to make sure we don't touch pages that we have committed to
+        if (m_user_physical_pages_uncommitted == 0)
+            return {};
+        m_user_physical_pages_uncommitted--;
+    }
     for (auto& region : m_user_physical_regions) {
         page = region.take_free_page(false);
-        if (!page.is_null())
+        if (!page.is_null()) {
+            ++m_user_physical_pages_used;
             break;
+        }
     }
+    ASSERT(!committed || !page.is_null());
     return page;
 }
 
+NonnullRefPtr<PhysicalPage> MemoryManager::allocate_committed_user_physical_page(ShouldZeroFill should_zero_fill)
+{
+    ScopedSpinLock lock(s_mm_lock);
+    auto page = find_free_user_physical_page(true);
+    if (should_zero_fill == ShouldZeroFill::Yes) {
+        auto* ptr = quickmap_page(*page);
+        memset(ptr, 0, PAGE_SIZE);
+        unquickmap_page();
+    }
+    return page.release_nonnull();
+}
+
 RefPtr<PhysicalPage> MemoryManager::allocate_user_physical_page(ShouldZeroFill should_zero_fill, bool* did_purge)
 {
     ScopedSpinLock lock(s_mm_lock);
-    auto page = find_free_user_physical_page();
+    auto page = find_free_user_physical_page(false);
     bool purged_pages = false;
 
     if (!page) {
@@ -515,7 +581,7 @@ RefPtr<PhysicalPage> MemoryManager::allocate_user_physical_page(ShouldZeroFill s
             int purged_page_count = static_cast<PurgeableVMObject&>(vmobject).purge_with_interrupts_disabled({});
             if (purged_page_count) {
                 klog() << "MM: Purge saved the day! Purged " << purged_page_count << " pages from PurgeableVMObject{" << &vmobject << "}";
-                page = find_free_user_physical_page();
+                page = find_free_user_physical_page(false);
                 purged_pages = true;
                 ASSERT(page);
                 return IterationDecision::Break;
@@ -541,8 +607,6 @@ RefPtr<PhysicalPage> MemoryManager::allocate_user_physical_page(ShouldZeroFill s
 
     if (did_purge)
         *did_purge = purged_pages;
-
-    ++m_user_physical_pages_used;
     return page;
 }
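
A stand-alone model of the bookkeeping that commit_user_physical_pages,
uncommit_user_physical_pages and find_free_user_physical_page implement
above. This is an illustrative sketch with invented names; it leaves out
the spinlock and the actual physical-region free lists:

#include <cassert>
#include <cstddef>
#include <cstdio>

struct UserPagePool {
    size_t uncommitted { 0 }; // pages nobody has reserved yet
    size_t committed { 0 };   // pages reserved for later lazy allocation
    size_t used { 0 };        // pages actually handed out

    explicit UserPagePool(size_t total_pages)
        : uncommitted(total_pages)
    {
    }

    // Reserve pages up front; fails if the reservation cannot be honored.
    bool commit(size_t page_count)
    {
        if (uncommitted < page_count)
            return false;
        uncommitted -= page_count;
        committed += page_count;
        return true;
    }

    // Return unused reserved pages to the general pool.
    void uncommit(size_t page_count)
    {
        assert(committed >= page_count);
        committed -= page_count;
        uncommitted += page_count;
    }

    // Allocate one page, either from an earlier reservation (cannot fail
    // by construction) or from the general pool (may fail).
    bool allocate(bool from_committed)
    {
        if (from_committed) {
            assert(committed > 0);
            --committed;
        } else {
            if (uncommitted == 0)
                return false;
            --uncommitted;
        }
        ++used;
        return true;
    }

    // Freed pages always go back to the uncommitted side, mirroring
    // deallocate_user_physical_page above.
    void deallocate()
    {
        assert(used > 0);
        --used;
        ++uncommitted;
    }
};

int main()
{
    UserPagePool pool(8);
    assert(pool.commit(2));       // like the two pages reserved in the MemoryManager constructor
    assert(pool.allocate(true));  // e.g. the shared zero page
    assert(pool.allocate(true));  // e.g. the lazy-committed marker page
    assert(pool.allocate(false)); // an ordinary, uncommitted allocation
    pool.deallocate();
    std::printf("uncommitted=%zu committed=%zu used=%zu\n",
        pool.uncommitted, pool.committed, pool.used);
    return 0;
}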
 

+ 13 - 1
Kernel/VM/MemoryManager.h

@@ -110,6 +110,9 @@ public:
         Yes
     };
 
+    bool commit_user_physical_pages(size_t);
+    void uncommit_user_physical_pages(size_t);
+    NonnullRefPtr<PhysicalPage> allocate_committed_user_physical_page(ShouldZeroFill = ShouldZeroFill::Yes);
     RefPtr<PhysicalPage> allocate_user_physical_page(ShouldZeroFill = ShouldZeroFill::Yes, bool* did_purge = nullptr);
     RefPtr<PhysicalPage> allocate_supervisor_physical_page();
     NonnullRefPtrVector<PhysicalPage> allocate_contiguous_supervisor_physical_pages(size_t size);
@@ -155,6 +158,7 @@ public:
     void dump_kernel_regions();
 
     PhysicalPage& shared_zero_page() { return *m_shared_zero_page; }
+    PhysicalPage& lazy_committed_page() { return *m_lazy_committed_page; }
 
     PageDirectory& kernel_page_directory() { return *m_kernel_page_directory; }
 
@@ -185,7 +189,7 @@ private:
 
     static Region* find_region_from_vaddr(VirtualAddress);
 
-    RefPtr<PhysicalPage> find_free_user_physical_page();
+    RefPtr<PhysicalPage> find_free_user_physical_page(bool);
     u8* quickmap_page(PhysicalPage&);
     void unquickmap_page();
 
@@ -200,9 +204,12 @@ private:
     RefPtr<PhysicalPage> m_low_page_table;
 
     RefPtr<PhysicalPage> m_shared_zero_page;
+    RefPtr<PhysicalPage> m_lazy_committed_page;
 
     unsigned m_user_physical_pages { 0 };
     unsigned m_user_physical_pages_used { 0 };
+    unsigned m_user_physical_pages_committed { 0 };
+    unsigned m_user_physical_pages_uncommitted { 0 };
     unsigned m_super_physical_pages { 0 };
     unsigned m_super_physical_pages_used { 0 };
 
@@ -250,4 +257,9 @@ inline bool PhysicalPage::is_shared_zero_page() const
     return this == &MM.shared_zero_page();
 }
 
+inline bool PhysicalPage::is_lazy_committed_page() const
+{
+    return this == &MM.lazy_committed_page();
+}
+
 }
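
The lazy_committed_page/is_lazy_committed_page pair added here acts as a
sentinel: the address of one dedicated page marks VMObject slots whose
backing page has already been reserved, so the fault handler never has to
ask the VMObject whether a slot was committed. A stand-alone sketch of
that idea (illustrative names and types, not the kernel's):

#include <array>
#include <cassert>
#include <cstddef>

struct PhysicalPage { };

// One dedicated page per role; only their addresses matter as tags.
static PhysicalPage s_shared_zero_page;    // "reads as zero, allocate on write"
static PhysicalPage s_lazy_committed_page; // "a physical page is reserved for this slot"

static bool is_lazy_committed(const PhysicalPage* page)
{
    return page == &s_lazy_committed_page;
}

int main()
{
    // A freshly created committed VMObject tags every slot.
    std::array<const PhysicalPage*, 4> slots;
    slots.fill(&s_lazy_committed_page);
    size_t unused_committed_pages = slots.size();

    // On a page fault, looking at the slot alone tells us to draw from
    // the committed pool, which by construction cannot run dry.
    size_t faulting_index = 2;
    if (is_lazy_committed(slots[faulting_index])) {
        assert(unused_committed_pages > 0);
        --unused_committed_pages;
        static PhysicalPage freshly_allocated; // stands in for a real committed page
        slots[faulting_index] = &freshly_allocated;
    }
    assert(unused_committed_pages == 3);
    return 0;
}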

+ 1 - 0
Kernel/VM/PhysicalPage.h

@@ -64,6 +64,7 @@ public:
     u32 ref_count() const { return m_ref_count.load(AK::memory_order_consume); }
 
     bool is_shared_zero_page() const;
+    bool is_lazy_committed_page() const;
 
 private:
     PhysicalPage(PhysicalAddress paddr, bool supervisor, bool may_return_to_freelist = true);

+ 1 - 1
Kernel/VM/PrivateInodeVMObject.cpp

@@ -34,7 +34,7 @@ NonnullRefPtr<PrivateInodeVMObject> PrivateInodeVMObject::create_with_inode(Inod
     return adopt(*new PrivateInodeVMObject(inode, inode.size()));
 }
 
-NonnullRefPtr<VMObject> PrivateInodeVMObject::clone()
+RefPtr<VMObject> PrivateInodeVMObject::clone()
 {
     return adopt(*new PrivateInodeVMObject(*this));
 }

+ 1 - 1
Kernel/VM/PrivateInodeVMObject.h

@@ -39,7 +39,7 @@ public:
     virtual ~PrivateInodeVMObject() override;
 
     static NonnullRefPtr<PrivateInodeVMObject> create_with_inode(Inode&);
-    virtual NonnullRefPtr<VMObject> clone() override;
+    virtual RefPtr<VMObject> clone() override;
 
 private:
     virtual bool is_private_inode() const override { return true; }

+ 139 - 13
Kernel/VM/PurgeableVMObject.cpp

@@ -44,10 +44,10 @@ inline LogStream& operator<<(const LogStream& stream, const VolatilePageRange& r
 
 static void dump_volatile_page_ranges(const Vector<VolatilePageRange>& ranges)
 {
-   for (size_t i = 0; i < ranges.size(); i++) {
-       const auto& range = ranges[i];
-       klog() << "  [" << i << "] " << range;
-   }
+    for (size_t i = 0; i < ranges.size(); i++) {
+        const auto& range = ranges[i];
+        klog() << "  [" << i << "] " << range;
+    }
 }
 #endif
 
@@ -185,7 +185,7 @@ bool VolatilePageRanges::intersects(const VolatilePageRange& range) const
 }
 
 PurgeablePageRanges::PurgeablePageRanges(const VMObject& vmobject)
-    : m_volatile_ranges({0, vmobject.is_purgeable() ? static_cast<const PurgeableVMObject&>(vmobject).page_count() : 0})
+    : m_volatile_ranges({ 0, vmobject.is_purgeable() ? static_cast<const PurgeableVMObject&>(vmobject).page_count() : 0 })
 {
 }
 
@@ -193,8 +193,23 @@ bool PurgeablePageRanges::add_volatile_range(const VolatilePageRange& range)
 {
     if (range.is_empty())
         return false;
+
+    // Since we may need to call into PurgeableVMObject we need to acquire
+    // its lock as well, and acquire it first. This is important so that
+    // we don't deadlock when a page fault (e.g. on another processor)
+    // happens that is meant to lazy-allocate a committed page. It would
+    // call into PurgeableVMObject::range_made_volatile, which then would
+    // also call into this object and need to acquire m_lock. By acquiring
+    // the vmobject lock first in both cases, we avoid deadlocking.
+    // We can access m_vmobject without any locks for that purpose because
+    // add_volatile_range and remove_volatile_range can only be called
+    // by same object that calls set_vmobject.
+    ScopedSpinLock vmobject_lock(m_vmobject->m_lock);
     ScopedSpinLock lock(m_volatile_ranges_lock);
-    return m_volatile_ranges.add(range);
+    bool added = m_volatile_ranges.add(range);
+    if (added)
+        m_vmobject->range_made_volatile(range);
+    return added;
 }
 
 bool PurgeablePageRanges::remove_volatile_range(const VolatilePageRange& range, bool& was_purged)
@@ -202,6 +217,7 @@ bool PurgeablePageRanges::remove_volatile_range(const VolatilePageRange& range,
     if (range.is_empty())
         return false;
     ScopedSpinLock lock(m_volatile_ranges_lock);
+    ASSERT(m_vmobject);
     return m_volatile_ranges.remove(range, was_purged);
 }
 
@@ -213,35 +229,73 @@ bool PurgeablePageRanges::is_volatile_range(const VolatilePageRange& range) cons
     return m_volatile_ranges.intersects(range);
 }
 
+bool PurgeablePageRanges::is_volatile(size_t index) const
+{
+    ScopedSpinLock lock(m_volatile_ranges_lock);
+    return m_volatile_ranges.contains(index);
+}
+
 void PurgeablePageRanges::set_was_purged(const VolatilePageRange& range)
 {
     ScopedSpinLock lock(m_volatile_ranges_lock);
-    m_volatile_ranges.add({range.base, range.count, true});
+    m_volatile_ranges.add({ range.base, range.count, true });
+}
+
+void PurgeablePageRanges::set_vmobject(PurgeableVMObject* vmobject)
+{
+    // No lock needed here
+    if (vmobject) {
+        ASSERT(!m_vmobject);
+        m_vmobject = vmobject;
+    } else {
+        ASSERT(m_vmobject);
+        m_vmobject = nullptr;
+    }
 }
 
-NonnullRefPtr<PurgeableVMObject> PurgeableVMObject::create_with_size(size_t size)
+RefPtr<PurgeableVMObject> PurgeableVMObject::create_with_size(size_t size)
 {
+    // We need to attempt to commit before actually creating the object
+    if (!MM.commit_user_physical_pages(ceil_div(size, PAGE_SIZE)))
+        return {};
     return adopt(*new PurgeableVMObject(size));
 }
 
 PurgeableVMObject::PurgeableVMObject(size_t size)
-    : AnonymousVMObject(size)
+    : AnonymousVMObject(size, false)
+    , m_unused_committed_pages(page_count())
 {
+    for (size_t i = 0; i < page_count(); ++i)
+        physical_pages()[i] = MM.lazy_committed_page();
 }
 
 PurgeableVMObject::PurgeableVMObject(const PurgeableVMObject& other)
     : AnonymousVMObject(other)
     , m_purgeable_ranges() // do *not* clone this
+    , m_unused_committed_pages(other.m_unused_committed_pages)
 {
-    // TODO: what about m_lock?
+    // We can't really "copy" a spinlock. But we're holding it. Clear in the clone
+    ASSERT(other.m_lock.is_locked());
+    m_lock.initialize();
 }
 
 PurgeableVMObject::~PurgeableVMObject()
 {
+    if (m_unused_committed_pages > 0)
+        MM.uncommit_user_physical_pages(m_unused_committed_pages);
 }
 
-NonnullRefPtr<VMObject> PurgeableVMObject::clone()
+RefPtr<VMObject> PurgeableVMObject::clone()
 {
+    // We need to acquire our lock so we copy a sane state
+    ScopedSpinLock lock(m_lock);
+    if (m_unused_committed_pages > 0) {
+        // We haven't used up all committed pages. In order to be able
+        // to clone ourselves, we need to be able to commit the same number
+        // of pages first
+        if (!MM.commit_user_physical_pages(m_unused_committed_pages))
+            return {};
+    }
     return adopt(*new PurgeableVMObject(*this));
 }
 
@@ -275,8 +329,10 @@ int PurgeableVMObject::purge_impl()
         auto range_end = range.base + range.count;
         for (size_t i = range.base; i < range_end; i++) {
             auto& phys_page = m_physical_pages[i];
-            if (phys_page && !phys_page->is_shared_zero_page())
+            if (phys_page && !phys_page->is_shared_zero_page()) {
+                ASSERT(!phys_page->is_lazy_committed_page());
                 ++purged_in_range;
+            }
             phys_page = MM.shared_zero_page();
         }
 
@@ -291,7 +347,7 @@ int PurgeableVMObject::purge_impl()
                     } else {
                         klog() << "Purged " << purged_in_range << " pages from region " << region.name() << " (no ownership) at " << region.vaddr_from_page_index(range.base) << " - " << region.vaddr_from_page_index(range.base + range.count);
                     }
-                    region.remap_page_range(range.base, range.count, false);
+                    region.remap_page_range(range.base, range.count);
                 }
             });
         }
@@ -303,6 +359,7 @@ int PurgeableVMObject::purge_impl()
 void PurgeableVMObject::register_purgeable_page_ranges(PurgeablePageRanges& purgeable_page_ranges)
 {
     ScopedSpinLock lock(m_lock);
+    purgeable_page_ranges.set_vmobject(this);
     ASSERT(!m_purgeable_ranges.contains_slow(&purgeable_page_ranges));
     m_purgeable_ranges.append(&purgeable_page_ranges);
 }
@@ -313,6 +370,7 @@ void PurgeableVMObject::unregister_purgeable_page_ranges(PurgeablePageRanges& pu
     for (size_t i = 0; i < m_purgeable_ranges.size(); i++) {
         if (m_purgeable_ranges[i] != &purgeable_page_ranges)
             continue;
+        purgeable_page_ranges.set_vmobject(nullptr);
         m_purgeable_ranges.remove(i);
         return;
     }
@@ -330,4 +388,72 @@ bool PurgeableVMObject::is_any_volatile() const
     return false;
 }
 
+size_t PurgeableVMObject::remove_lazy_commit_pages(const VolatilePageRange& range)
+{
+    ASSERT(m_lock.is_locked());
+
+    size_t removed_count = 0;
+    auto range_end = range.base + range.count;
+    for (size_t i = range.base; i < range_end; i++) {
+        auto& phys_page = m_physical_pages[i];
+        if (phys_page && phys_page->is_lazy_committed_page()) {
+            phys_page = MM.shared_zero_page();
+            removed_count++;
+            ASSERT(m_unused_committed_pages > 0);
+            m_unused_committed_pages--;
+            //            if (--m_unused_committed_pages == 0)
+            //                break;
+        }
+    }
+    return removed_count;
+}
+
+void PurgeableVMObject::range_made_volatile(const VolatilePageRange& range)
+{
+    ASSERT(m_lock.is_locked());
+
+    if (m_unused_committed_pages == 0)
+        return;
+
+    // We need to check this range for any pages that are marked for
+    // lazy committed allocation and turn them into shared zero pages
+    // and also adjust the m_unused_committed_pages for each such page.
+    // Take into account all the other views as well.
+    size_t uncommit_page_count = 0;
+    for_each_volatile_range([&](const auto& r) {
+        auto intersected = range.intersected(r);
+        if (!intersected.is_empty()) {
+            uncommit_page_count += remove_lazy_commit_pages(intersected);
+            //            if (m_unused_committed_pages == 0)
+            //                return IterationDecision::Break;
+        }
+        return IterationDecision::Continue;
+    });
+
+    // Return those committed pages back to the system
+    if (uncommit_page_count > 0)
+        MM.uncommit_user_physical_pages(uncommit_page_count);
+}
+
+RefPtr<PhysicalPage> PurgeableVMObject::allocate_committed_page(size_t page_index)
+{
+    {
+        ScopedSpinLock lock(m_lock);
+
+        ASSERT(m_unused_committed_pages > 0);
+
+        // We shouldn't have any committed page tags in volatile regions
+        ASSERT([&]() {
+            for (auto* purgeable_ranges : m_purgeable_ranges) {
+                if (purgeable_ranges->is_volatile(page_index))
+                    return false;
+            }
+            return true;
+        }());
+
+        m_unused_committed_pages--;
+    }
+    return MM.allocate_committed_user_physical_page(MemoryManager::ShouldZeroFill::Yes);
+}
+
 }
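
The long comment added to add_volatile_range above spells out a lock-ordering
rule: both the volatile-range path and the lazy-allocation fault path take the
PurgeableVMObject's lock before the volatile-ranges lock. A stand-alone
illustration of that rule, using std::mutex in place of the kernel spinlocks
(names are illustrative):

#include <mutex>

struct PurgeableViewModel {
    std::mutex vmobject_lock; // stands in for PurgeableVMObject::m_lock
    std::mutex ranges_lock;   // stands in for m_volatile_ranges_lock

    void add_volatile_range()
    {
        std::lock_guard outer(vmobject_lock); // VMObject lock first...
        std::lock_guard inner(ranges_lock);   // ...then the ranges lock
        // mutate the ranges, then notify the VMObject (range_made_volatile)
    }

    void fault_allocate_committed_page()
    {
        std::lock_guard outer(vmobject_lock); // same order on the fault path,
        std::lock_guard inner(ranges_lock);   // so the two paths cannot deadlock
        // check is_volatile(), then hand out a committed page
    }
};

int main()
{
    PurgeableViewModel view;
    view.add_volatile_range();
    view.fault_allocate_committed_page();
    return 0;
}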

+ 24 - 3
Kernel/VM/PurgeableVMObject.h

@@ -136,6 +136,10 @@ public:
     }
 
     bool intersects(const VolatilePageRange&) const;
+    bool contains(size_t index) const
+    {
+        return intersects({ index, 1 });
+    }
 
     bool add(const VolatilePageRange&);
     bool remove(const VolatilePageRange&, bool&);
@@ -152,6 +156,7 @@ class PurgeableVMObject;
 
 class PurgeablePageRanges {
     friend class PurgeableVMObject;
+
 public:
     PurgeablePageRanges(const VMObject&);
 
@@ -168,23 +173,32 @@ public:
     bool add_volatile_range(const VolatilePageRange& range);
     bool remove_volatile_range(const VolatilePageRange& range, bool& was_purged);
     bool is_volatile_range(const VolatilePageRange& range) const;
+    bool is_volatile(size_t) const;
 
     bool is_empty() const { return m_volatile_ranges.is_empty(); }
 
     void set_was_purged(const VolatilePageRange&);
 
     const VolatilePageRanges& volatile_ranges() const { return m_volatile_ranges; }
+
 protected:
+    void set_vmobject(PurgeableVMObject*);
+
     VolatilePageRanges m_volatile_ranges;
-    mutable SpinLock<u8> m_volatile_ranges_lock;
+    mutable RecursiveSpinLock m_volatile_ranges_lock;
+    PurgeableVMObject* m_vmobject { nullptr };
 };
 
 class PurgeableVMObject final : public AnonymousVMObject {
+    friend class PurgeablePageRanges;
+
 public:
     virtual ~PurgeableVMObject() override;
 
-    static NonnullRefPtr<PurgeableVMObject> create_with_size(size_t);
-    virtual NonnullRefPtr<VMObject> clone() override;
+    static RefPtr<PurgeableVMObject> create_with_size(size_t);
+    virtual RefPtr<VMObject> clone() override;
+
+    virtual RefPtr<PhysicalPage> allocate_committed_page(size_t) override;
 
     void register_purgeable_page_ranges(PurgeablePageRanges&);
     void unregister_purgeable_page_ranges(PurgeablePageRanges&);
@@ -202,11 +216,13 @@ public:
         // volatile ranges that all share, because those are the only
         // pages we can actually purge
         for (auto* purgeable_range : m_purgeable_ranges) {
+            ScopedSpinLock purgeable_lock(purgeable_range->m_volatile_ranges_lock);
             for (auto& r1 : purgeable_range->volatile_ranges().ranges()) {
                 VolatilePageRange range(r1);
                 for (auto* purgeable_range2 : m_purgeable_ranges) {
                     if (purgeable_range2 == purgeable_range)
                         continue;
+                    ScopedSpinLock purgeable2_lock(purgeable_range2->m_volatile_ranges_lock);
                     if (purgeable_range2->is_empty()) {
                         // If just one doesn't allow any purging, we can
                         // immediately bail
@@ -230,6 +246,8 @@ public:
         return IterationDecision::Continue;
     }
 
+    size_t get_lazy_committed_page_count() const;
+
 private:
     explicit PurgeableVMObject(size_t);
     explicit PurgeableVMObject(const PurgeableVMObject&);
@@ -238,6 +256,8 @@ private:
 
     int purge_impl();
     void set_was_purged(const VolatilePageRange&);
+    size_t remove_lazy_commit_pages(const VolatilePageRange&);
+    void range_made_volatile(const VolatilePageRange&);
 
     PurgeableVMObject& operator=(const PurgeableVMObject&) = delete;
     PurgeableVMObject& operator=(PurgeableVMObject&&) = delete;
@@ -247,6 +267,7 @@ private:
 
     Vector<PurgeablePageRanges*> m_purgeable_ranges;
     mutable SpinLock<u8> m_lock;
+    size_t m_unused_committed_pages { 0 };
 };
 
 }

+ 38 - 22
Kernel/VM/Region.cpp

@@ -87,7 +87,7 @@ void Region::unregister_purgeable_page_ranges()
     }
 }
 
-NonnullOwnPtr<Region> Region::clone()
+OwnPtr<Region> Region::clone()
 {
     ASSERT(Process::current());
 
@@ -122,13 +122,17 @@ NonnullOwnPtr<Region> Region::clone()
     if (vmobject().is_inode())
         ASSERT(vmobject().is_private_inode());
 
+    auto vmobject_clone = m_vmobject->clone();
+    if (!vmobject_clone)
+        return {};
+
 #ifdef MM_DEBUG
     dbg() << "Region::clone(): CoWing " << name() << " (" << vaddr() << ")";
 #endif
     // Set up a COW region. The parent (this) region becomes COW as well!
     ensure_cow_map().fill(true);
     remap();
-    auto clone_region = Region::create_user_accessible(get_owner().ptr(), m_range, m_vmobject->clone(), m_offset_in_vmobject, m_name, m_access);
+    auto clone_region = Region::create_user_accessible(get_owner().ptr(), m_range, vmobject_clone.release_nonnull(), m_offset_in_vmobject, m_name, m_access);
     clone_region->set_purgeable_page_ranges(*this);
     clone_region->ensure_cow_map();
     if (m_stack) {
@@ -187,7 +191,7 @@ auto Region::set_volatile(VirtualAddress vaddr, size_t size, bool is_volatile, b
             // Attempt to remap the page range. We want to make sure we have
             // enough memory, if not we need to inform the caller of that
             // fact
-            if (!remap_page_range(first_page_index, last_page_index - first_page_index, true))
+            if (!remap_page_range(first_page_index, last_page_index - first_page_index))
                 return SetVolatileError::OutOfMemory;
         }
     }
@@ -224,10 +228,15 @@ bool Region::commit(size_t page_index)
     auto& vmobject_physical_page_entry = physical_page_slot(page_index);
     if (!vmobject_physical_page_entry.is_null() && !vmobject_physical_page_entry->is_shared_zero_page())
         return true;
-    auto physical_page = MM.allocate_user_physical_page(MemoryManager::ShouldZeroFill::Yes);
-    if (!physical_page) {
-        klog() << "MM: commit was unable to allocate a physical page";
-        return false;
+    RefPtr<PhysicalPage> physical_page;
+    if (vmobject_physical_page_entry->is_lazy_committed_page()) {
+        physical_page = static_cast<AnonymousVMObject&>(*m_vmobject).allocate_committed_page(page_index);
+    } else {
+        physical_page = MM.allocate_user_physical_page(MemoryManager::ShouldZeroFill::Yes);
+        if (!physical_page) {
+            klog() << "MM: commit was unable to allocate a physical page";
+            return false;
+        }
     }
     vmobject_physical_page_entry = move(physical_page);
     remap_page(page_index, false); // caller is in charge of flushing tlb
@@ -292,7 +301,7 @@ NonnullOwnPtr<Region> Region::create_kernel_only(const Range& range, NonnullRefP
 bool Region::should_cow(size_t page_index) const
 {
     auto* page = physical_page(page_index);
-    if (page && page->is_shared_zero_page())
+    if (page && (page->is_shared_zero_page() || page->is_lazy_committed_page()))
         return true;
     if (m_shared)
         return false;
@@ -344,7 +353,7 @@ bool Region::map_individual_page_impl(size_t page_index)
     return true;
 }
 
-bool Region::remap_page_range(size_t page_index, size_t page_count, bool do_commit)
+bool Region::remap_page_range(size_t page_index, size_t page_count)
 {
     bool success = true;
     ScopedSpinLock lock(s_mm_lock);
@@ -352,10 +361,6 @@ bool Region::remap_page_range(size_t page_index, size_t page_count, bool do_comm
     ScopedSpinLock page_lock(m_page_directory->get_lock());
     size_t index = page_index;
     while (index < page_index + page_count) {
-        if (do_commit && !commit(index)) {
-            success = false;
-            break;
-        }
         if (!map_individual_page_impl(index)) {
             success = false;
             break;
@@ -455,9 +460,16 @@ PageFaultResponse Region::handle_fault(const PageFault& fault)
 #endif
             return handle_inode_fault(page_index_in_region);
         }
+
+        auto& page_slot = physical_page_slot(page_index_in_region);
+        if (page_slot->is_lazy_committed_page()) {
+            page_slot = static_cast<AnonymousVMObject&>(*m_vmobject).allocate_committed_page(page_index_in_region);
+            remap_page(page_index_in_region);
+            return PageFaultResponse::Continue;
+        }
 #ifdef MAP_SHARED_ZERO_PAGE_LAZILY
         if (fault.is_read()) {
-            physical_page_slot(page_index_in_region) = MM.shared_zero_page();
+            page_slot = MM.shared_zero_page();
             remap_page(page_index_in_region);
             return PageFaultResponse::Continue;
         }
@@ -472,7 +484,8 @@ PageFaultResponse Region::handle_fault(const PageFault& fault)
 #ifdef PAGE_FAULT_DEBUG
         dbg() << "PV(cow) fault in Region{" << this << "}[" << page_index_in_region << "]";
 #endif
-        if (physical_page(page_index_in_region)->is_shared_zero_page()) {
+        auto* phys_page = physical_page(page_index_in_region);
+        if (phys_page->is_shared_zero_page() || phys_page->is_lazy_committed_page()) {
 #ifdef PAGE_FAULT_DEBUG
             dbg() << "NP(zero) fault in Region{" << this << "}[" << page_index_in_region << "]";
 #endif
@@ -493,7 +506,7 @@ PageFaultResponse Region::handle_zero_fault(size_t page_index_in_region)
 
     auto& page_slot = physical_page_slot(page_index_in_region);
 
-    if (!page_slot.is_null() && !page_slot->is_shared_zero_page()) {
+    if (!page_slot.is_null() && !page_slot->is_shared_zero_page() && !page_slot->is_lazy_committed_page()) {
 #ifdef PAGE_FAULT_DEBUG
         dbg() << "MM: zero_page() but page already present. Fine with me!";
 #endif
@@ -506,16 +519,19 @@ PageFaultResponse Region::handle_zero_fault(size_t page_index_in_region)
     if (current_thread != nullptr)
         current_thread->did_zero_fault();
 
-    auto page = MM.allocate_user_physical_page(MemoryManager::ShouldZeroFill::Yes);
-    if (page.is_null()) {
-        klog() << "MM: handle_zero_fault was unable to allocate a physical page";
-        return PageFaultResponse::OutOfMemory;
+    if (page_slot->is_lazy_committed_page()) {
+        page_slot = static_cast<AnonymousVMObject&>(*m_vmobject).allocate_committed_page(page_index_in_region);
+    } else {
+        page_slot = MM.allocate_user_physical_page(MemoryManager::ShouldZeroFill::Yes);
+        if (page_slot.is_null()) {
+            klog() << "MM: handle_zero_fault was unable to allocate a physical page";
+            return PageFaultResponse::OutOfMemory;
+        }
     }
 
 #ifdef PAGE_FAULT_DEBUG
-    dbg() << "      >> ZERO " << page->paddr();
+    dbg() << "      >> ZERO " << page_slot->paddr();
 #endif
-    page_slot = move(page);
     if (!remap_page(page_index_in_region)) {
         klog() << "MM: handle_zero_fault was unable to allocate a page table to map " << page_slot;
         return PageFaultResponse::OutOfMemory;
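
In handle_fault and handle_zero_fault above, a slot tagged as lazy-committed
is served from the reservation (which cannot fail), while any other empty
slot still goes through the ordinary allocator and can end in OutOfMemory.
A stand-alone sketch of that decision (illustrative placeholders, not the
kernel code):

#include <cstdio>

enum class Slot { LazyCommitted, SharedZero };
enum class PageFaultResponse { Continue, OutOfMemory };

// Placeholders standing in for the two MemoryManager allocation paths.
static bool allocate_committed_page() { return true; }      // backed by an earlier commit
static bool allocate_user_physical_page() { return false; } // fails here to show the OOM path

static PageFaultResponse handle_zero_fault(Slot slot)
{
    if (slot == Slot::LazyCommitted) {
        // Reserved up front, so this path never runs out of pages.
        allocate_committed_page();
        return PageFaultResponse::Continue;
    }
    if (!allocate_user_physical_page())
        return PageFaultResponse::OutOfMemory;
    return PageFaultResponse::Continue;
}

int main()
{
    std::printf("lazy-committed slot -> %s\n",
        handle_zero_fault(Slot::LazyCommitted) == PageFaultResponse::Continue ? "Continue" : "OutOfMemory");
    std::printf("shared-zero slot    -> %s\n",
        handle_zero_fault(Slot::SharedZero) == PageFaultResponse::Continue ? "Continue" : "OutOfMemory");
    return 0;
}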

+ 2 - 2
Kernel/VM/Region.h

@@ -105,7 +105,7 @@ public:
 
     PageFaultResponse handle_fault(const PageFault&);
 
-    NonnullOwnPtr<Region> clone();
+    OwnPtr<Region> clone();
 
     bool contains(VirtualAddress vaddr) const
     {
@@ -194,7 +194,7 @@ public:
 
     void set_inherit_mode(InheritMode inherit_mode) { m_inherit_mode = inherit_mode; }
 
-    bool remap_page_range(size_t page_index, size_t page_count, bool do_commit);
+    bool remap_page_range(size_t page_index, size_t page_count);
 
     bool is_volatile(VirtualAddress vaddr, size_t size) const;
     enum class SetVolatileError {

+ 1 - 1
Kernel/VM/SharedInodeVMObject.cpp

@@ -41,7 +41,7 @@ NonnullRefPtr<SharedInodeVMObject> SharedInodeVMObject::create_with_inode(Inode&
     return vmobject;
 }
 
-NonnullRefPtr<VMObject> SharedInodeVMObject::clone()
+RefPtr<VMObject> SharedInodeVMObject::clone()
 {
     return adopt(*new SharedInodeVMObject(*this));
 }

+ 1 - 1
Kernel/VM/SharedInodeVMObject.h

@@ -37,7 +37,7 @@ class SharedInodeVMObject final : public InodeVMObject {
 
 public:
     static NonnullRefPtr<SharedInodeVMObject> create_with_inode(Inode&);
-    virtual NonnullRefPtr<VMObject> clone() override;
+    virtual RefPtr<VMObject> clone() override;
 
 private:
     virtual bool is_shared_inode() const override { return true; }

+ 1 - 1
Kernel/VM/VMObject.h

@@ -47,7 +47,7 @@ class VMObject : public RefCounted<VMObject>
 public:
     virtual ~VMObject();
 
-    virtual NonnullRefPtr<VMObject> clone() = 0;
+    virtual RefPtr<VMObject> clone() = 0;
 
     virtual bool is_anonymous() const { return false; }
     virtual bool is_purgeable() const { return false; }