Diffstat (limited to 'mm/huge_memory.c')
 mm/huge_memory.c | 134 +++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 97 insertions(+), 37 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 078832cf3636..9671f51e954d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -23,6 +23,7 @@
 #include <linux/pagemap.h>
 #include <linux/migrate.h>
 #include <linux/hashtable.h>
+#include <linux/userfaultfd_k.h>
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
@@ -717,7 +718,8 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
 static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 					struct vm_area_struct *vma,
 					unsigned long haddr, pmd_t *pmd,
-					struct page *page, gfp_t gfp)
+					struct page *page, gfp_t gfp,
+					unsigned int flags)
 {
 	struct mem_cgroup *memcg;
 	pgtable_t pgtable;
@@ -725,12 +727,16 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 
 	VM_BUG_ON_PAGE(!PageCompound(page), page);
 
-	if (mem_cgroup_try_charge(page, mm, gfp, &memcg))
-		return VM_FAULT_OOM;
+	if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) {
+		put_page(page);
+		count_vm_event(THP_FAULT_FALLBACK);
+		return VM_FAULT_FALLBACK;
+	}
 
 	pgtable = pte_alloc_one(mm, haddr);
 	if (unlikely(!pgtable)) {
 		mem_cgroup_cancel_charge(page, memcg);
+		put_page(page);
 		return VM_FAULT_OOM;
 	}
@@ -750,6 +756,21 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 		pte_free(mm, pgtable);
 	} else {
 		pmd_t entry;
+
+		/* Deliver the page fault to userland */
+		if (userfaultfd_missing(vma)) {
+			int ret;
+
+			spin_unlock(ptl);
+			mem_cgroup_cancel_charge(page, memcg);
+			put_page(page);
+			pte_free(mm, pgtable);
+			ret = handle_userfault(vma, haddr, flags,
+					       VM_UFFD_MISSING);
+			VM_BUG_ON(ret & VM_FAULT_FALLBACK);
+			return ret;
+		}
+
 		entry = mk_huge_pmd(page, vma->vm_page_prot);
 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 		page_add_new_anon_rmap(page, vma, haddr);
@@ -760,6 +781,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 		add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
 		atomic_long_inc(&mm->nr_ptes);
 		spin_unlock(ptl);
+		count_vm_event(THP_FAULT_ALLOC);
 	}
 
 	return 0;
@@ -771,19 +793,16 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
 }
 
 /* Caller must hold page table lock. */
-static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
+static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
 		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
 		struct page *zero_page)
 {
 	pmd_t entry;
-	if (!pmd_none(*pmd))
-		return false;
 	entry = mk_pmd(zero_page, vma->vm_page_prot);
 	entry = pmd_mkhuge(entry);
 	pgtable_trans_huge_deposit(mm, pmd, pgtable);
 	set_pmd_at(mm, haddr, pmd, entry);
 	atomic_long_inc(&mm->nr_ptes);
-	return true;
 }
 
 int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -806,6 +825,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		pgtable_t pgtable;
 		struct page *zero_page;
 		bool set;
+		int ret;
 		pgtable = pte_alloc_one(mm, haddr);
 		if (unlikely(!pgtable))
 			return VM_FAULT_OOM;
@@ -816,14 +836,28 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			return VM_FAULT_FALLBACK;
 		}
 		ptl = pmd_lock(mm, pmd);
-		set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
-				zero_page);
-		spin_unlock(ptl);
+		ret = 0;
+		set = false;
+		if (pmd_none(*pmd)) {
+			if (userfaultfd_missing(vma)) {
+				spin_unlock(ptl);
+				ret = handle_userfault(vma, haddr, flags,
+						       VM_UFFD_MISSING);
+				VM_BUG_ON(ret & VM_FAULT_FALLBACK);
+			} else {
+				set_huge_zero_page(pgtable, mm, vma,
+						   haddr, pmd,
+						   zero_page);
+				spin_unlock(ptl);
+				set = true;
+			}
+		} else
+			spin_unlock(ptl);
 		if (!set) {
 			pte_free(mm, pgtable);
 			put_huge_zero_page();
 		}
-		return 0;
+		return ret;
 	}
 	gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
 	page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
@@ -831,14 +865,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		count_vm_event(THP_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
 	}
-	if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) {
-		put_page(page);
-		count_vm_event(THP_FAULT_FALLBACK);
-		return VM_FAULT_FALLBACK;
-	}
-
-	count_vm_event(THP_FAULT_ALLOC);
-	return 0;
+	return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp, flags);
 }
 
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -873,16 +900,14 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	 */
 	if (is_huge_zero_pmd(pmd)) {
 		struct page *zero_page;
-		bool set;
 		/*
 		 * get_huge_zero_page() will never allocate a new page here,
 		 * since we already have a zero page to copy. It just takes a
 		 * reference.
 		 */
 		zero_page = get_huge_zero_page();
-		set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
+		set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
 				zero_page);
-		BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */
 		ret = 0;
 		goto out_unlock;
 	}
@@ -1031,7 +1056,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 		goto out_free_pages;
 	VM_BUG_ON_PAGE(!PageHead(page), page);
 
-	pmdp_clear_flush_notify(vma, haddr, pmd);
+	pmdp_huge_clear_flush_notify(vma, haddr, pmd);
 	/* leave pmd empty until pte is filled */
 
 	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
@@ -1174,7 +1199,7 @@ alloc:
 		pmd_t entry;
 		entry = mk_huge_pmd(new_page, vma->vm_page_prot);
 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
-		pmdp_clear_flush_notify(vma, haddr, pmd);
+		pmdp_huge_clear_flush_notify(vma, haddr, pmd);
 		page_add_new_anon_rmap(new_page, vma, haddr);
 		mem_cgroup_commit_charge(new_page, memcg, false);
 		lru_cache_add_active_or_unevictable(new_page, vma);
@@ -1384,6 +1409,36 @@ out:
 	return 0;
 }
 
+int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+		pmd_t *pmd, unsigned long addr)
+
+{
+	spinlock_t *ptl;
+	struct mm_struct *mm = tlb->mm;
+	int ret = 1;
+
+	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+		struct page *page;
+		pmd_t orig_pmd;
+
+		orig_pmd = pmdp_huge_get_and_clear(mm, addr, pmd);
+
+		/* No hugepage in swapcache */
+		page = pmd_page(orig_pmd);
+		VM_BUG_ON_PAGE(PageSwapCache(page), page);
+
+		orig_pmd = pmd_mkold(orig_pmd);
+		orig_pmd = pmd_mkclean(orig_pmd);
+
+		set_pmd_at(mm, addr, pmd, orig_pmd);
+		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+		spin_unlock(ptl);
+		ret = 0;
+	}
+
+	return ret;
+}
+
 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		 pmd_t *pmd, unsigned long addr)
 {
@@ -1396,12 +1451,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		pmd_t orig_pmd;
 		/*
 		 * For architectures like ppc64 we look at deposited pgtable
-		 * when calling pmdp_get_and_clear. So do the
+		 * when calling pmdp_huge_get_and_clear. So do the
 		 * pgtable_trans_huge_withdraw after finishing pmdp related
 		 * operations.
 		 */
-		orig_pmd = pmdp_get_and_clear_full(tlb->mm, addr, pmd,
-						   tlb->fullmm);
+		orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
+							tlb->fullmm);
 		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
 		pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
 		if (is_huge_zero_pmd(orig_pmd)) {
@@ -1459,7 +1514,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
 		new_ptl = pmd_lockptr(mm, new_pmd);
 		if (new_ptl != old_ptl)
 			spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
-		pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
+		pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
 		VM_BUG_ON(!pmd_none(*new_pmd));
 
 		if (pmd_move_must_withdraw(new_ptl, old_ptl)) {
@@ -1505,7 +1560,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		}
 
 		if (!prot_numa || !pmd_protnone(*pmd)) {
-			entry = pmdp_get_and_clear_notify(mm, addr, pmd);
+			entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd);
 			entry = pmd_modify(entry, newprot);
 			if (preserve_write)
 				entry = pmd_mkwrite(entry);
@@ -1599,6 +1654,11 @@ unlock:
 	return NULL;
 }
 
+int pmd_freeable(pmd_t pmd)
+{
+	return !pmd_dirty(pmd);
+}
+
 static int __split_huge_page_splitting(struct page *page,
 				       struct vm_area_struct *vma,
 				       unsigned long address)
@@ -1710,7 +1770,7 @@ static void __split_huge_page_refcount(struct page *page,
 		 */
 		page_tail->_mapcount = page->_mapcount;
 
-		BUG_ON(page_tail->mapping);
+		BUG_ON(page_tail->mapping != TAIL_MAPPING);
 		page_tail->mapping = page->mapping;
 
 		page_tail->index = page->index + i;
@@ -2138,7 +2198,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 	     _pte++, address += PAGE_SIZE) {
 		pte_t pteval = *_pte;
 		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
-			if (++none_or_zero <= khugepaged_max_ptes_none)
+			if (!userfaultfd_armed(vma) &&
+			    ++none_or_zero <= khugepaged_max_ptes_none)
 				continue;
 			else
 				goto out;
@@ -2499,7 +2560,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	 * huge and small TLB entries for the same virtual address
 	 * to avoid the risk of CPU bugs in that area.
 	 */
-	_pmd = pmdp_clear_flush(vma, address, pmd);
+	_pmd = pmdp_collapse_flush(vma, address, pmd);
 	spin_unlock(pmd_ptl);
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
@@ -2591,7 +2652,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 	     _pte++, _address += PAGE_SIZE) {
 		pte_t pteval = *_pte;
 		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
-			if (++none_or_zero <= khugepaged_max_ptes_none)
+			if (!userfaultfd_armed(vma) &&
+			    ++none_or_zero <= khugepaged_max_ptes_none)
 				continue;
 			else
 				goto out_unmap;
@@ -2799,7 +2861,7 @@ static void khugepaged_do_scan(void)
 
 		cond_resched();
 
-		if (unlikely(kthread_should_stop() || freezing(current)))
+		if (unlikely(kthread_should_stop() || try_to_freeze()))
 			break;
 
 		spin_lock(&khugepaged_mm_lock);
@@ -2820,8 +2882,6 @@ static void khugepaged_do_scan(void)
 
 static void khugepaged_wait_work(void)
 {
-	try_to_freeze();
-
 	if (khugepaged_has_work()) {
 		if (!khugepaged_scan_sleep_millisecs)
 			return;
@@ -2865,7 +2925,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 	pmd_t _pmd;
 	int i;
 
-	pmdp_clear_flush_notify(vma, haddr, pmd);
+	pmdp_huge_clear_flush_notify(vma, haddr, pmd);
 	/* leave pmd empty until pte is filled */
 
 	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
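Note: the userfaultfd_missing()/handle_userfault() hunks above are the kernel half of the
userfaultfd(2) protocol: when a not-yet-populated huge pmd faults inside a registered VMA,
the fault is delivered to userland instead of being filled with a THP or the huge zero page.
Below is a minimal sketch of the userland half, assuming a kernel with these patches applied
and __NR_userfaultfd defined; error handling is elided and the pmd-sized region is only
illustrative. The fault must come from a second thread, since the faulting thread blocks in
handle_userfault() until the fault is resolved. Build with -pthread.

	#include <fcntl.h>
	#include <linux/userfaultfd.h>
	#include <poll.h>
	#include <pthread.h>
	#include <stdint.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#define LEN (2UL << 20)		/* one pmd-sized (2MB on x86) region */

	static char *area;

	static void *toucher(void *arg)
	{
		/* first access faults; the kernel parks us in handle_userfault() */
		return (void *)(long)area[0];
	}

	int main(void)
	{
		long uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
		struct uffdio_api api = { .api = UFFD_API };
		struct uffdio_register reg;
		struct uffd_msg msg;
		struct pollfd pfd = { .events = POLLIN };
		pthread_t t;

		ioctl(uffd, UFFDIO_API, &api);

		area = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
			    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		/* arm the VMA: sets VM_UFFD_MISSING, which userfaultfd_missing() tests */
		reg.range.start = (uintptr_t)area;
		reg.range.len = LEN;
		reg.mode = UFFDIO_REGISTER_MODE_MISSING;
		ioctl(uffd, UFFDIO_REGISTER, &reg);

		pthread_create(&t, NULL, toucher, NULL);

		/* wait for the VM_UFFD_MISSING fault event */
		pfd.fd = uffd;
		poll(&pfd, 1, -1);
		read(uffd, &msg, sizeof(msg));

		if (msg.event == UFFD_EVENT_PAGEFAULT) {
			/* resolve the fault by copying one base page into place */
			static char page[4096] __attribute__((aligned(4096)));
			struct uffdio_copy copy;

			memset(page, 0x5a, sizeof(page));
			copy.dst = msg.arg.pagefault.address & ~0xfffUL;
			copy.src = (uintptr_t)page;
			copy.len = 4096;
			copy.mode = 0;
			ioctl(uffd, UFFDIO_COPY, &copy);
		}

		pthread_join(t, NULL);
		return 0;
	}

A single UFFDIO_COPY of one base page is enough to wake the faulting thread; this is also
why khugepaged must not collapse around armed VMAs (the userfaultfd_armed() checks above),
as collapsing pte_none holes would fill ranges the monitor expects to see faults for.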
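Note: madvise_free_huge_pmd() above is the THP leg of lazy freeing via madvise(MADV_FREE):
the pmd is cleared and re-inserted clean and old, so reclaim may discard the hugepage
rather than swap it out, while a later write simply re-dirties it and keeps the memory.
A userland sketch of the intended usage follows; MADV_FREE was not yet in mainline uapi
headers at this point, so the fallback value 8 (matching asm-generic/mman-common.h) is an
assumption for illustration.

	#include <string.h>
	#include <sys/mman.h>

	#ifndef MADV_FREE
	#define MADV_FREE 8	/* assumed value; check your kernel headers */
	#endif

	#define LEN (2UL << 20)

	int main(void)
	{
		char *buf = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		memset(buf, 1, LEN);	/* dirty the whole (potential) hugepage */

		/*
		 * Mark the range lazily freeable: under memory pressure the
		 * clean+old hugepage can be dropped without swap I/O. Reading
		 * a reclaimed page later returns zeroes, not the old data.
		 */
		madvise(buf, LEN, MADV_FREE);

		buf[0] = 2;	/* a write re-dirties that page and cancels the hint for it */
		return munmap(buf, LEN);
	}

This is the pattern malloc-style allocators use on free(): cheaper than MADV_DONTNEED
because nothing is unmapped and no fault is taken if the memory is reused before reclaim.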