aboutsummaryrefslogtreecommitdiff
path: root/mm/huge_memory.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--mm/huge_memory.c134
1 files changed, 97 insertions, 37 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 078832cf3636..9671f51e954d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -23,6 +23,7 @@
#include <linux/pagemap.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>
+#include <linux/userfaultfd_k.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -717,7 +718,8 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long haddr, pmd_t *pmd,
- struct page *page, gfp_t gfp)
+ struct page *page, gfp_t gfp,
+ unsigned int flags)
{
struct mem_cgroup *memcg;
pgtable_t pgtable;
@@ -725,12 +727,16 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
VM_BUG_ON_PAGE(!PageCompound(page), page);
- if (mem_cgroup_try_charge(page, mm, gfp, &memcg))
- return VM_FAULT_OOM;
+ if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) {
+ put_page(page);
+ count_vm_event(THP_FAULT_FALLBACK);
+ return VM_FAULT_FALLBACK;
+ }
pgtable = pte_alloc_one(mm, haddr);
if (unlikely(!pgtable)) {
mem_cgroup_cancel_charge(page, memcg);
+ put_page(page);
return VM_FAULT_OOM;
}
@@ -750,6 +756,21 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
pte_free(mm, pgtable);
} else {
pmd_t entry;
+
+ /* Deliver the page fault to userland */
+ if (userfaultfd_missing(vma)) {
+ int ret;
+
+ spin_unlock(ptl);
+ mem_cgroup_cancel_charge(page, memcg);
+ put_page(page);
+ pte_free(mm, pgtable);
+ ret = handle_userfault(vma, haddr, flags,
+ VM_UFFD_MISSING);
+ VM_BUG_ON(ret & VM_FAULT_FALLBACK);
+ return ret;
+ }
+
entry = mk_huge_pmd(page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
page_add_new_anon_rmap(page, vma, haddr);
@@ -760,6 +781,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
atomic_long_inc(&mm->nr_ptes);
spin_unlock(ptl);
+ count_vm_event(THP_FAULT_ALLOC);
}
return 0;
@@ -771,19 +793,16 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
}
/* Caller must hold page table lock. */
-static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
+static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
struct page *zero_page)
{
pmd_t entry;
- if (!pmd_none(*pmd))
- return false;
entry = mk_pmd(zero_page, vma->vm_page_prot);
entry = pmd_mkhuge(entry);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
atomic_long_inc(&mm->nr_ptes);
- return true;
}
int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -806,6 +825,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
pgtable_t pgtable;
struct page *zero_page;
bool set;
+ int ret;
pgtable = pte_alloc_one(mm, haddr);
if (unlikely(!pgtable))
return VM_FAULT_OOM;
@@ -816,14 +836,28 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_FALLBACK;
}
ptl = pmd_lock(mm, pmd);
- set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
- zero_page);
- spin_unlock(ptl);
+ ret = 0;
+ set = false;
+ if (pmd_none(*pmd)) {
+ if (userfaultfd_missing(vma)) {
+ spin_unlock(ptl);
+ ret = handle_userfault(vma, haddr, flags,
+ VM_UFFD_MISSING);
+ VM_BUG_ON(ret & VM_FAULT_FALLBACK);
+ } else {
+ set_huge_zero_page(pgtable, mm, vma,
+ haddr, pmd,
+ zero_page);
+ spin_unlock(ptl);
+ set = true;
+ }
+ } else
+ spin_unlock(ptl);
if (!set) {
pte_free(mm, pgtable);
put_huge_zero_page();
}
- return 0;
+ return ret;
}
gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
@@ -831,14 +865,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
- if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) {
- put_page(page);
- count_vm_event(THP_FAULT_FALLBACK);
- return VM_FAULT_FALLBACK;
- }
-
- count_vm_event(THP_FAULT_ALLOC);
- return 0;
+ return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp, flags);
}
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -873,16 +900,14 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
*/
if (is_huge_zero_pmd(pmd)) {
struct page *zero_page;
- bool set;
/*
* get_huge_zero_page() will never allocate a new page here,
* since we already have a zero page to copy. It just takes a
* reference.
*/
zero_page = get_huge_zero_page();
- set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
+ set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
zero_page);
- BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */
ret = 0;
goto out_unlock;
}
@@ -1031,7 +1056,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
goto out_free_pages;
VM_BUG_ON_PAGE(!PageHead(page), page);
- pmdp_clear_flush_notify(vma, haddr, pmd);
+ pmdp_huge_clear_flush_notify(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
@@ -1174,7 +1199,7 @@ alloc:
pmd_t entry;
entry = mk_huge_pmd(new_page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- pmdp_clear_flush_notify(vma, haddr, pmd);
+ pmdp_huge_clear_flush_notify(vma, haddr, pmd);
page_add_new_anon_rmap(new_page, vma, haddr);
mem_cgroup_commit_charge(new_page, memcg, false);
lru_cache_add_active_or_unevictable(new_page, vma);
@@ -1384,6 +1409,36 @@ out:
return 0;
}
+int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ pmd_t *pmd, unsigned long addr)
+
+{
+ spinlock_t *ptl;
+ struct mm_struct *mm = tlb->mm;
+ int ret = 1;
+
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ struct page *page;
+ pmd_t orig_pmd;
+
+ orig_pmd = pmdp_huge_get_and_clear(mm, addr, pmd);
+
+ /* No hugepage in swapcache */
+ page = pmd_page(orig_pmd);
+ VM_BUG_ON_PAGE(PageSwapCache(page), page);
+
+ orig_pmd = pmd_mkold(orig_pmd);
+ orig_pmd = pmd_mkclean(orig_pmd);
+
+ set_pmd_at(mm, addr, pmd, orig_pmd);
+ tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+ spin_unlock(ptl);
+ ret = 0;
+ }
+
+ return ret;
+}
+
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr)
{
@@ -1396,12 +1451,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t orig_pmd;
/*
* For architectures like ppc64 we look at deposited pgtable
- * when calling pmdp_get_and_clear. So do the
+ * when calling pmdp_huge_get_and_clear. So do the
* pgtable_trans_huge_withdraw after finishing pmdp related
* operations.
*/
- orig_pmd = pmdp_get_and_clear_full(tlb->mm, addr, pmd,
- tlb->fullmm);
+ orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
+ tlb->fullmm);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
if (is_huge_zero_pmd(orig_pmd)) {
@@ -1459,7 +1514,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
new_ptl = pmd_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
- pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
+ pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
if (pmd_move_must_withdraw(new_ptl, old_ptl)) {
@@ -1505,7 +1560,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
}
if (!prot_numa || !pmd_protnone(*pmd)) {
- entry = pmdp_get_and_clear_notify(mm, addr, pmd);
+ entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd);
entry = pmd_modify(entry, newprot);
if (preserve_write)
entry = pmd_mkwrite(entry);
@@ -1599,6 +1654,11 @@ unlock:
return NULL;
}
+int pmd_freeable(pmd_t pmd)
+{
+ return !pmd_dirty(pmd);
+}
+
static int __split_huge_page_splitting(struct page *page,
struct vm_area_struct *vma,
unsigned long address)
@@ -1710,7 +1770,7 @@ static void __split_huge_page_refcount(struct page *page,
*/
page_tail->_mapcount = page->_mapcount;
- BUG_ON(page_tail->mapping);
+ BUG_ON(page_tail->mapping != TAIL_MAPPING);
page_tail->mapping = page->mapping;
page_tail->index = page->index + i;
@@ -2138,7 +2198,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
_pte++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
- if (++none_or_zero <= khugepaged_max_ptes_none)
+ if (!userfaultfd_armed(vma) &&
+ ++none_or_zero <= khugepaged_max_ptes_none)
continue;
else
goto out;
@@ -2499,7 +2560,7 @@ static void collapse_huge_page(struct mm_struct *mm,
* huge and small TLB entries for the same virtual address
* to avoid the risk of CPU bugs in that area.
*/
- _pmd = pmdp_clear_flush(vma, address, pmd);
+ _pmd = pmdp_collapse_flush(vma, address, pmd);
spin_unlock(pmd_ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
@@ -2591,7 +2652,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
_pte++, _address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
- if (++none_or_zero <= khugepaged_max_ptes_none)
+ if (!userfaultfd_armed(vma) &&
+ ++none_or_zero <= khugepaged_max_ptes_none)
continue;
else
goto out_unmap;
@@ -2799,7 +2861,7 @@ static void khugepaged_do_scan(void)
cond_resched();
- if (unlikely(kthread_should_stop() || freezing(current)))
+ if (unlikely(kthread_should_stop() || try_to_freeze()))
break;
spin_lock(&khugepaged_mm_lock);
@@ -2820,8 +2882,6 @@ static void khugepaged_do_scan(void)
static void khugepaged_wait_work(void)
{
- try_to_freeze();
-
if (khugepaged_has_work()) {
if (!khugepaged_scan_sleep_millisecs)
return;
@@ -2865,7 +2925,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
pmd_t _pmd;
int i;
- pmdp_clear_flush_notify(vma, haddr, pmd);
+ pmdp_huge_clear_flush_notify(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
pgtable = pgtable_trans_huge_withdraw(mm, pmd);