author     Anders Roxell <anders.roxell@linaro.org>    2014-07-08 14:40:42 +0200
committer  Anders Roxell <anders.roxell@linaro.org>    2014-07-08 14:54:45 +0200
commit     d7789ba79b1d081932a2da6d05f78320a002505a (patch)
tree       1462027856037610434f3911017a1d22113bf771 /mm
parent     83f12c5cb20669c30ac711f28f09d7f464124d00 (diff)
parent     bbae7add628cfe96a1facd578dd1eddcd1030de7 (diff)
Merge tag 'v3.14.10' into v3.14-rt
This is the 3.14.10 stable release

Signed-off-by: Anders Roxell <anders.roxell@linaro.org>

Conflicts:
	include/linux/irqdesc.h
	include/linux/thread_info.h
	kernel/irq/manage.c
	kernel/locking/rtmutex.c
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |  3
-rw-r--r--  mm/compaction.c      | 22
-rw-r--r--  mm/huge_memory.c     | 13
-rw-r--r--  mm/hugetlb.c         |  1
-rw-r--r--  mm/memory-failure.c  | 90
-rw-r--r--  mm/memory.c          |  5
-rw-r--r--  mm/mempolicy.c       |  6
-rw-r--r--  mm/mremap.c          |  9
-rw-r--r--  mm/page-writeback.c  | 17
-rw-r--r--  mm/page_alloc.c      | 52
-rw-r--r--  mm/percpu.c          |  2
-rw-r--r--  mm/rmap.c            | 11
-rw-r--r--  mm/vmscan.c          | 46
13 files changed, 195 insertions, 82 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 64007ac64526..fc9125819598 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -263,6 +263,9 @@ config MIGRATION
pages as migration can relocate pages to satisfy a huge page
allocation instead of reclaiming.
+config ARCH_ENABLE_HUGEPAGE_MIGRATION
+ boolean
+
config PHYS_ADDR_T_64BIT
def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
diff --git a/mm/compaction.c b/mm/compaction.c
index 918577595ea8..5f702ef0a65f 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -666,16 +666,20 @@ static void isolate_freepages(struct zone *zone,
struct compact_control *cc)
{
struct page *page;
- unsigned long high_pfn, low_pfn, pfn, z_end_pfn, end_pfn;
+ unsigned long high_pfn, low_pfn, pfn, z_end_pfn;
int nr_freepages = cc->nr_freepages;
struct list_head *freelist = &cc->freepages;
/*
* Initialise the free scanner. The starting point is where we last
- * scanned from (or the end of the zone if starting). The low point
- * is the end of the pageblock the migration scanner is using.
+ * successfully isolated from, zone-cached value, or the end of the
+ * zone when isolating for the first time. We need this aligned to
+ * the pageblock boundary, because we do pfn -= pageblock_nr_pages
+ * in the for loop.
+ * The low boundary is the end of the pageblock the migration scanner
+ * is using.
*/
- pfn = cc->free_pfn;
+ pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
/*
@@ -695,6 +699,7 @@ static void isolate_freepages(struct zone *zone,
for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
pfn -= pageblock_nr_pages) {
unsigned long isolated;
+ unsigned long end_pfn;
/*
* This can iterate a massively long zone without finding any
@@ -729,13 +734,10 @@ static void isolate_freepages(struct zone *zone,
isolated = 0;
/*
- * As pfn may not start aligned, pfn+pageblock_nr_page
- * may cross a MAX_ORDER_NR_PAGES boundary and miss
- * a pfn_valid check. Ensure isolate_freepages_block()
- * only scans within a pageblock
+ * Take care when isolating in last pageblock of a zone which
+ * ends in the middle of a pageblock.
*/
- end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
- end_pfn = min(end_pfn, z_end_pfn);
+ end_pfn = min(pfn + pageblock_nr_pages, z_end_pfn);
isolated = isolate_freepages_block(cc, pfn, end_pfn,
freelist, false);
nr_freepages += isolated;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1546655a2d78..1c42d0c36d0b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1611,16 +1611,23 @@ pmd_t *page_check_address_pmd(struct page *page,
enum page_check_address_pmd_flag flag,
spinlock_t **ptl)
{
+ pgd_t *pgd;
+ pud_t *pud;
pmd_t *pmd;
if (address & ~HPAGE_PMD_MASK)
return NULL;
- pmd = mm_find_pmd(mm, address);
- if (!pmd)
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
return NULL;
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ return NULL;
+ pmd = pmd_offset(pud, address);
+
*ptl = pmd_lock(mm, pmd);
- if (pmd_none(*pmd))
+ if (!pmd_present(*pmd))
goto unlock;
if (pmd_page(*pmd) != page)
goto unlock;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2de3c845f03a..06a9bc0a3120 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1134,6 +1134,7 @@ static void return_unused_surplus_pages(struct hstate *h,
while (nr_pages--) {
if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
break;
+ cond_resched_lock(&hugetlb_lock);
}
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 90002ea43638..33365e9ce6a7 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -208,9 +208,9 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
#endif
si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
- if ((flags & MF_ACTION_REQUIRED) && t == current) {
+ if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
si.si_code = BUS_MCEERR_AR;
- ret = force_sig_info(SIGBUS, &si, t);
+ ret = force_sig_info(SIGBUS, &si, current);
} else {
/*
* Don't use force here, it's convenient if the signal
@@ -384,20 +384,51 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
}
}
-static int task_early_kill(struct task_struct *tsk)
+/*
+ * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
+ * on behalf of the thread group. Return task_struct of the (first found)
+ * dedicated thread if found, and return NULL otherwise.
+ *
+ * We already hold read_lock(&tasklist_lock) in the caller, so we don't
+ * have to call rcu_read_lock/unlock() in this function.
+ */
+static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
{
+ struct task_struct *t;
+
+ for_each_thread(tsk, t)
+ if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
+ return t;
+ return NULL;
+}
+
+/*
+ * Determine whether a given process is "early kill" process which expects
+ * to be signaled when some page under the process is hwpoisoned.
+ * Return task_struct of the dedicated thread (main thread unless explicitly
+ * specified) if the process is "early kill," and otherwise returns NULL.
+ */
+static struct task_struct *task_early_kill(struct task_struct *tsk,
+ int force_early)
+{
+ struct task_struct *t;
if (!tsk->mm)
- return 0;
- if (tsk->flags & PF_MCE_PROCESS)
- return !!(tsk->flags & PF_MCE_EARLY);
- return sysctl_memory_failure_early_kill;
+ return NULL;
+ if (force_early)
+ return tsk;
+ t = find_early_kill_thread(tsk);
+ if (t)
+ return t;
+ if (sysctl_memory_failure_early_kill)
+ return tsk;
+ return NULL;
}
/*
* Collect processes when the error hit an anonymous page.
*/
static void collect_procs_anon(struct page *page, struct list_head *to_kill,
- struct to_kill **tkc)
+ struct to_kill **tkc, int force_early)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
@@ -412,16 +443,17 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
read_lock(&tasklist_lock);
for_each_process (tsk) {
struct anon_vma_chain *vmac;
+ struct task_struct *t = task_early_kill(tsk, force_early);
- if (!task_early_kill(tsk))
+ if (!t)
continue;
anon_vma_interval_tree_foreach(vmac, &av->rb_root,
pgoff, pgoff) {
vma = vmac->vma;
if (!page_mapped_in_vma(page, vma))
continue;
- if (vma->vm_mm == tsk->mm)
- add_to_kill(tsk, page, vma, to_kill, tkc);
+ if (vma->vm_mm == t->mm)
+ add_to_kill(t, page, vma, to_kill, tkc);
}
}
read_unlock(&tasklist_lock);
@@ -432,7 +464,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
* Collect processes when the error hit a file mapped page.
*/
static void collect_procs_file(struct page *page, struct list_head *to_kill,
- struct to_kill **tkc)
+ struct to_kill **tkc, int force_early)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
@@ -442,10 +474,10 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
read_lock(&tasklist_lock);
for_each_process(tsk) {
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ struct task_struct *t = task_early_kill(tsk, force_early);
- if (!task_early_kill(tsk))
+ if (!t)
continue;
-
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
pgoff) {
/*
@@ -455,8 +487,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
* Assume applications who requested early kill want
* to be informed of all such data corruptions.
*/
- if (vma->vm_mm == tsk->mm)
- add_to_kill(tsk, page, vma, to_kill, tkc);
+ if (vma->vm_mm == t->mm)
+ add_to_kill(t, page, vma, to_kill, tkc);
}
}
read_unlock(&tasklist_lock);
@@ -469,7 +501,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
* First preallocate one tokill structure outside the spin locks,
* so that we can kill at least one process reasonably reliable.
*/
-static void collect_procs(struct page *page, struct list_head *tokill)
+static void collect_procs(struct page *page, struct list_head *tokill,
+ int force_early)
{
struct to_kill *tk;
@@ -480,9 +513,9 @@ static void collect_procs(struct page *page, struct list_head *tokill)
if (!tk)
return;
if (PageAnon(page))
- collect_procs_anon(page, tokill, &tk);
+ collect_procs_anon(page, tokill, &tk, force_early);
else
- collect_procs_file(page, tokill, &tk);
+ collect_procs_file(page, tokill, &tk, force_early);
kfree(tk);
}
@@ -967,7 +1000,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* there's nothing that can be done.
*/
if (kill)
- collect_procs(ppage, &tokill);
+ collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED);
ret = try_to_unmap(ppage, ttu);
if (ret != SWAP_SUCCESS)
@@ -1085,15 +1118,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
return 0;
} else if (PageHuge(hpage)) {
/*
- * Check "just unpoisoned", "filter hit", and
- * "race with other subpage."
+ * Check "filter hit" and "race with other subpage."
*/
lock_page(hpage);
- if (!PageHWPoison(hpage)
- || (hwpoison_filter(p) && TestClearPageHWPoison(p))
- || (p != hpage && TestSetPageHWPoison(hpage))) {
- atomic_long_sub(nr_pages, &num_poisoned_pages);
- return 0;
+ if (PageHWPoison(hpage)) {
+ if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
+ || (p != hpage && TestSetPageHWPoison(hpage))) {
+ atomic_long_sub(nr_pages, &num_poisoned_pages);
+ unlock_page(hpage);
+ return 0;
+ }
}
set_page_hwpoison_huge_page(hpage);
res = dequeue_hwpoisoned_huge_page(hpage);
@@ -1156,6 +1190,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
*/
if (!PageHWPoison(p)) {
printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
+ atomic_long_sub(nr_pages, &num_poisoned_pages);
+ put_page(hpage);
res = 0;
goto out;
}
diff --git a/mm/memory.c b/mm/memory.c
index eef1e5cfe8de..8c595234528a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1929,12 +1929,17 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
unsigned long address, unsigned int fault_flags)
{
struct vm_area_struct *vma;
+ vm_flags_t vm_flags;
int ret;
vma = find_extend_vma(mm, address);
if (!vma || address < vma->vm_start)
return -EFAULT;
+ vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ;
+ if (!(vm_flags & vma->vm_flags))
+ return -EFAULT;
+
ret = handle_mm_fault(mm, vma, address, fault_flags);
if (ret & VM_FAULT_ERROR) {
if (ret & VM_FAULT_OOM)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index ae3c8f3595d4..56224d998c39 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -526,9 +526,13 @@ static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
int nid;
struct page *page;
spinlock_t *ptl;
+ pte_t entry;
ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
- page = pte_page(huge_ptep_get((pte_t *)pmd));
+ entry = huge_ptep_get((pte_t *)pmd);
+ if (!pte_present(entry))
+ goto unlock;
+ page = pte_page(entry);
nid = page_to_nid(page);
if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
goto unlock;
diff --git a/mm/mremap.c b/mm/mremap.c
index 0843feb66f3d..05f1180e9f21 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -194,10 +194,17 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
break;
if (pmd_trans_huge(*old_pmd)) {
int err = 0;
- if (extent == HPAGE_PMD_SIZE)
+ if (extent == HPAGE_PMD_SIZE) {
+ VM_BUG_ON(vma->vm_file || !vma->anon_vma);
+ /* See comment in move_ptes() */
+ if (need_rmap_locks)
+ anon_vma_lock_write(vma->anon_vma);
err = move_huge_pmd(vma, new_vma, old_addr,
new_addr, old_end,
old_pmd, new_pmd);
+ if (need_rmap_locks)
+ anon_vma_unlock_write(vma->anon_vma);
+ }
if (err > 0) {
need_flush = true;
continue;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 7106cb1aca8e..d013dba21429 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -593,14 +593,14 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
* (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
* => fast response on large errors; small oscillation near setpoint
*/
-static inline long long pos_ratio_polynom(unsigned long setpoint,
+static long long pos_ratio_polynom(unsigned long setpoint,
unsigned long dirty,
unsigned long limit)
{
long long pos_ratio;
long x;
- x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
+ x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
limit - setpoint + 1);
pos_ratio = x;
pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
@@ -842,7 +842,7 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
x_intercept = bdi_setpoint + span;
if (bdi_dirty < x_intercept - span / 4) {
- pos_ratio = div_u64(pos_ratio * (x_intercept - bdi_dirty),
+ pos_ratio = div64_u64(pos_ratio * (x_intercept - bdi_dirty),
x_intercept - bdi_setpoint + 1);
} else
pos_ratio /= 4;
@@ -2398,7 +2398,7 @@ int test_clear_page_writeback(struct page *page)
return ret;
}
-int test_set_page_writeback(struct page *page)
+int __test_set_page_writeback(struct page *page, bool keep_write)
{
struct address_space *mapping = page_mapping(page);
int ret;
@@ -2423,9 +2423,10 @@ int test_set_page_writeback(struct page *page)
radix_tree_tag_clear(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_DIRTY);
- radix_tree_tag_clear(&mapping->page_tree,
- page_index(page),
- PAGECACHE_TAG_TOWRITE);
+ if (!keep_write)
+ radix_tree_tag_clear(&mapping->page_tree,
+ page_index(page),
+ PAGECACHE_TAG_TOWRITE);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
} else {
ret = TestSetPageWriteback(page);
@@ -2436,7 +2437,7 @@ int test_set_page_writeback(struct page *page)
return ret;
}
-EXPORT_SYMBOL(test_set_page_writeback);
+EXPORT_SYMBOL(__test_set_page_writeback);
/*
* Return true if any of the pages in the mapping are marked with the
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 563d41c4935b..672afc4c4157 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6064,53 +6064,65 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
* @end_bitidx: The last bit of interest
* returns pageblock_bits flags
*/
-unsigned long get_pageblock_flags_group(struct page *page,
- int start_bitidx, int end_bitidx)
+unsigned long get_pageblock_flags_mask(struct page *page,
+ unsigned long end_bitidx,
+ unsigned long mask)
{
struct zone *zone;
unsigned long *bitmap;
- unsigned long pfn, bitidx;
- unsigned long flags = 0;
- unsigned long value = 1;
+ unsigned long pfn, bitidx, word_bitidx;
+ unsigned long word;
zone = page_zone(page);
pfn = page_to_pfn(page);
bitmap = get_pageblock_bitmap(zone, pfn);
bitidx = pfn_to_bitidx(zone, pfn);
+ word_bitidx = bitidx / BITS_PER_LONG;
+ bitidx &= (BITS_PER_LONG-1);
- for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
- if (test_bit(bitidx + start_bitidx, bitmap))
- flags |= value;
-
- return flags;
+ word = bitmap[word_bitidx];
+ bitidx += end_bitidx;
+ return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
}
/**
- * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages
+ * set_pageblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
* @page: The page within the block of interest
* @start_bitidx: The first bit of interest
* @end_bitidx: The last bit of interest
* @flags: The flags to set
*/
-void set_pageblock_flags_group(struct page *page, unsigned long flags,
- int start_bitidx, int end_bitidx)
+void set_pageblock_flags_mask(struct page *page, unsigned long flags,
+ unsigned long end_bitidx,
+ unsigned long mask)
{
struct zone *zone;
unsigned long *bitmap;
- unsigned long pfn, bitidx;
- unsigned long value = 1;
+ unsigned long pfn, bitidx, word_bitidx;
+ unsigned long old_word, word;
+
+ BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
zone = page_zone(page);
pfn = page_to_pfn(page);
bitmap = get_pageblock_bitmap(zone, pfn);
bitidx = pfn_to_bitidx(zone, pfn);
+ word_bitidx = bitidx / BITS_PER_LONG;
+ bitidx &= (BITS_PER_LONG-1);
+
VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page);
- for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
- if (flags & value)
- __set_bit(bitidx + start_bitidx, bitmap);
- else
- __clear_bit(bitidx + start_bitidx, bitmap);
+ bitidx += end_bitidx;
+ mask <<= (BITS_PER_LONG - bitidx - 1);
+ flags <<= (BITS_PER_LONG - bitidx - 1);
+
+ word = ACCESS_ONCE(bitmap[word_bitidx]);
+ for (;;) {
+ old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
+ if (word == old_word)
+ break;
+ word = old_word;
+ }
}
/*
diff --git a/mm/percpu.c b/mm/percpu.c
index 036cfe07050f..a2a54a85f691 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -612,7 +612,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *
sizeof(chunk->map[0]));
if (!chunk->map) {
- kfree(chunk);
+ pcpu_mem_free(chunk, pcpu_chunk_struct_size);
return NULL;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index d3cbac508c2f..cdbd31285cf6 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -103,6 +103,7 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
* LOCK should suffice since the actual taking of the lock must
* happen _before_ what follows.
*/
+ might_sleep();
if (rwsem_is_locked(&anon_vma->root->rwsem)) {
anon_vma_lock_write(anon_vma);
anon_vma_unlock_write(anon_vma);
@@ -426,8 +427,9 @@ struct anon_vma *page_get_anon_vma(struct page *page)
* above cannot corrupt).
*/
if (!page_mapped(page)) {
+ rcu_read_unlock();
put_anon_vma(anon_vma);
- anon_vma = NULL;
+ return NULL;
}
out:
rcu_read_unlock();
@@ -477,9 +479,9 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page)
}
if (!page_mapped(page)) {
+ rcu_read_unlock();
put_anon_vma(anon_vma);
- anon_vma = NULL;
- goto out;
+ return NULL;
}
/* we pinned the anon_vma, its safe to sleep */
@@ -1554,10 +1556,9 @@ void __put_anon_vma(struct anon_vma *anon_vma)
{
struct anon_vma *root = anon_vma->root;
+ anon_vma_free(anon_vma);
if (root != anon_vma && atomic_dec_and_test(&root->refcount))
anon_vma_free(root);
-
- anon_vma_free(anon_vma);
}
static struct anon_vma *rmap_walk_anon_lock(struct page *page,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a9c74b409681..6ef876cae8f1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2502,10 +2502,17 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
for (i = 0; i <= ZONE_NORMAL; i++) {
zone = &pgdat->node_zones[i];
+ if (!populated_zone(zone))
+ continue;
+
pfmemalloc_reserve += min_wmark_pages(zone);
free_pages += zone_page_state(zone, NR_FREE_PAGES);
}
+ /* If there are no reserves (unexpected config) then do not throttle */
+ if (!pfmemalloc_reserve)
+ return true;
+
wmark_ok = free_pages > pfmemalloc_reserve / 2;
/* kswapd must be awake if processes are being throttled */
@@ -2530,9 +2537,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
nodemask_t *nodemask)
{
+ struct zoneref *z;
struct zone *zone;
- int high_zoneidx = gfp_zone(gfp_mask);
- pg_data_t *pgdat;
+ pg_data_t *pgdat = NULL;
/*
* Kernel threads should not be throttled as they may be indirectly
@@ -2551,10 +2558,34 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
if (fatal_signal_pending(current))
goto out;
- /* Check if the pfmemalloc reserves are ok */
- first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
- pgdat = zone->zone_pgdat;
- if (pfmemalloc_watermark_ok(pgdat))
+ /*
+ * Check if the pfmemalloc reserves are ok by finding the first node
+ * with a usable ZONE_NORMAL or lower zone. The expectation is that
+ * GFP_KERNEL will be required for allocating network buffers when
+ * swapping over the network so ZONE_HIGHMEM is unusable.
+ *
+ * Throttling is based on the first usable node and throttled processes
+ * wait on a queue until kswapd makes progress and wakes them. There
+ * is an affinity then between processes waking up and where reclaim
+ * progress has been made assuming the process wakes on the same node.
+ * More importantly, processes running on remote nodes will not compete
+ * for remote pfmemalloc reserves and processes on different nodes
+ * should make reasonable progress.
+ */
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ gfp_mask, nodemask) {
+ if (zone_idx(zone) > ZONE_NORMAL)
+ continue;
+
+ /* Throttle based on the first usable node */
+ pgdat = zone->zone_pgdat;
+ if (pfmemalloc_watermark_ok(pgdat))
+ goto out;
+ break;
+ }
+
+ /* If no zone was usable by the allocation flags then do not throttle */
+ if (!pgdat)
goto out;
/* Account for the throttling */
@@ -3285,7 +3316,10 @@ static int kswapd(void *p)
}
}
+ tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
current->reclaim_state = NULL;
+ lockdep_clear_current_reclaim_state();
+
return 0;
}