aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSteven Rostedt (Red Hat) <rostedt@goodmis.org>2014-10-28 16:38:11 -0400
committerSteven Rostedt <rostedt@goodmis.org>2014-10-28 16:38:11 -0400
commitc1a3bd301a943469f9b575edb26e195a9c987390 (patch)
tree83ce790eb3ffc8e73a45c9966317fbe09cfa7947
parent77014e82b50da13ab801c9cfeb5cb8c29881cc9c (diff)
parentb0807bc10a6ac95ab8bf3bbf57703a0f2edd9aa9 (diff)
Merge tag 'v3.12.30' into v3.12-rt
This is the 3.12.30 stable release Conflicts: fs/exec.c include/linux/radix-tree.h mm/filemap.c mm/page_alloc.c
-rw-r--r--Makefile2
-rw-r--r--arch/tile/mm/homecache.c2
-rw-r--r--arch/unicore32/include/asm/mmu_context.h4
-rw-r--r--arch/x86/include/asm/tlbflush.h6
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c4
-rw-r--r--arch/x86/mm/pgtable.c21
-rw-r--r--arch/x86/mm/tlb.c52
-rw-r--r--fs/btrfs/compression.c2
-rw-r--r--fs/btrfs/extent_io.c15
-rw-r--r--fs/btrfs/file.c10
-rw-r--r--fs/buffer.c28
-rw-r--r--fs/cramfs/inode.c3
-rw-r--r--fs/exec.c5
-rw-r--r--fs/ext4/mballoc.c14
-rw-r--r--fs/f2fs/checkpoint.c1
-rw-r--r--fs/f2fs/node.c2
-rw-r--r--fs/fuse/dev.c2
-rw-r--r--fs/fuse/file.c4
-rw-r--r--fs/gfs2/aops.c1
-rw-r--r--fs/gfs2/meta_io.c4
-rw-r--r--fs/hugetlbfs/inode.c5
-rw-r--r--fs/jffs2/fs.c2
-rw-r--r--fs/nfs/blocklayout/blocklayout.c2
-rw-r--r--fs/ntfs/attrib.c1
-rw-r--r--fs/ntfs/file.c1
-rw-r--r--fs/proc/task_mmu.c3
-rw-r--r--fs/super.c16
-rw-r--r--include/linux/compaction.h20
-rw-r--r--include/linux/cpuset.h56
-rw-r--r--include/linux/gfp.h4
-rw-r--r--include/linux/huge_mm.h4
-rw-r--r--include/linux/hugetlb.h10
-rw-r--r--include/linux/jump_label.h20
-rw-r--r--include/linux/migrate.h11
-rw-r--r--include/linux/mm.h11
-rw-r--r--include/linux/mm_types.h4
-rw-r--r--include/linux/mmzone.h233
-rw-r--r--include/linux/page-flags.h6
-rw-r--r--include/linux/pageblock-flags.h33
-rw-r--r--include/linux/pagemap.h131
-rw-r--r--include/linux/pagevec.h5
-rw-r--r--include/linux/plist.h45
-rw-r--r--include/linux/radix-tree.h5
-rw-r--r--include/linux/sched.h7
-rw-r--r--include/linux/shmem_fs.h1
-rw-r--r--include/linux/swap.h30
-rw-r--r--include/linux/swapfile.h2
-rw-r--r--include/linux/vm_event_item.h4
-rw-r--r--include/linux/vmacache.h38
-rw-r--r--include/linux/vmstat.h8
-rw-r--r--include/trace/events/compaction.h67
-rw-r--r--include/trace/events/kmem.h10
-rw-r--r--include/trace/events/pagemap.h16
-rw-r--r--kernel/cpuset.c16
-rw-r--r--kernel/debug/debug_core.c14
-rw-r--r--kernel/fork.c7
-rw-r--r--lib/plist.c52
-rw-r--r--lib/radix-tree.c106
-rw-r--r--mm/Makefile2
-rw-r--r--mm/compaction.c347
-rw-r--r--mm/filemap.c470
-rw-r--r--mm/fremap.c28
-rw-r--r--mm/frontswap.c13
-rw-r--r--mm/huge_memory.c93
-rw-r--r--mm/hugetlb.c17
-rw-r--r--mm/internal.h22
-rw-r--r--mm/madvise.c2
-rw-r--r--mm/memory-failure.c4
-rw-r--r--mm/memory.c4
-rw-r--r--mm/memory_hotplug.c2
-rw-r--r--mm/mempolicy.c16
-rw-r--r--mm/migrate.c56
-rw-r--r--mm/mincore.c20
-rw-r--r--mm/mmap.c55
-rw-r--r--mm/nommu.c24
-rw-r--r--mm/page_alloc.c423
-rw-r--r--mm/readahead.c37
-rw-r--r--mm/shmem.c133
-rw-r--r--mm/slab.c12
-rw-r--r--mm/slub.c16
-rw-r--r--mm/swap.c101
-rw-r--r--mm/swap_state.c65
-rw-r--r--mm/swapfile.c224
-rw-r--r--mm/truncate.c74
-rw-r--r--mm/vmacache.c114
-rw-r--r--mm/vmalloc.c6
-rw-r--r--mm/vmscan.c144
-rw-r--r--mm/vmstat.c13
88 files changed, 2367 insertions, 1358 deletions
diff --git a/Makefile b/Makefile
index 67cec33d00c7..1ad1566225ca 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
VERSION = 3
PATCHLEVEL = 12
-SUBLEVEL = 29
+SUBLEVEL = 30
EXTRAVERSION =
NAME = One Giant Leap for Frogkind
diff --git a/arch/tile/mm/homecache.c b/arch/tile/mm/homecache.c
index 004ba568d93f..33294fdc402e 100644
--- a/arch/tile/mm/homecache.c
+++ b/arch/tile/mm/homecache.c
@@ -417,7 +417,7 @@ void __homecache_free_pages(struct page *page, unsigned int order)
if (put_page_testzero(page)) {
homecache_change_page_home(page, order, PAGE_HOME_HASH);
if (order == 0) {
- free_hot_cold_page(page, 0);
+ free_hot_cold_page(page, false);
} else {
init_page_count(page);
__free_pages(page, order);
diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
index fb5e4c658f7a..ef470a7a3d0f 100644
--- a/arch/unicore32/include/asm/mmu_context.h
+++ b/arch/unicore32/include/asm/mmu_context.h
@@ -14,6 +14,8 @@
#include <linux/compiler.h>
#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/io.h>
#include <asm/cacheflush.h>
@@ -73,7 +75,7 @@ do { \
else \
mm->mmap = NULL; \
rb_erase(&high_vma->vm_rb, &mm->mm_rb); \
- mm->mmap_cache = NULL; \
+ vmacache_invalidate(mm); \
mm->map_count--; \
remove_vma(high_vma); \
} \
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index e6d90babc245..04905bfc508b 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -62,7 +62,7 @@ static inline void __flush_tlb_all(void)
static inline void __flush_tlb_one(unsigned long addr)
{
- count_vm_event(NR_TLB_LOCAL_FLUSH_ONE);
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
__flush_tlb_single(addr);
}
@@ -93,13 +93,13 @@ static inline void __flush_tlb_one(unsigned long addr)
*/
static inline void __flush_tlb_up(void)
{
- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
__flush_tlb();
}
static inline void flush_tlb_all(void)
{
- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
__flush_tlb_all();
}
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index ce2d0a2c3e4f..0e25a1bc5ab5 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -683,7 +683,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
}
/* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
__flush_tlb();
/* Save MTRR state */
@@ -697,7 +697,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
static void post_set(void) __releases(set_atomicity_lock)
{
/* Flush TLBs (no need to flush caches - they are disabled) */
- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
__flush_tlb();
/* Intel (P6) standard MTRRs */
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index dfa537a03be1..5da29d04de2f 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -386,13 +386,20 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma,
int ptep_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep)
{
- int young;
-
- young = ptep_test_and_clear_young(vma, address, ptep);
- if (young)
- flush_tlb_page(vma, address);
-
- return young;
+ /*
+ * On x86 CPUs, clearing the accessed bit without a TLB flush
+ * doesn't cause data corruption. [ It could cause incorrect
+ * page aging and the (mistaken) reclaim of hot pages, but the
+ * chance of that should be relatively low. ]
+ *
+ * So as a performance optimization don't flush the TLB when
+ * clearing the accessed bit, it will eventually be flushed by
+ * a context switch or a VM operation anyway. [ In the rare
+ * event of it not getting flushed for a long time the delay
+ * shouldn't really matter because there's no real memory
+ * pressure for swapout to react to. ]
+ */
+ return ptep_test_and_clear_young(vma, address, ptep);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index ae699b3bbac8..dd8dda167a24 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -103,7 +103,7 @@ static void flush_tlb_func(void *info)
if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
return;
- count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
if (f->flush_end == TLB_FLUSH_ALL)
local_flush_tlb();
@@ -131,7 +131,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
info.flush_start = start;
info.flush_end = end;
- count_vm_event(NR_TLB_REMOTE_FLUSH);
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
if (is_uv_system()) {
unsigned int cpu;
@@ -151,44 +151,19 @@ void flush_tlb_current_task(void)
preempt_disable();
- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
local_flush_tlb();
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
preempt_enable();
}
-/*
- * It can find out the THP large page, or
- * HUGETLB page in tlb_flush when THP disabled
- */
-static inline unsigned long has_large_page(struct mm_struct *mm,
- unsigned long start, unsigned long end)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- unsigned long addr = ALIGN(start, HPAGE_SIZE);
- for (; addr < end; addr += HPAGE_SIZE) {
- pgd = pgd_offset(mm, addr);
- if (likely(!pgd_none(*pgd))) {
- pud = pud_offset(pgd, addr);
- if (likely(!pud_none(*pud))) {
- pmd = pmd_offset(pud, addr);
- if (likely(!pmd_none(*pmd)))
- if (pmd_large(*pmd))
- return addr;
- }
- }
- }
- return 0;
-}
-
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag)
{
unsigned long addr;
unsigned act_entries, tlb_entries = 0;
+ unsigned long nr_base_pages;
preempt_disable();
if (current->active_mm != mm)
@@ -210,21 +185,20 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
tlb_entries = tlb_lli_4k[ENTRIES];
else
tlb_entries = tlb_lld_4k[ENTRIES];
+
/* Assume all of TLB entries was occupied by this task */
- act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm;
+ act_entries = tlb_entries >> tlb_flushall_shift;
+ act_entries = mm->total_vm > act_entries ? act_entries : mm->total_vm;
+ nr_base_pages = (end - start) >> PAGE_SHIFT;
/* tlb_flushall_shift is on balance point, details in commit log */
- if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) {
- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+ if (nr_base_pages > act_entries) {
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
local_flush_tlb();
} else {
- if (has_large_page(mm, start, end)) {
- local_flush_tlb();
- goto flush_all;
- }
/* flush range by one by one 'invlpg' */
for (addr = start; addr < end; addr += PAGE_SIZE) {
- count_vm_event(NR_TLB_LOCAL_FLUSH_ONE);
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
__flush_tlb_single(addr);
}
@@ -262,7 +236,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
static void do_flush_tlb_all(void *info)
{
- count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
__flush_tlb_all();
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
leave_mm(smp_processor_id());
@@ -270,7 +244,7 @@ static void do_flush_tlb_all(void *info)
void flush_tlb_all(void)
{
- count_vm_event(NR_TLB_REMOTE_FLUSH);
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
on_each_cpu(do_flush_tlb_all, NULL, 1);
}
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 6e9ff8fac75a..6357298932bf 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -474,7 +474,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
rcu_read_lock();
page = radix_tree_lookup(&mapping->page_tree, pg_index);
rcu_read_unlock();
- if (page) {
+ if (page && !radix_tree_exceptional_entry(page)) {
misses++;
if (misses > 4)
break;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 594bbfd4996e..7015d9079bd1 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4446,7 +4446,8 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
spin_unlock(&eb->refs_lock);
}
-static void mark_extent_buffer_accessed(struct extent_buffer *eb)
+static void mark_extent_buffer_accessed(struct extent_buffer *eb,
+ struct page *accessed)
{
unsigned long num_pages, i;
@@ -4455,7 +4456,8 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb)
num_pages = num_extent_pages(eb->start, eb->len);
for (i = 0; i < num_pages; i++) {
struct page *p = extent_buffer_page(eb, i);
- mark_page_accessed(p);
+ if (p != accessed)
+ mark_page_accessed(p);
}
}
@@ -4476,7 +4478,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
if (eb && atomic_inc_not_zero(&eb->refs)) {
rcu_read_unlock();
- mark_extent_buffer_accessed(eb);
+ mark_extent_buffer_accessed(eb, NULL);
return eb;
}
rcu_read_unlock();
@@ -4504,7 +4506,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
spin_unlock(&mapping->private_lock);
unlock_page(p);
page_cache_release(p);
- mark_extent_buffer_accessed(exists);
+ mark_extent_buffer_accessed(exists, p);
goto free_eb;
}
@@ -4519,7 +4521,6 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
attach_extent_buffer_page(eb, p);
spin_unlock(&mapping->private_lock);
WARN_ON(PageDirty(p));
- mark_page_accessed(p);
eb->pages[i] = p;
if (!PageUptodate(p))
uptodate = 0;
@@ -4549,7 +4550,7 @@ again:
}
spin_unlock(&tree->buffer_lock);
radix_tree_preload_end();
- mark_extent_buffer_accessed(exists);
+ mark_extent_buffer_accessed(exists, NULL);
goto free_eb;
}
/* add one reference for the tree */
@@ -4595,7 +4596,7 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
if (eb && atomic_inc_not_zero(&eb->refs)) {
rcu_read_unlock();
- mark_extent_buffer_accessed(eb);
+ mark_extent_buffer_accessed(eb, NULL);
return eb;
}
rcu_read_unlock();
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 72da4df53c9a..ad80dfa6cf91 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -426,13 +426,8 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
struct page *page = prepared_pages[pg];
/*
* Copy data from userspace to the current page
- *
- * Disable pagefault to avoid recursive lock since
- * the pages are already locked
*/
- pagefault_disable();
copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
- pagefault_enable();
/* Flush processor's dcache for this page */
flush_dcache_page(page);
@@ -476,11 +471,12 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
for (i = 0; i < num_pages; i++) {
/* page checked is some magic around finding pages that
* have been modified without going through btrfs_set_page_dirty
- * clear it here
+ * clear it here. There should be no need to mark the pages
+ * accessed as prepare_pages should have marked them accessed
+ * in prepare_pages via find_or_create_page()
*/
ClearPageChecked(pages[i]);
unlock_page(pages[i]);
- mark_page_accessed(pages[i]);
page_cache_release(pages[i]);
}
}
diff --git a/fs/buffer.c b/fs/buffer.c
index e0fa2a454986..4fc570d40425 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -227,7 +227,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
int all_mapped = 1;
index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
- page = find_get_page(bd_mapping, index);
+ page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
if (!page)
goto out;
@@ -1358,12 +1358,13 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
if (bh == NULL) {
+ /* __find_get_block_slow will mark the page accessed */
bh = __find_get_block_slow(bdev, block);
if (bh)
bh_lru_install(bh);
- }
- if (bh)
+ } else
touch_buffer(bh);
+
return bh;
}
EXPORT_SYMBOL(__find_get_block);
@@ -1475,16 +1476,27 @@ EXPORT_SYMBOL(set_bh_page);
/*
* Called when truncating a buffer on a page completely.
*/
+
+/* Bits that are cleared during an invalidate */
+#define BUFFER_FLAGS_DISCARD \
+ (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
+ 1 << BH_Delay | 1 << BH_Unwritten)
+
static void discard_buffer(struct buffer_head * bh)
{
+ unsigned long b_state, b_state_old;
+
lock_buffer(bh);
clear_buffer_dirty(bh);
bh->b_bdev = NULL;
- clear_buffer_mapped(bh);
- clear_buffer_req(bh);
- clear_buffer_new(bh);
- clear_buffer_delay(bh);
- clear_buffer_unwritten(bh);
+ b_state = bh->b_state;
+ for (;;) {
+ b_state_old = cmpxchg(&bh->b_state, b_state,
+ (b_state & ~BUFFER_FLAGS_DISCARD));
+ if (b_state_old == b_state)
+ break;
+ b_state = b_state_old;
+ }
unlock_buffer(bh);
}
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index e501ac3a49ff..2f6cfcaa55fd 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -179,8 +179,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
struct page *page = NULL;
if (blocknr + i < devsize) {
- page = read_mapping_page_async(mapping, blocknr + i,
- NULL);
+ page = read_mapping_page(mapping, blocknr + i, NULL);
/* synchronous error? */
if (IS_ERR(page))
page = NULL;
diff --git a/fs/exec.c b/fs/exec.c
index ff5fa50de3ca..1f526a2fc688 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -26,6 +26,7 @@
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
#include <linux/swap.h>
@@ -818,7 +819,7 @@ EXPORT_SYMBOL(read_code);
static int exec_mmap(struct mm_struct *mm)
{
struct task_struct *tsk;
- struct mm_struct * old_mm, *active_mm;
+ struct mm_struct *old_mm, *active_mm;
/* Notify parent that we're no longer interested in the old VM */
tsk = current;
@@ -845,6 +846,8 @@ static int exec_mmap(struct mm_struct *mm)
tsk->mm = mm;
tsk->active_mm = mm;
activate_mm(active_mm, mm);
+ tsk->mm->vmacache_seqnum = 0;
+ vmacache_flush(tsk);
preempt_enable_rt();
task_unlock(tsk);
arch_pick_mmap_layout(mm);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 242226a87be7..7620133f78bf 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1044,6 +1044,8 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
* allocating. If we are looking at the buddy cache we would
* have taken a reference using ext4_mb_load_buddy and that
* would have pinned buddy page to page cache.
+ * The call to ext4_mb_get_buddy_page_lock will mark the
+ * page accessed.
*/
ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
@@ -1062,7 +1064,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
ret = -EIO;
goto err;
}
- mark_page_accessed(page);
if (e4b.bd_buddy_page == NULL) {
/*
@@ -1082,7 +1083,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
ret = -EIO;
goto err;
}
- mark_page_accessed(page);
err:
ext4_mb_put_buddy_page_lock(&e4b);
return ret;
@@ -1141,7 +1141,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
/* we could use find_or_create_page(), but it locks page
* what we'd like to avoid in fast path ... */
- page = find_get_page(inode->i_mapping, pnum);
+ page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
if (page == NULL || !PageUptodate(page)) {
if (page)
/*
@@ -1172,15 +1172,16 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
ret = -EIO;
goto err;
}
+
+ /* Pages marked accessed already */
e4b->bd_bitmap_page = page;
e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
- mark_page_accessed(page);
block++;
pnum = block / blocks_per_page;
poff = block % blocks_per_page;
- page = find_get_page(inode->i_mapping, pnum);
+ page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
if (page == NULL || !PageUptodate(page)) {
if (page)
page_cache_release(page);
@@ -1201,9 +1202,10 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
ret = -EIO;
goto err;
}
+
+ /* Pages marked accessed already */
e4b->bd_buddy_page = page;
e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
- mark_page_accessed(page);
BUG_ON(e4b->bd_bitmap_page == NULL);
BUG_ON(e4b->bd_buddy_page == NULL);
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index bb312201ca95..15a29af63e20 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -70,7 +70,6 @@ repeat:
goto repeat;
}
out:
- mark_page_accessed(page);
return page;
}
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 51ef27894433..d0335bdb65b4 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -970,7 +970,6 @@ repeat:
}
got_it:
BUG_ON(nid != nid_of_node(page));
- mark_page_accessed(page);
return page;
}
@@ -1026,7 +1025,6 @@ page_hit:
f2fs_put_page(page, 1);
return ERR_PTR(-EIO);
}
- mark_page_accessed(page);
return page;
}
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index fa8cb4b7b8fe..fc8e4991736a 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1613,7 +1613,7 @@ out_finish:
static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req)
{
- release_pages(req->pages, req->num_pages, 0);
+ release_pages(req->pages, req->num_pages, false);
}
static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 4598345ab87d..d08c108065e1 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -985,13 +985,9 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
- pagefault_disable();
tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
- pagefault_enable();
flush_dcache_page(page);
- mark_page_accessed(page);
-
if (!tmp) {
unlock_page(page);
page_cache_release(page);
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 1253c2006029..f3aee0bbe886 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -517,7 +517,6 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos,
p = kmap_atomic(page);
memcpy(buf + copied, p + offset, amt);
kunmap_atomic(p);
- mark_page_accessed(page);
page_cache_release(page);
copied += amt;
index++;
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 52f177be3bf8..89afe3a8f626 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -128,7 +128,8 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
yield();
}
} else {
- page = find_lock_page(mapping, index);
+ page = find_get_page_flags(mapping, index,
+ FGP_LOCK|FGP_ACCESSED);
if (!page)
return NULL;
}
@@ -145,7 +146,6 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
map_bh(bh, sdp->sd_vfs, blkno);
unlock_page(page);
- mark_page_accessed(page);
page_cache_release(page);
return bh;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index d19b30ababf1..a4a8ed56e438 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1017,6 +1017,11 @@ static int __init init_hugetlbfs_fs(void)
int error;
int i;
+ if (!hugepages_supported()) {
+ pr_info("hugetlbfs: disabling because there are no supported hugepage sizes\n");
+ return -ENOTSUPP;
+ }
+
error = bdi_init(&hugetlbfs_backing_dev_info);
if (error)
return error;
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index fe3c0527545f..91bf52d1a88c 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -682,7 +682,7 @@ unsigned char *jffs2_gc_fetch_page(struct jffs2_sb_info *c,
struct inode *inode = OFNI_EDONI_2SFFJ(f);
struct page *pg;
- pg = read_cache_page_async(inode->i_mapping, offset >> PAGE_CACHE_SHIFT,
+ pg = read_cache_page(inode->i_mapping, offset >> PAGE_CACHE_SHIFT,
(void *)jffs2_do_readpage_unlock, inode);
if (IS_ERR(pg))
return (void *)pg;
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index e242bbf72972..fdb74cbb9e0c 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -1220,7 +1220,7 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
if (end != NFS_I(inode)->npages) {
rcu_read_lock();
- end = radix_tree_next_hole(&mapping->page_tree, idx + 1, ULONG_MAX);
+ end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX);
rcu_read_unlock();
}
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index a27e3fecefaf..250ed5b20c8f 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -1748,7 +1748,6 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size)
if (page) {
set_page_dirty(page);
unlock_page(page);
- mark_page_accessed(page);
page_cache_release(page);
}
ntfs_debug("Done.");
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index ea4ba9daeb47..a0b2f345da2b 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2060,7 +2060,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
}
do {
unlock_page(pages[--do_pages]);
- mark_page_accessed(pages[do_pages]);
page_cache_release(pages[do_pages]);
} while (do_pages);
if (unlikely(status))
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ad4df869c907..7724fbdf443f 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1,4 +1,5 @@
#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/hugetlb.h>
#include <linux/huge_mm.h>
#include <linux/mount.h>
@@ -159,7 +160,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
/*
* We remember last_addr rather than next_addr to hit with
- * mmap_cache most of the time. We have zero last_addr at
+ * vmacache most of the time. We have zero last_addr at
* the beginning and also after lseek. We will have -1 last_addr
* after the end of the vmas.
*/
diff --git a/fs/super.c b/fs/super.c
index d127de207376..fb68a4c90c98 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -112,9 +112,14 @@ static unsigned long super_cache_count(struct shrinker *shrink,
sb = container_of(shrink, struct super_block, s_shrink);
- if (!grab_super_passive(sb))
- return 0;
-
+ /*
+ * Don't call grab_super_passive as it is a potential
+ * scalability bottleneck. The counts could get updated
+ * between super_cache_count and super_cache_scan anyway.
+ * Call to super_cache_count with shrinker_rwsem held
+ * ensures the safety of call to list_lru_count_node() and
+ * s_op->nr_cached_objects().
+ */
if (sb->s_op && sb->s_op->nr_cached_objects)
total_objects = sb->s_op->nr_cached_objects(sb,
sc->nid);
@@ -125,7 +130,6 @@ static unsigned long super_cache_count(struct shrinker *shrink,
sc->nid);
total_objects = vfs_pressure_ratio(total_objects);
- drop_super(sb);
return total_objects;
}
@@ -321,10 +325,8 @@ void deactivate_locked_super(struct super_block *s)
struct file_system_type *fs = s->s_type;
if (atomic_dec_and_test(&s->s_active)) {
cleancache_invalidate_fs(s);
- fs->kill_sb(s);
-
- /* caches are now gone, we can safely kill the shrinker now */
unregister_shrinker(&s->s_shrink);
+ fs->kill_sb(s);
put_filesystem(fs);
put_super(s);
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 091d72e70d8a..01e3132820da 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
extern int fragmentation_index(struct zone *zone, unsigned int order);
extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
int order, gfp_t gfp_mask, nodemask_t *mask,
- bool sync, bool *contended);
+ enum migrate_mode mode, bool *contended);
extern void compact_pgdat(pg_data_t *pgdat, int order);
extern void reset_isolation_suitable(pg_data_t *pgdat);
extern unsigned long compaction_suitable(struct zone *zone, int order);
@@ -62,6 +62,22 @@ static inline bool compaction_deferred(struct zone *zone, int order)
return zone->compact_considered < defer_limit;
}
+/*
+ * Update defer tracking counters after successful compaction of given order,
+ * which means an allocation either succeeded (alloc_success == true) or is
+ * expected to succeed.
+ */
+static inline void compaction_defer_reset(struct zone *zone, int order,
+ bool alloc_success)
+{
+ if (alloc_success) {
+ zone->compact_considered = 0;
+ zone->compact_defer_shift = 0;
+ }
+ if (order >= zone->compact_order_failed)
+ zone->compact_order_failed = order + 1;
+}
+
/* Returns true if restarting compaction after many failures */
static inline bool compaction_restarting(struct zone *zone, int order)
{
@@ -75,7 +91,7 @@ static inline bool compaction_restarting(struct zone *zone, int order)
#else
static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
int order, gfp_t gfp_mask, nodemask_t *nodemask,
- bool sync, bool *contended)
+ enum migrate_mode mode, bool *contended)
{
return COMPACT_CONTINUE;
}
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index cc1b01cf2035..a7ebb89ae9fb 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -12,10 +12,31 @@
#include <linux/cpumask.h>
#include <linux/nodemask.h>
#include <linux/mm.h>
+#include <linux/jump_label.h>
#ifdef CONFIG_CPUSETS
-extern int number_of_cpusets; /* How many cpusets are defined in system? */
+extern struct static_key cpusets_enabled_key;
+static inline bool cpusets_enabled(void)
+{
+ return static_key_false(&cpusets_enabled_key);
+}
+
+static inline int nr_cpusets(void)
+{
+ /* jump label reference count + the top-level cpuset */
+ return static_key_count(&cpusets_enabled_key) + 1;
+}
+
+static inline void cpuset_inc(void)
+{
+ static_key_slow_inc(&cpusets_enabled_key);
+}
+
+static inline void cpuset_dec(void)
+{
+ static_key_slow_dec(&cpusets_enabled_key);
+}
extern int cpuset_init(void);
extern void cpuset_init_smp(void);
@@ -32,13 +53,13 @@ extern int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask);
static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
{
- return number_of_cpusets <= 1 ||
+ return nr_cpusets() <= 1 ||
__cpuset_node_allowed_softwall(node, gfp_mask);
}
static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
{
- return number_of_cpusets <= 1 ||
+ return nr_cpusets() <= 1 ||
__cpuset_node_allowed_hardwall(node, gfp_mask);
}
@@ -87,25 +108,26 @@ extern void rebuild_sched_domains(void);
extern void cpuset_print_task_mems_allowed(struct task_struct *p);
/*
- * get_mems_allowed is required when making decisions involving mems_allowed
- * such as during page allocation. mems_allowed can be updated in parallel
- * and depending on the new value an operation can fail potentially causing
- * process failure. A retry loop with get_mems_allowed and put_mems_allowed
- * prevents these artificial failures.
+ * read_mems_allowed_begin is required when making decisions involving
+ * mems_allowed such as during page allocation. mems_allowed can be updated in
+ * parallel and depending on the new value an operation can fail potentially
+ * causing process failure. A retry loop with read_mems_allowed_begin and
+ * read_mems_allowed_retry prevents these artificial failures.
*/
-static inline unsigned int get_mems_allowed(void)
+static inline unsigned int read_mems_allowed_begin(void)
{
return read_seqcount_begin(&current->mems_allowed_seq);
}
/*
- * If this returns false, the operation that took place after get_mems_allowed
- * may have failed. It is up to the caller to retry the operation if
+ * If this returns true, the operation that took place after
+ * read_mems_allowed_begin may have failed artificially due to a concurrent
+ * update of mems_allowed. It is up to the caller to retry the operation if
* appropriate.
*/
-static inline bool put_mems_allowed(unsigned int seq)
+static inline bool read_mems_allowed_retry(unsigned int seq)
{
- return !read_seqcount_retry(&current->mems_allowed_seq, seq);
+ return read_seqcount_retry(&current->mems_allowed_seq, seq);
}
static inline void set_mems_allowed(nodemask_t nodemask)
@@ -119,6 +141,8 @@ static inline void set_mems_allowed(nodemask_t nodemask)
#else /* !CONFIG_CPUSETS */
+static inline bool cpusets_enabled(void) { return false; }
+
static inline int cpuset_init(void) { return 0; }
static inline void cpuset_init_smp(void) {}
@@ -221,14 +245,14 @@ static inline void set_mems_allowed(nodemask_t nodemask)
{
}
-static inline unsigned int get_mems_allowed(void)
+static inline unsigned int read_mems_allowed_begin(void)
{
return 0;
}
-static inline bool put_mems_allowed(unsigned int seq)
+static inline bool read_mems_allowed_retry(unsigned int seq)
{
- return true;
+ return false;
}
#endif /* !CONFIG_CPUSETS */
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 9b4dd491f7e8..fa7ac989ff56 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -364,8 +364,8 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
extern void __free_pages(struct page *page, unsigned int order);
extern void free_pages(unsigned long addr, unsigned int order);
-extern void free_hot_cold_page(struct page *page, int cold);
-extern void free_hot_cold_page_list(struct list_head *list, int cold);
+extern void free_hot_cold_page(struct page *page, bool cold);
+extern void free_hot_cold_page_list(struct list_head *list, bool cold);
extern void __free_memcg_kmem_pages(struct page *page, unsigned int order);
extern void free_memcg_kmem_pages(unsigned long addr, unsigned int order);
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index a291552ab767..aac671be9581 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -92,10 +92,6 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
#endif /* CONFIG_DEBUG_VM */
extern unsigned long transparent_hugepage_flags;
-extern int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- pmd_t *dst_pmd, pmd_t *src_pmd,
- struct vm_area_struct *vma,
- unsigned long addr, unsigned long end);
extern int split_huge_page_to_list(struct page *page, struct list_head *list);
static inline int split_huge_page(struct page *page)
{
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 5214ff63c351..511b1a0d6cc2 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -396,6 +396,16 @@ static inline int hugepage_migration_support(struct hstate *h)
#endif
}
+static inline bool hugepages_supported(void)
+{
+ /*
+ * Some platform decide whether they support huge pages at boot
+ * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
+ * there is no such support
+ */
+ return HPAGE_SHIFT != 0;
+}
+
#else /* CONFIG_HUGETLB_PAGE */
struct hstate {};
#define alloc_huge_page_node(h, nid) NULL
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index 006627b7c81d..c5f7962a436e 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -63,6 +63,10 @@ struct static_key {
# include <asm/jump_label.h>
# define HAVE_JUMP_LABEL
+#else
+struct static_key {
+ atomic_t enabled;
+};
#endif /* CC_HAVE_ASM_GOTO && CONFIG_JUMP_LABEL */
enum jump_label_type {
@@ -73,6 +77,12 @@ enum jump_label_type {
struct module;
#include <linux/atomic.h>
+
+static inline int static_key_count(struct static_key *key)
+{
+ return atomic_read(&key->enabled);
+}
+
#ifdef HAVE_JUMP_LABEL
#define JUMP_LABEL_TRUE_BRANCH 1UL
@@ -123,24 +133,20 @@ extern void jump_label_apply_nops(struct module *mod);
#else /* !HAVE_JUMP_LABEL */
-struct static_key {
- atomic_t enabled;
-};
-
static __always_inline void jump_label_init(void)
{
}
static __always_inline bool static_key_false(struct static_key *key)
{
- if (unlikely(atomic_read(&key->enabled)) > 0)
+ if (unlikely(static_key_count(key) > 0))
return true;
return false;
}
static __always_inline bool static_key_true(struct static_key *key)
{
- if (likely(atomic_read(&key->enabled)) > 0)
+ if (likely(static_key_count(key) > 0))
return true;
return false;
}
@@ -180,7 +186,7 @@ static inline int jump_label_apply_nops(struct module *mod)
static inline bool static_key_enabled(struct static_key *key)
{
- return (atomic_read(&key->enabled) > 0);
+ return static_key_count(key) > 0;
}
#endif /* _LINUX_JUMP_LABEL_H */
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index ee8b14ae4f3f..449905ebcab3 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -5,7 +5,9 @@
#include <linux/mempolicy.h>
#include <linux/migrate_mode.h>
-typedef struct page *new_page_t(struct page *, unsigned long private, int **);
+typedef struct page *new_page_t(struct page *page, unsigned long private,
+ int **reason);
+typedef void free_page_t(struct page *page, unsigned long private);
/*
* Return values from addresss_space_operations.migratepage():
@@ -39,7 +41,7 @@ extern void putback_lru_pages(struct list_head *l);
extern void putback_movable_pages(struct list_head *l);
extern int migrate_page(struct address_space *,
struct page *, struct page *, enum migrate_mode);
-extern int migrate_pages(struct list_head *l, new_page_t x,
+extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
unsigned long private, enum migrate_mode mode, int reason);
extern int fail_migrate_page(struct address_space *,
@@ -61,8 +63,9 @@ extern int migrate_page_move_mapping(struct address_space *mapping,
static inline void putback_lru_pages(struct list_head *l) {}
static inline void putback_movable_pages(struct list_head *l) {}
-static inline int migrate_pages(struct list_head *l, new_page_t x,
- unsigned long private, enum migrate_mode mode, int reason)
+static inline int migrate_pages(struct list_head *l, new_page_t new,
+ free_page_t free, unsigned long private, enum migrate_mode mode,
+ int reason)
{ return -ENOSYS; }
static inline int migrate_prep(void) { return -ENOSYS; }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2acbab47d043..4fe2d5edb079 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -919,6 +919,14 @@ extern void show_free_areas(unsigned int flags);
extern bool skip_free_areas_node(unsigned int flags, int nid);
int shmem_zero_setup(struct vm_area_struct *);
+#ifdef CONFIG_SHMEM
+bool shmem_mapping(struct address_space *mapping);
+#else
+static inline bool shmem_mapping(struct address_space *mapping)
+{
+ return false;
+}
+#endif
extern int can_do_mlock(void);
extern int user_shm_lock(size_t, struct user_struct *);
@@ -1655,9 +1663,6 @@ void page_cache_async_readahead(struct address_space *mapping,
unsigned long size);
unsigned long max_sane_readahead(unsigned long nr);
-unsigned long ra_submit(struct file_ra_state *ra,
- struct address_space *mapping,
- struct file *filp);
/* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index d87823c34843..a9c83ee9d6c3 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -329,9 +329,9 @@ struct mm_rss_stat {
struct kioctx_table;
struct mm_struct {
- struct vm_area_struct * mmap; /* list of VMAs */
+ struct vm_area_struct *mmap; /* list of VMAs */
struct rb_root mm_rb;
- struct vm_area_struct * mmap_cache; /* last find_vma result */
+ u32 vmacache_seqnum; /* per-thread vmacache */
#ifdef CONFIG_MMU
unsigned long (*get_unmapped_area) (struct file *filp,
unsigned long addr, unsigned long len,
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 56482904a676..450f19c5c865 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -78,10 +78,15 @@ extern int page_group_by_mobility_disabled;
#define NR_MIGRATETYPE_BITS (PB_migrate_end - PB_migrate + 1)
#define MIGRATETYPE_MASK ((1UL << NR_MIGRATETYPE_BITS) - 1)
-static inline int get_pageblock_migratetype(struct page *page)
+#define get_pageblock_migratetype(page) \
+ get_pfnblock_flags_mask(page, page_to_pfn(page), \
+ PB_migrate_end, MIGRATETYPE_MASK)
+
+static inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
{
BUILD_BUG_ON(PB_migrate_end - PB_migrate != 2);
- return get_pageblock_flags_mask(page, PB_migrate_end, MIGRATETYPE_MASK);
+ return get_pfnblock_flags_mask(page, pfn, PB_migrate_end,
+ MIGRATETYPE_MASK);
}
struct free_area {
@@ -138,6 +143,7 @@ enum zone_stat_item {
NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */
NR_DIRTIED, /* page dirtyings since bootup */
NR_WRITTEN, /* page writings since bootup */
+ NR_PAGES_SCANNED, /* pages scanned since last reclaim */
#ifdef CONFIG_NUMA
NUMA_HIT, /* allocated in intended node */
NUMA_MISS, /* allocated in non intended node */
@@ -316,19 +322,12 @@ enum zone_type {
#ifndef __GENERATING_BOUNDS_H
struct zone {
- /* Fields commonly accessed by the page allocator */
+ /* Read-mostly fields */
/* zone watermarks, access with *_wmark_pages(zone) macros */
unsigned long watermark[NR_WMARK];
/*
- * When free pages are below this point, additional steps are taken
- * when reading the number of free pages to avoid per-cpu counter
- * drift allowing watermarks to be breached
- */
- unsigned long percpu_drift_mark;
-
- /*
* We don't know if the memory that we're going to allocate will be freeable
* or/and it will be released eventually, so to avoid totally wasting several
* GB of ram we must reserve some of the lower zone memory (otherwise we risk
@@ -336,40 +335,26 @@ struct zone {
* on the higher zones). This array is recalculated at runtime if the
* sysctl_lowmem_reserve_ratio sysctl changes.
*/
- unsigned long lowmem_reserve[MAX_NR_ZONES];
-
- /*
- * This is a per-zone reserve of pages that should not be
- * considered dirtyable memory.
- */
- unsigned long dirty_balance_reserve;
+ long lowmem_reserve[MAX_NR_ZONES];
#ifdef CONFIG_NUMA
int node;
+#endif
+
/*
- * zone reclaim becomes active if more unmapped pages exist.
+ * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
+ * this zone's LRU. Maintained by the pageout code.
*/
- unsigned long min_unmapped_pages;
- unsigned long min_slab_pages;
-#endif
+ unsigned int inactive_ratio;
+
+ struct pglist_data *zone_pgdat;
struct per_cpu_pageset __percpu *pageset;
+
/*
- * free areas of different sizes
+ * This is a per-zone reserve of pages that should not be
+ * considered dirtyable memory.
*/
- spinlock_t lock;
-#if defined CONFIG_COMPACTION || defined CONFIG_CMA
- /* Set to true when the PG_migrate_skip bits should be cleared */
- bool compact_blockskip_flush;
-
- /* pfns where compaction scanners should start */
- unsigned long compact_cached_free_pfn;
- unsigned long compact_cached_migrate_pfn;
-#endif
-#ifdef CONFIG_MEMORY_HOTPLUG
- /* see spanned/present_pages for more description */
- seqlock_t span_seqlock;
-#endif
- struct free_area free_area[MAX_ORDER];
+ unsigned long dirty_balance_reserve;
#ifndef CONFIG_SPARSEMEM
/*
@@ -379,71 +364,14 @@ struct zone {
unsigned long *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
-#ifdef CONFIG_COMPACTION
- /*
- * On compaction failure, 1<<compact_defer_shift compactions
- * are skipped before trying again. The number attempted since
- * last failure is tracked with compact_considered.
- */
- unsigned int compact_considered;
- unsigned int compact_defer_shift;
- int compact_order_failed;
-#endif
-
- ZONE_PADDING(_pad1_)
-
- /* Fields commonly accessed by the page reclaim scanner */
- spinlock_t lru_lock;
- struct lruvec lruvec;
-
- unsigned long pages_scanned; /* since last reclaim */
- unsigned long flags; /* zone flags, see below */
-
- /* Zone statistics */
- atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
-
- /*
- * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
- * this zone's LRU. Maintained by the pageout code.
- */
- unsigned int inactive_ratio;
-
-
- ZONE_PADDING(_pad2_)
- /* Rarely used or read-mostly fields */
-
+#ifdef CONFIG_NUMA
/*
- * wait_table -- the array holding the hash table
- * wait_table_hash_nr_entries -- the size of the hash table array
- * wait_table_bits -- wait_table_size == (1 << wait_table_bits)
- *
- * The purpose of all these is to keep track of the people
- * waiting for a page to become available and make them
- * runnable again when possible. The trouble is that this
- * consumes a lot of space, especially when so few things
- * wait on pages at a given time. So instead of using
- * per-page waitqueues, we use a waitqueue hash table.
- *
- * The bucket discipline is to sleep on the same queue when
- * colliding and wake all in that wait queue when removing.
- * When something wakes, it must check to be sure its page is
- * truly available, a la thundering herd. The cost of a
- * collision is great, but given the expected load of the
- * table, they should be so rare as to be outweighed by the
- * benefits from the saved space.
- *
- * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
- * primary users of these fields, and in mm/page_alloc.c
- * free_area_init_core() performs the initialization of them.
+ * zone reclaim becomes active if more unmapped pages exist.
*/
- wait_queue_head_t * wait_table;
- unsigned long wait_table_hash_nr_entries;
- unsigned long wait_table_bits;
+ unsigned long min_unmapped_pages;
+ unsigned long min_slab_pages;
+#endif /* CONFIG_NUMA */
- /*
- * Discontig memory support fields.
- */
- struct pglist_data *zone_pgdat;
/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
unsigned long zone_start_pfn;
@@ -489,14 +417,103 @@ struct zone {
* adjust_managed_page_count() should be used instead of directly
* touching zone->managed_pages and totalram_pages.
*/
+ unsigned long managed_pages;
unsigned long spanned_pages;
unsigned long present_pages;
- unsigned long managed_pages;
+
+ const char *name;
/*
- * rarely used fields:
+ * Number of MIGRATE_RESEVE page block. To maintain for just
+ * optimization. Protected by zone->lock.
*/
- const char *name;
+ int nr_migrate_reserve_block;
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+ /* see spanned/present_pages for more description */
+ seqlock_t span_seqlock;
+#endif
+
+ /*
+ * wait_table -- the array holding the hash table
+ * wait_table_hash_nr_entries -- the size of the hash table array
+ * wait_table_bits -- wait_table_size == (1 << wait_table_bits)
+ *
+ * The purpose of all these is to keep track of the people
+ * waiting for a page to become available and make them
+ * runnable again when possible. The trouble is that this
+ * consumes a lot of space, especially when so few things
+ * wait on pages at a given time. So instead of using
+ * per-page waitqueues, we use a waitqueue hash table.
+ *
+ * The bucket discipline is to sleep on the same queue when
+ * colliding and wake all in that wait queue when removing.
+ * When something wakes, it must check to be sure its page is
+ * truly available, a la thundering herd. The cost of a
+ * collision is great, but given the expected load of the
+ * table, they should be so rare as to be outweighed by the
+ * benefits from the saved space.
+ *
+ * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
+ * primary users of these fields, and in mm/page_alloc.c
+ * free_area_init_core() performs the initialization of them.
+ */
+ wait_queue_head_t *wait_table;
+ unsigned long wait_table_hash_nr_entries;
+ unsigned long wait_table_bits;
+
+ ZONE_PADDING(_pad1_)
+
+ /* Write-intensive fields used from the page allocator */
+ spinlock_t lock;
+
+ /* free areas of different sizes */
+ struct free_area free_area[MAX_ORDER];
+
+ /* zone flags, see below */
+ unsigned long flags;
+
+ ZONE_PADDING(_pad2_)
+
+ /* Write-intensive fields used by page reclaim */
+
+ /* Fields commonly accessed by the page reclaim scanner */
+ spinlock_t lru_lock;
+ struct lruvec lruvec;
+
+ /*
+ * When free pages are below this point, additional steps are taken
+ * when reading the number of free pages to avoid per-cpu counter
+ * drift allowing watermarks to be breached
+ */
+ unsigned long percpu_drift_mark;
+
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+ /* pfn where compaction free scanner should start */
+ unsigned long compact_cached_free_pfn;
+ /* pfn where async and sync compaction migration scanner should start */
+ unsigned long compact_cached_migrate_pfn[2];
+#endif
+
+#ifdef CONFIG_COMPACTION
+ /*
+ * On compaction failure, 1<<compact_defer_shift compactions
+ * are skipped before trying again. The number attempted since
+ * last failure is tracked with compact_considered.
+ */
+ unsigned int compact_considered;
+ unsigned int compact_defer_shift;
+ int compact_order_failed;
+#endif
+
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+ /* Set to true when the PG_migrate_skip bits should be cleared */
+ bool compact_blockskip_flush;
+#endif
+
+ ZONE_PADDING(_pad3_)
+ /* Zone statistics */
+ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
} ____cacheline_internodealigned_in_smp;
typedef enum {
@@ -512,6 +529,7 @@ typedef enum {
ZONE_WRITEBACK, /* reclaim scanning has recently found
* many pages under writeback
*/
+ ZONE_FAIR_DEPLETED, /* fair zone policy batch depleted */
} zone_flags_t;
static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
@@ -549,6 +567,11 @@ static inline int zone_is_reclaim_locked(const struct zone *zone)
return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
}
+static inline int zone_is_fair_depleted(const struct zone *zone)
+{
+ return test_bit(ZONE_FAIR_DEPLETED, &zone->flags);
+}
+
static inline int zone_is_oom_locked(const struct zone *zone)
{
return test_bit(ZONE_OOM_LOCKED, &zone->flags);
@@ -803,10 +826,10 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat)
extern struct mutex zonelists_mutex;
void build_all_zonelists(pg_data_t *pgdat, struct zone *zone);
void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
-bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
- int classzone_idx, int alloc_flags);
-bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
- int classzone_idx, int alloc_flags);
+bool zone_watermark_ok(struct zone *z, unsigned int order,
+ unsigned long mark, int classzone_idx, int alloc_flags);
+bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
+ unsigned long mark, int classzone_idx, int alloc_flags);
enum memmap_context {
MEMMAP_EARLY,
MEMMAP_HOTPLUG,
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index dd7d45b5c496..2284ea62c6cc 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -198,6 +198,7 @@ struct page; /* forward declaration */
TESTPAGEFLAG(Locked, locked)
PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error)
PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced)
+ __SETPAGEFLAG(Referenced, referenced)
PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty)
PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru)
PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active)
@@ -208,6 +209,7 @@ PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */
PAGEFLAG(SavePinned, savepinned); /* Xen */
PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked)
+ __SETPAGEFLAG(SwapBacked, swapbacked)
__PAGEFLAG(SlobFree, slob_free)
@@ -228,9 +230,9 @@ PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1)
TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback)
PAGEFLAG(MappedToDisk, mappedtodisk)
-/* PG_readahead is only used for file reads; PG_reclaim is only for writes */
+/* PG_readahead is only used for reads; PG_reclaim is only for writes */
PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim)
-PAGEFLAG(Readahead, reclaim) /* Reminder to do async read-ahead */
+PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim)
#ifdef CONFIG_HIGHMEM
/*
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index c08730c10c7a..2baeee12f48e 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -65,33 +65,26 @@ extern int pageblock_order;
/* Forward declaration */
struct page;
-unsigned long get_pageblock_flags_mask(struct page *page,
+unsigned long get_pfnblock_flags_mask(struct page *page,
+ unsigned long pfn,
unsigned long end_bitidx,
unsigned long mask);
-void set_pageblock_flags_mask(struct page *page,
+
+void set_pfnblock_flags_mask(struct page *page,
unsigned long flags,
+ unsigned long pfn,
unsigned long end_bitidx,
unsigned long mask);
/* Declarations for getting and setting flags. See mm/page_alloc.c */
-static inline unsigned long get_pageblock_flags_group(struct page *page,
- int start_bitidx, int end_bitidx)
-{
- unsigned long nr_flag_bits = end_bitidx - start_bitidx + 1;
- unsigned long mask = (1 << nr_flag_bits) - 1;
-
- return get_pageblock_flags_mask(page, end_bitidx, mask);
-}
-
-static inline void set_pageblock_flags_group(struct page *page,
- unsigned long flags,
- int start_bitidx, int end_bitidx)
-{
- unsigned long nr_flag_bits = end_bitidx - start_bitidx + 1;
- unsigned long mask = (1 << nr_flag_bits) - 1;
-
- set_pageblock_flags_mask(page, flags, end_bitidx, mask);
-}
+#define get_pageblock_flags_group(page, start_bitidx, end_bitidx) \
+ get_pfnblock_flags_mask(page, page_to_pfn(page), \
+ end_bitidx, \
+ (1 << (end_bitidx - start_bitidx + 1)) - 1)
+#define set_pageblock_flags_group(page, flags, start_bitidx, end_bitidx) \
+ set_pfnblock_flags_mask(page, flags, page_to_pfn(page), \
+ end_bitidx, \
+ (1 << (end_bitidx - start_bitidx + 1)) - 1)
#ifdef CONFIG_COMPACTION
#define get_pageblock_skip(page) \
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index e3dea75a078b..d57a02a9747b 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -99,7 +99,7 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
#define page_cache_get(page) get_page(page)
#define page_cache_release(page) put_page(page)
-void release_pages(struct page **pages, int nr, int cold);
+void release_pages(struct page **pages, int nr, bool cold);
/*
* speculatively take a reference to a page.
@@ -243,12 +243,117 @@ static inline struct page *page_cache_alloc_readahead(struct address_space *x)
typedef int filler_t(void *, struct page *);
-extern struct page * find_get_page(struct address_space *mapping,
- pgoff_t index);
-extern struct page * find_lock_page(struct address_space *mapping,
- pgoff_t index);
-extern struct page * find_or_create_page(struct address_space *mapping,
- pgoff_t index, gfp_t gfp_mask);
+pgoff_t page_cache_next_hole(struct address_space *mapping,
+ pgoff_t index, unsigned long max_scan);
+pgoff_t page_cache_prev_hole(struct address_space *mapping,
+ pgoff_t index, unsigned long max_scan);
+
+#define FGP_ACCESSED 0x00000001
+#define FGP_LOCK 0x00000002
+#define FGP_CREAT 0x00000004
+#define FGP_WRITE 0x00000008
+#define FGP_NOFS 0x00000010
+#define FGP_NOWAIT 0x00000020
+
+struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
+ int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask);
+
+/**
+ * find_get_page - find and get a page reference
+ * @mapping: the address_space to search
+ * @offset: the page index
+ *
+ * Looks up the page cache slot at @mapping & @offset. If there is a
+ * page cache page, it is returned with an increased refcount.
+ *
+ * Otherwise, %NULL is returned.
+ */
+static inline struct page *find_get_page(struct address_space *mapping,
+ pgoff_t offset)
+{
+ return pagecache_get_page(mapping, offset, 0, 0, 0);
+}
+
+static inline struct page *find_get_page_flags(struct address_space *mapping,
+ pgoff_t offset, int fgp_flags)
+{
+ return pagecache_get_page(mapping, offset, fgp_flags, 0, 0);
+}
+
+/**
+ * find_lock_page - locate, pin and lock a pagecache page
+ * pagecache_get_page - find and get a page reference
+ * @mapping: the address_space to search
+ * @offset: the page index
+ *
+ * Looks up the page cache slot at @mapping & @offset. If there is a
+ * page cache page, it is returned locked and with an increased
+ * refcount.
+ *
+ * Otherwise, %NULL is returned.
+ *
+ * find_lock_page() may sleep.
+ */
+static inline struct page *find_lock_page(struct address_space *mapping,
+ pgoff_t offset)
+{
+ return pagecache_get_page(mapping, offset, FGP_LOCK, 0, 0);
+}
+
+/**
+ * find_or_create_page - locate or add a pagecache page
+ * @mapping: the page's address_space
+ * @index: the page's index into the mapping
+ * @gfp_mask: page allocation mode
+ *
+ * Looks up the page cache slot at @mapping & @offset. If there is a
+ * page cache page, it is returned locked and with an increased
+ * refcount.
+ *
+ * If the page is not present, a new page is allocated using @gfp_mask
+ * and added to the page cache and the VM's LRU list. The page is
+ * returned locked and with an increased refcount.
+ *
+ * On memory exhaustion, %NULL is returned.
+ *
+ * find_or_create_page() may sleep, even if @gfp_flags specifies an
+ * atomic allocation!
+ */
+static inline struct page *find_or_create_page(struct address_space *mapping,
+ pgoff_t offset, gfp_t gfp_mask)
+{
+ return pagecache_get_page(mapping, offset,
+ FGP_LOCK|FGP_ACCESSED|FGP_CREAT,
+ gfp_mask, gfp_mask & GFP_RECLAIM_MASK);
+}
+
+/**
+ * grab_cache_page_nowait - returns locked page at given index in given cache
+ * @mapping: target address_space
+ * @index: the page index
+ *
+ * Same as grab_cache_page(), but do not wait if the page is unavailable.
+ * This is intended for speculative data generators, where the data can
+ * be regenerated if the page couldn't be grabbed. This routine should
+ * be safe to call while holding the lock for another page.
+ *
+ * Clear __GFP_FS when allocating the page to avoid recursion into the fs
+ * and deadlock against the caller's locked page.
+ */
+static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
+ pgoff_t index)
+{
+ return pagecache_get_page(mapping, index,
+ FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT,
+ mapping_gfp_mask(mapping),
+ GFP_NOFS);
+}
+
+struct page *find_get_entry(struct address_space *mapping, pgoff_t offset);
+struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset);
+unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
+ unsigned int nr_entries, struct page **entries,
+ pgoff_t *indices);
unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
unsigned int nr_pages, struct page **pages);
unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
@@ -268,10 +373,6 @@ static inline struct page *grab_cache_page(struct address_space *mapping,
return find_or_create_page(mapping, index, mapping_gfp_mask(mapping));
}
-extern struct page * grab_cache_page_nowait(struct address_space *mapping,
- pgoff_t index);
-extern struct page * read_cache_page_async(struct address_space *mapping,
- pgoff_t index, filler_t *filler, void *data);
extern struct page * read_cache_page(struct address_space *mapping,
pgoff_t index, filler_t *filler, void *data);
extern struct page * read_cache_page_gfp(struct address_space *mapping,
@@ -279,14 +380,6 @@ extern struct page * read_cache_page_gfp(struct address_space *mapping,
extern int read_cache_pages(struct address_space *mapping,
struct list_head *pages, filler_t *filler, void *data);
-static inline struct page *read_mapping_page_async(
- struct address_space *mapping,
- pgoff_t index, void *data)
-{
- filler_t *filler = (filler_t *)mapping->a_ops->readpage;
- return read_cache_page_async(mapping, index, filler, data);
-}
-
static inline struct page *read_mapping_page(struct address_space *mapping,
pgoff_t index, void *data)
{
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index e4dbfab37729..b45d391b4540 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -22,6 +22,11 @@ struct pagevec {
void __pagevec_release(struct pagevec *pvec);
void __pagevec_lru_add(struct pagevec *pvec);
+unsigned pagevec_lookup_entries(struct pagevec *pvec,
+ struct address_space *mapping,
+ pgoff_t start, unsigned nr_entries,
+ pgoff_t *indices);
+void pagevec_remove_exceptionals(struct pagevec *pvec);
unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
pgoff_t start, unsigned nr_pages);
unsigned pagevec_lookup_tag(struct pagevec *pvec,
diff --git a/include/linux/plist.h b/include/linux/plist.h
index aa0fb390bd29..8b6c970cff6c 100644
--- a/include/linux/plist.h
+++ b/include/linux/plist.h
@@ -98,6 +98,13 @@ struct plist_node {
}
/**
+ * PLIST_HEAD - declare and init plist_head
+ * @head: name for struct plist_head variable
+ */
+#define PLIST_HEAD(head) \
+ struct plist_head head = PLIST_HEAD_INIT(head)
+
+/**
* PLIST_NODE_INIT - static struct plist_node initializer
* @node: struct plist_node variable name
* @__prio: initial node priority
@@ -134,6 +141,8 @@ static inline void plist_node_init(struct plist_node *node, int prio)
extern void plist_add(struct plist_node *node, struct plist_head *head);
extern void plist_del(struct plist_node *node, struct plist_head *head);
+extern void plist_requeue(struct plist_node *node, struct plist_head *head);
+
/**
* plist_for_each - iterate over the plist
* @pos: the type * to use as a loop counter
@@ -143,6 +152,16 @@ extern void plist_del(struct plist_node *node, struct plist_head *head);
list_for_each_entry(pos, &(head)->node_list, node_list)
/**
+ * plist_for_each_continue - continue iteration over the plist
+ * @pos: the type * to use as a loop cursor
+ * @head: the head for your list
+ *
+ * Continue to iterate over plist, continuing after the current position.
+ */
+#define plist_for_each_continue(pos, head) \
+ list_for_each_entry_continue(pos, &(head)->node_list, node_list)
+
+/**
* plist_for_each_safe - iterate safely over a plist of given type
* @pos: the type * to use as a loop counter
* @n: another type * to use as temporary storage
@@ -163,6 +182,18 @@ extern void plist_del(struct plist_node *node, struct plist_head *head);
list_for_each_entry(pos, &(head)->node_list, mem.node_list)
/**
+ * plist_for_each_entry_continue - continue iteration over list of given type
+ * @pos: the type * to use as a loop cursor
+ * @head: the head for your list
+ * @m: the name of the list_struct within the struct
+ *
+ * Continue to iterate over list of given type, continuing after
+ * the current position.
+ */
+#define plist_for_each_entry_continue(pos, head, m) \
+ list_for_each_entry_continue(pos, &(head)->node_list, m.node_list)
+
+/**
* plist_for_each_entry_safe - iterate safely over list of given type
* @pos: the type * to use as a loop counter
* @n: another type * to use as temporary storage
@@ -229,6 +260,20 @@ static inline int plist_node_empty(const struct plist_node *node)
#endif
/**
+ * plist_next - get the next entry in list
+ * @pos: the type * to cursor
+ */
+#define plist_next(pos) \
+ list_next_entry(pos, node_list)
+
+/**
+ * plist_prev - get the prev entry in list
+ * @pos: the type * to cursor
+ */
+#define plist_prev(pos) \
+ list_prev_entry(pos, node_list)
+
+/**
* plist_first - return the first node (and thus, highest priority)
* @head: the &struct plist_head pointer
*
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 5b6d5b2ec926..ab8263efa637 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -219,6 +219,7 @@ static inline void radix_tree_replace_slot(void **pslot, void *item)
int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
+void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
void *radix_tree_delete(struct radix_tree_root *, unsigned long);
unsigned int
radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
@@ -226,10 +227,6 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
void ***results, unsigned long *indices,
unsigned long first_index, unsigned int max_items);
-unsigned long radix_tree_next_hole(struct radix_tree_root *root,
- unsigned long index, unsigned long max_scan);
-unsigned long radix_tree_prev_hole(struct radix_tree_root *root,
- unsigned long index, unsigned long max_scan);
#ifndef CONFIG_PREEMPT_RT_FULL
int radix_tree_preload(gfp_t gfp_mask);
int radix_tree_maybe_preload(gfp_t gfp_mask);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 727259911ba9..59f54e167d15 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -65,6 +65,10 @@ struct fs_struct;
struct perf_event_context;
struct blk_plug;
+#define VMACACHE_BITS 2
+#define VMACACHE_SIZE (1U << VMACACHE_BITS)
+#define VMACACHE_MASK (VMACACHE_SIZE - 1)
+
/*
* List of flags we want to share for kernel threads,
* if only because they are not used by them anyway.
@@ -1099,6 +1103,9 @@ struct task_struct {
#ifdef CONFIG_COMPAT_BRK
unsigned brk_randomized:1;
#endif
+ /* per-thread vma caching */
+ u32 vmacache_seqnum;
+ struct vm_area_struct *vmacache[VMACACHE_SIZE];
#if defined(SPLIT_RSS_COUNTING)
struct task_rss_stat rss_stat;
#endif
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 30aa0dc60d75..deb49609cd36 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -49,6 +49,7 @@ extern struct file *shmem_file_setup(const char *name,
loff_t size, unsigned long flags);
extern int shmem_zero_setup(struct vm_area_struct *);
extern int shmem_lock(struct file *file, int lock, struct user_struct *user);
+extern bool shmem_mapping(struct address_space *mapping);
extern void shmem_unlock_mapping(struct address_space *mapping);
extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
pgoff_t index, gfp_t gfp_mask);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 46ba0c6c219f..241bf0922770 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -214,8 +214,9 @@ struct percpu_cluster {
struct swap_info_struct {
unsigned long flags; /* SWP_USED etc: see above */
signed short prio; /* swap priority of this type */
+ struct plist_node list; /* entry in swap_active_head */
+ struct plist_node avail_list; /* entry in swap_avail_head */
signed char type; /* strange name for an index */
- signed char next; /* next type on the swap list */
unsigned int max; /* extent of the swap_map */
unsigned char *swap_map; /* vmalloc'ed array of usage counts */
struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
@@ -255,11 +256,6 @@ struct swap_info_struct {
struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */
};
-struct swap_list_t {
- int head; /* head of priority-ordered swapfile list */
- int next; /* swapfile to be used next */
-};
-
/* linux/mm/page_alloc.c */
extern unsigned long totalram_pages;
extern unsigned long totalreserve_pages;
@@ -272,12 +268,14 @@ extern unsigned long nr_free_pagecache_pages(void);
/* linux/mm/swap.c */
-extern void __lru_cache_add(struct page *);
extern void lru_cache_add(struct page *);
+extern void lru_cache_add_anon(struct page *page);
+extern void lru_cache_add_file(struct page *page);
extern void lru_add_page_tail(struct page *page, struct page *page_tail,
struct lruvec *lruvec, struct list_head *head);
extern void activate_page(struct page *);
extern void mark_page_accessed(struct page *);
+extern void init_page_accessed(struct page *page);
extern void lru_add_drain(void);
extern void lru_add_drain_cpu(int cpu);
extern void lru_add_drain_all(void);
@@ -287,22 +285,6 @@ extern void swap_setup(void);
extern void add_page_to_unevictable_list(struct page *page);
-/**
- * lru_cache_add: add a page to the page lists
- * @page: the page to add
- */
-static inline void lru_cache_add_anon(struct page *page)
-{
- ClearPageActive(page);
- __lru_cache_add(page);
-}
-
-static inline void lru_cache_add_file(struct page *page)
-{
- ClearPageActive(page);
- __lru_cache_add(page);
-}
-
/* linux/mm/vmscan.c */
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *mask);
@@ -460,7 +442,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
#define free_page_and_swap_cache(page) \
page_cache_release(page)
#define free_pages_and_swap_cache(pages, nr) \
- release_pages((pages), (nr), 0);
+ release_pages((pages), (nr), false);
static inline void show_swap_cache_info(void)
{
diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
index e282624e8c10..388293a91e8c 100644
--- a/include/linux/swapfile.h
+++ b/include/linux/swapfile.h
@@ -6,7 +6,7 @@
* want to expose them to the dozens of source files that include swap.h
*/
extern spinlock_t swap_lock;
-extern struct swap_list_t swap_list;
+extern struct plist_head swap_active_head;
extern struct swap_info_struct *swap_info[];
extern int try_to_unuse(unsigned int, bool, unsigned long);
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index c557c6d096de..3a712e2e7d76 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -71,12 +71,14 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
THP_ZERO_PAGE_ALLOC,
THP_ZERO_PAGE_ALLOC_FAILED,
#endif
+#ifdef CONFIG_DEBUG_TLBFLUSH
#ifdef CONFIG_SMP
NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */
NR_TLB_REMOTE_FLUSH_RECEIVED,/* cpu received ipi for flush */
-#endif
+#endif /* CONFIG_SMP */
NR_TLB_LOCAL_FLUSH_ALL,
NR_TLB_LOCAL_FLUSH_ONE,
+#endif /* CONFIG_DEBUG_TLBFLUSH */
NR_VM_EVENT_ITEMS
};
diff --git a/include/linux/vmacache.h b/include/linux/vmacache.h
new file mode 100644
index 000000000000..c3fa0fd43949
--- /dev/null
+++ b/include/linux/vmacache.h
@@ -0,0 +1,38 @@
+#ifndef __LINUX_VMACACHE_H
+#define __LINUX_VMACACHE_H
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+
+/*
+ * Hash based on the page number. Provides a good hit rate for
+ * workloads with good locality and those with random accesses as well.
+ */
+#define VMACACHE_HASH(addr) ((addr >> PAGE_SHIFT) & VMACACHE_MASK)
+
+static inline void vmacache_flush(struct task_struct *tsk)
+{
+ memset(tsk->vmacache, 0, sizeof(tsk->vmacache));
+}
+
+extern void vmacache_flush_all(struct mm_struct *mm);
+extern void vmacache_update(unsigned long addr, struct vm_area_struct *newvma);
+extern struct vm_area_struct *vmacache_find(struct mm_struct *mm,
+ unsigned long addr);
+
+#ifndef CONFIG_MMU
+extern struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end);
+#endif
+
+static inline void vmacache_invalidate(struct mm_struct *mm)
+{
+ mm->vmacache_seqnum++;
+
+ /* deal with overflows */
+ if (unlikely(mm->vmacache_seqnum == 0))
+ vmacache_flush_all(mm);
+}
+
+#endif /* __LINUX_VMACACHE_H */
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 1ea2fd5cc7ee..fe3704379b6f 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -87,6 +87,14 @@ static inline void vm_events_fold_cpu(int cpu)
#define count_vm_numa_events(x, y) do { (void)(y); } while (0)
#endif /* CONFIG_NUMA_BALANCING */
+#ifdef CONFIG_DEBUG_TLBFLUSH
+#define count_vm_tlb_event(x) count_vm_event(x)
+#define count_vm_tlb_events(x, y) count_vm_events(x, y)
+#else
+#define count_vm_tlb_event(x) do {} while (0)
+#define count_vm_tlb_events(x, y) do { (void)(y); } while (0)
+#endif
+
#define __count_zone_vm_events(item, zone, delta) \
__count_vm_events(item##_NORMAL - ZONE_NORMAL + \
zone_idx(zone), delta)
diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
index fde1b3e94c7d..c6814b917bdf 100644
--- a/include/trace/events/compaction.h
+++ b/include/trace/events/compaction.h
@@ -5,6 +5,7 @@
#define _TRACE_COMPACTION_H
#include <linux/types.h>
+#include <linux/list.h>
#include <linux/tracepoint.h>
#include <trace/events/gfpflags.h>
@@ -47,10 +48,11 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages,
TRACE_EVENT(mm_compaction_migratepages,
- TP_PROTO(unsigned long nr_migrated,
- unsigned long nr_failed),
+ TP_PROTO(unsigned long nr_all,
+ int migrate_rc,
+ struct list_head *migratepages),
- TP_ARGS(nr_migrated, nr_failed),
+ TP_ARGS(nr_all, migrate_rc, migratepages),
TP_STRUCT__entry(
__field(unsigned long, nr_migrated)
@@ -58,7 +60,22 @@ TRACE_EVENT(mm_compaction_migratepages,
),
TP_fast_assign(
- __entry->nr_migrated = nr_migrated;
+ unsigned long nr_failed = 0;
+ struct list_head *page_lru;
+
+ /*
+ * migrate_pages() returns either a non-negative number
+ * with the number of pages that failed migration, or an
+ * error code, in which case we need to count the remaining
+ * pages manually
+ */
+ if (migrate_rc >= 0)
+ nr_failed = migrate_rc;
+ else
+ list_for_each(page_lru, migratepages)
+ nr_failed++;
+
+ __entry->nr_migrated = nr_all - nr_failed;
__entry->nr_failed = nr_failed;
),
@@ -67,6 +84,48 @@ TRACE_EVENT(mm_compaction_migratepages,
__entry->nr_failed)
);
+TRACE_EVENT(mm_compaction_begin,
+ TP_PROTO(unsigned long zone_start, unsigned long migrate_start,
+ unsigned long free_start, unsigned long zone_end),
+
+ TP_ARGS(zone_start, migrate_start, free_start, zone_end),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, zone_start)
+ __field(unsigned long, migrate_start)
+ __field(unsigned long, free_start)
+ __field(unsigned long, zone_end)
+ ),
+
+ TP_fast_assign(
+ __entry->zone_start = zone_start;
+ __entry->migrate_start = migrate_start;
+ __entry->free_start = free_start;
+ __entry->zone_end = zone_end;
+ ),
+
+ TP_printk("zone_start=%lu migrate_start=%lu free_start=%lu zone_end=%lu",
+ __entry->zone_start,
+ __entry->migrate_start,
+ __entry->free_start,
+ __entry->zone_end)
+);
+
+TRACE_EVENT(mm_compaction_end,
+ TP_PROTO(int status),
+
+ TP_ARGS(status),
+
+ TP_STRUCT__entry(
+ __field(int, status)
+ ),
+
+ TP_fast_assign(
+ __entry->status = status;
+ ),
+
+ TP_printk("status=%d", __entry->status)
+);
#endif /* _TRACE_COMPACTION_H */
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index d0c613476620..aece1346ceb7 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -267,14 +267,12 @@ DEFINE_EVENT_PRINT(mm_page, mm_page_pcpu_drain,
TRACE_EVENT(mm_page_alloc_extfrag,
TP_PROTO(struct page *page,
- int alloc_order, int fallback_order,
- int alloc_migratetype, int fallback_migratetype,
- int change_ownership),
+ int alloc_order, int fallback_order,
+ int alloc_migratetype, int fallback_migratetype, int new_migratetype),
TP_ARGS(page,
alloc_order, fallback_order,
- alloc_migratetype, fallback_migratetype,
- change_ownership),
+ alloc_migratetype, fallback_migratetype, new_migratetype),
TP_STRUCT__entry(
__field( struct page *, page )
@@ -291,7 +289,7 @@ TRACE_EVENT(mm_page_alloc_extfrag,
__entry->fallback_order = fallback_order;
__entry->alloc_migratetype = alloc_migratetype;
__entry->fallback_migratetype = fallback_migratetype;
- __entry->change_ownership = change_ownership;
+ __entry->change_ownership = (new_migratetype == alloc_migratetype);
),
TP_printk("page=%p pfn=%lu alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d",
diff --git a/include/trace/events/pagemap.h b/include/trace/events/pagemap.h
index 1c9fabde69e4..ce0803b8d05f 100644
--- a/include/trace/events/pagemap.h
+++ b/include/trace/events/pagemap.h
@@ -28,12 +28,10 @@ TRACE_EVENT(mm_lru_insertion,
TP_PROTO(
struct page *page,
- unsigned long pfn,
- int lru,
- unsigned long flags
+ int lru
),
- TP_ARGS(page, pfn, lru, flags),
+ TP_ARGS(page, lru),
TP_STRUCT__entry(
__field(struct page *, page )
@@ -44,9 +42,9 @@ TRACE_EVENT(mm_lru_insertion,
TP_fast_assign(
__entry->page = page;
- __entry->pfn = pfn;
+ __entry->pfn = page_to_pfn(page);
__entry->lru = lru;
- __entry->flags = flags;
+ __entry->flags = trace_pagemap_flags(page);
),
/* Flag format is based on page-types.c formatting for pagemap */
@@ -64,9 +62,9 @@ TRACE_EVENT(mm_lru_insertion,
TRACE_EVENT(mm_lru_activate,
- TP_PROTO(struct page *page, unsigned long pfn),
+ TP_PROTO(struct page *page),
- TP_ARGS(page, pfn),
+ TP_ARGS(page),
TP_STRUCT__entry(
__field(struct page *, page )
@@ -75,7 +73,7 @@ TRACE_EVENT(mm_lru_activate,
TP_fast_assign(
__entry->page = page;
- __entry->pfn = pfn;
+ __entry->pfn = page_to_pfn(page);
),
/* Flag format is based on page-types.c formatting for pagemap */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 0b29c52479a6..c8289138cad4 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,12 +61,7 @@
#include <linux/cgroup.h>
#include <linux/wait.h>
-/*
- * Tracks how many cpusets are currently defined in system.
- * When there is only one cpuset (the root cpuset) we can
- * short circuit some hooks.
- */
-int number_of_cpusets __read_mostly;
+struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
/* See "Frequency meter" comments, below. */
@@ -611,7 +606,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
goto done;
}
- csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
+ csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL);
if (!csa)
goto done;
csn = 0;
@@ -1022,7 +1017,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
task_lock(tsk);
/*
* Determine if a loop is necessary if another thread is doing
- * get_mems_allowed(). If at least one node remains unchanged and
+ * read_mems_allowed_begin(). If at least one node remains unchanged and
* tsk does not have a mempolicy, then an empty nodemask will not be
* possible when mems_allowed is larger than a word.
*/
@@ -1986,7 +1981,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
if (is_spread_slab(parent))
set_bit(CS_SPREAD_SLAB, &cs->flags);
- number_of_cpusets++;
+ cpuset_inc();
if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
goto out_unlock;
@@ -2037,7 +2032,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
if (is_sched_load_balance(cs))
update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
- number_of_cpusets--;
+ cpuset_dec();
clear_bit(CS_ONLINE, &cs->flags);
mutex_unlock(&cpuset_mutex);
@@ -2092,7 +2087,6 @@ int __init cpuset_init(void)
if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
BUG();
- number_of_cpusets = 1;
return 0;
}
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0506d447aed2..e911ec662d03 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -49,6 +49,7 @@
#include <linux/pid.h>
#include <linux/smp.h>
#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/rcupdate.h>
#include <asm/cacheflush.h>
@@ -224,10 +225,17 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
if (!CACHE_FLUSH_IS_SAFE)
return;
- if (current->mm && current->mm->mmap_cache) {
- flush_cache_range(current->mm->mmap_cache,
- addr, addr + BREAK_INSTR_SIZE);
+ if (current->mm) {
+ int i;
+
+ for (i = 0; i < VMACACHE_SIZE; i++) {
+ if (!current->vmacache[i])
+ continue;
+ flush_cache_range(current->vmacache[i],
+ addr, addr + BREAK_INSTR_SIZE);
+ }
}
+
/* Force flush instruction cache if it was outside the mm */
flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 4fcf8239ed95..c3da0a224f34 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -28,6 +28,8 @@
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
#include <linux/cpu.h>
@@ -376,7 +378,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
mm->locked_vm = 0;
mm->mmap = NULL;
- mm->mmap_cache = NULL;
+ mm->vmacache_seqnum = 0;
mm->map_count = 0;
cpumask_clear(mm_cpumask(mm));
mm->mm_rb = RB_ROOT;
@@ -908,6 +910,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
if (!oldmm)
return 0;
+ /* initialize the new vmacache entries */
+ vmacache_flush(tsk);
+
if (clone_flags & CLONE_VM) {
atomic_inc(&oldmm->mm_users);
mm = oldmm;
diff --git a/lib/plist.c b/lib/plist.c
index 1ebc95f7a46f..0f2084d30798 100644
--- a/lib/plist.c
+++ b/lib/plist.c
@@ -134,6 +134,46 @@ void plist_del(struct plist_node *node, struct plist_head *head)
plist_check_head(head);
}
+/**
+ * plist_requeue - Requeue @node at end of same-prio entries.
+ *
+ * This is essentially an optimized plist_del() followed by
+ * plist_add(). It moves an entry already in the plist to
+ * after any other same-priority entries.
+ *
+ * @node: &struct plist_node pointer - entry to be moved
+ * @head: &struct plist_head pointer - list head
+ */
+void plist_requeue(struct plist_node *node, struct plist_head *head)
+{
+ struct plist_node *iter;
+ struct list_head *node_next = &head->node_list;
+
+ plist_check_head(head);
+ BUG_ON(plist_head_empty(head));
+ BUG_ON(plist_node_empty(node));
+
+ if (node == plist_last(head))
+ return;
+
+ iter = plist_next(node);
+
+ if (node->prio != iter->prio)
+ return;
+
+ plist_del(node, head);
+
+ plist_for_each_continue(iter, head) {
+ if (node->prio != iter->prio) {
+ node_next = &iter->node_list;
+ break;
+ }
+ }
+ list_add_tail(&node->node_list, node_next);
+
+ plist_check_head(head);
+}
+
#ifdef CONFIG_DEBUG_PI_LIST
#include <linux/sched.h>
#include <linux/module.h>
@@ -170,6 +210,14 @@ static void __init plist_test_check(int nr_expect)
BUG_ON(prio_pos->prio_list.next != &first->prio_list);
}
+static void __init plist_test_requeue(struct plist_node *node)
+{
+ plist_requeue(node, &test_head);
+
+ if (node != plist_last(&test_head))
+ BUG_ON(node->prio == plist_next(node)->prio);
+}
+
static int __init plist_test(void)
{
int nr_expect = 0, i, loop;
@@ -193,6 +241,10 @@ static int __init plist_test(void)
nr_expect--;
}
plist_test_check(nr_expect);
+ if (!plist_node_empty(test_node + i)) {
+ plist_test_requeue(test_node + i);
+ plist_test_check(nr_expect);
+ }
}
for (i = 0; i < ARRAY_SIZE(test_node); i++) {
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index e7b61e84a732..b7ab98195573 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -949,81 +949,6 @@ next:
}
EXPORT_SYMBOL(radix_tree_range_tag_if_tagged);
-
-/**
- * radix_tree_next_hole - find the next hole (not-present entry)
- * @root: tree root
- * @index: index key
- * @max_scan: maximum range to search
- *
- * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the lowest
- * indexed hole.
- *
- * Returns: the index of the hole if found, otherwise returns an index
- * outside of the set specified (in which case 'return - index >= max_scan'
- * will be true). In rare cases of index wrap-around, 0 will be returned.
- *
- * radix_tree_next_hole may be called under rcu_read_lock. However, like
- * radix_tree_gang_lookup, this will not atomically search a snapshot of
- * the tree at a single point in time. For example, if a hole is created
- * at index 5, then subsequently a hole is created at index 10,
- * radix_tree_next_hole covering both indexes may return 10 if called
- * under rcu_read_lock.
- */
-unsigned long radix_tree_next_hole(struct radix_tree_root *root,
- unsigned long index, unsigned long max_scan)
-{
- unsigned long i;
-
- for (i = 0; i < max_scan; i++) {
- if (!radix_tree_lookup(root, index))
- break;
- index++;
- if (index == 0)
- break;
- }
-
- return index;
-}
-EXPORT_SYMBOL(radix_tree_next_hole);
-
-/**
- * radix_tree_prev_hole - find the prev hole (not-present entry)
- * @root: tree root
- * @index: index key
- * @max_scan: maximum range to search
- *
- * Search backwards in the range [max(index-max_scan+1, 0), index]
- * for the first hole.
- *
- * Returns: the index of the hole if found, otherwise returns an index
- * outside of the set specified (in which case 'index - return >= max_scan'
- * will be true). In rare cases of wrap-around, ULONG_MAX will be returned.
- *
- * radix_tree_next_hole may be called under rcu_read_lock. However, like
- * radix_tree_gang_lookup, this will not atomically search a snapshot of
- * the tree at a single point in time. For example, if a hole is created
- * at index 10, then subsequently a hole is created at index 5,
- * radix_tree_prev_hole covering both indexes may return 5 if called under
- * rcu_read_lock.
- */
-unsigned long radix_tree_prev_hole(struct radix_tree_root *root,
- unsigned long index, unsigned long max_scan)
-{
- unsigned long i;
-
- for (i = 0; i < max_scan; i++) {
- if (!radix_tree_lookup(root, index))
- break;
- index--;
- if (index == ULONG_MAX)
- break;
- }
-
- return index;
-}
-EXPORT_SYMBOL(radix_tree_prev_hole);
-
/**
* radix_tree_gang_lookup - perform multiple lookup on a radix tree
* @root: radix tree root
@@ -1338,15 +1263,18 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
}
/**
- * radix_tree_delete - delete an item from a radix tree
+ * radix_tree_delete_item - delete an item from a radix tree
* @root: radix tree root
* @index: index key
+ * @item: expected item
*
- * Remove the item at @index from the radix tree rooted at @root.
+ * Remove @item at @index from the radix tree rooted at @root.
*
- * Returns the address of the deleted item, or NULL if it was not present.
+ * Returns the address of the deleted item, or NULL if it was not present
+ * or the entry at the given @index was not @item.
*/
-void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
+void *radix_tree_delete_item(struct radix_tree_root *root,
+ unsigned long index, void *item)
{
struct radix_tree_node *node = NULL;
struct radix_tree_node *slot = NULL;
@@ -1381,6 +1309,11 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
if (slot == NULL)
goto out;
+ if (item && slot != item) {
+ slot = NULL;
+ goto out;
+ }
+
/*
* Clear all tags associated with the item to be deleted.
* This way of doing it would be inefficient, but seldom is any set.
@@ -1425,6 +1358,21 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
out:
return slot;
}
+EXPORT_SYMBOL(radix_tree_delete_item);
+
+/**
+ * radix_tree_delete - delete an item from a radix tree
+ * @root: radix tree root
+ * @index: index key
+ *
+ * Remove the item at @index from the radix tree rooted at @root.
+ *
+ * Returns the address of the deleted item, or NULL if it was not present.
+ */
+void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
+{
+ return radix_tree_delete_item(root, index, NULL);
+}
EXPORT_SYMBOL(radix_tree_delete);
/**
diff --git a/mm/Makefile b/mm/Makefile
index 305d10acd081..fb51bc61d80a 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -16,7 +16,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
- compaction.o balloon_compaction.o \
+ compaction.o balloon_compaction.o vmacache.o \
interval_tree.o list_lru.o $(mmu-y)
obj-y += init-mm.o
diff --git a/mm/compaction.c b/mm/compaction.c
index 6441083e76d3..adb6d0560e96 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -89,7 +89,8 @@ static void __reset_isolation_suitable(struct zone *zone)
unsigned long end_pfn = zone_end_pfn(zone);
unsigned long pfn;
- zone->compact_cached_migrate_pfn = start_pfn;
+ zone->compact_cached_migrate_pfn[0] = start_pfn;
+ zone->compact_cached_migrate_pfn[1] = start_pfn;
zone->compact_cached_free_pfn = end_pfn;
zone->compact_blockskip_flush = false;
@@ -131,9 +132,10 @@ void reset_isolation_suitable(pg_data_t *pgdat)
*/
static void update_pageblock_skip(struct compact_control *cc,
struct page *page, unsigned long nr_isolated,
- bool migrate_scanner)
+ bool set_unsuitable, bool migrate_scanner)
{
struct zone *zone = cc->zone;
+ unsigned long pfn;
if (cc->ignore_skip_hint)
return;
@@ -141,20 +143,32 @@ static void update_pageblock_skip(struct compact_control *cc,
if (!page)
return;
- if (!nr_isolated) {
- unsigned long pfn = page_to_pfn(page);
+ if (nr_isolated)
+ return;
+
+ /*
+ * Only skip pageblocks when all forms of compaction will be known to
+ * fail in the near future.
+ */
+ if (set_unsuitable)
set_pageblock_skip(page);
- /* Update where compaction should restart */
- if (migrate_scanner) {
- if (!cc->finished_update_migrate &&
- pfn > zone->compact_cached_migrate_pfn)
- zone->compact_cached_migrate_pfn = pfn;
- } else {
- if (!cc->finished_update_free &&
- pfn < zone->compact_cached_free_pfn)
- zone->compact_cached_free_pfn = pfn;
- }
+ pfn = page_to_pfn(page);
+
+ /* Update where async and sync compaction should restart */
+ if (migrate_scanner) {
+ if (cc->finished_update_migrate)
+ return;
+ if (pfn > zone->compact_cached_migrate_pfn[0])
+ zone->compact_cached_migrate_pfn[0] = pfn;
+ if (cc->mode != MIGRATE_ASYNC &&
+ pfn > zone->compact_cached_migrate_pfn[1])
+ zone->compact_cached_migrate_pfn[1] = pfn;
+ } else {
+ if (cc->finished_update_free)
+ return;
+ if (pfn < zone->compact_cached_free_pfn)
+ zone->compact_cached_free_pfn = pfn;
}
}
#else
@@ -166,7 +180,7 @@ static inline bool isolation_suitable(struct compact_control *cc,
static void update_pageblock_skip(struct compact_control *cc,
struct page *page, unsigned long nr_isolated,
- bool migrate_scanner)
+ bool set_unsuitable, bool migrate_scanner)
{
}
#endif /* CONFIG_COMPACTION */
@@ -195,7 +209,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
}
/* async aborts if taking too long or contended */
- if (!cc->sync) {
+ if (cc->mode == MIGRATE_ASYNC) {
cc->contended = true;
return false;
}
@@ -208,30 +222,39 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
return true;
}
-static inline bool compact_trylock_irqsave(spinlock_t *lock,
- unsigned long *flags, struct compact_control *cc)
+/*
+ * Aside from avoiding lock contention, compaction also periodically checks
+ * need_resched() and either schedules in sync compaction or aborts async
+ * compaction. This is similar to what compact_checklock_irqsave() does, but
+ * is used where no lock is concerned.
+ *
+ * Returns false when no scheduling was needed, or sync compaction scheduled.
+ * Returns true when async compaction should abort.
+ */
+static inline bool compact_should_abort(struct compact_control *cc)
{
- return compact_checklock_irqsave(lock, flags, false, cc);
+ /* async compaction aborts if contended */
+ if (need_resched()) {
+ if (cc->mode == MIGRATE_ASYNC) {
+ cc->contended = true;
+ return true;
+ }
+
+ cond_resched();
+ }
+
+ return false;
}
/* Returns true if the page is within a block suitable for migration to */
static bool suitable_migration_target(struct page *page)
{
- int migratetype = get_pageblock_migratetype(page);
-
- /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
- if (migratetype == MIGRATE_RESERVE)
- return false;
-
- if (is_migrate_isolate(migratetype))
- return false;
-
- /* If the page is a large free page, then allow migration */
+ /* If the page is a large free page, then disallow migration */
if (PageBuddy(page) && page_order(page) >= pageblock_order)
- return true;
+ return false;
/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
- if (migrate_async_suitable(migratetype))
+ if (migrate_async_suitable(get_pageblock_migratetype(page)))
return true;
/* Otherwise skip the block */
@@ -254,6 +277,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
struct page *cursor, *valid_page = NULL;
unsigned long flags;
bool locked = false;
+ bool checked_pageblock = false;
cursor = pfn_to_page(blockpfn);
@@ -285,8 +309,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
break;
/* Recheck this is a suitable migration target under lock */
- if (!strict && !suitable_migration_target(page))
- break;
+ if (!strict && !checked_pageblock) {
+ /*
+ * We need to check suitability of pageblock only once
+ * and this isolate_freepages_block() is called with
+ * pageblock range, so just check once is sufficient.
+ */
+ checked_pageblock = true;
+ if (!suitable_migration_target(page))
+ break;
+ }
/* Recheck this is a buddy page under lock */
if (!PageBuddy(page))
@@ -330,7 +362,8 @@ isolate_fail:
/* Update the pageblock-skip if the whole pageblock was scanned */
if (blockpfn == end_pfn)
- update_pageblock_skip(cc, valid_page, total_isolated, false);
+ update_pageblock_skip(cc, valid_page, total_isolated, true,
+ false);
count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
if (total_isolated)
@@ -461,11 +494,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
unsigned long last_pageblock_nr = 0, pageblock_nr;
unsigned long nr_scanned = 0, nr_isolated = 0;
struct list_head *migratelist = &cc->migratepages;
- isolate_mode_t mode = 0;
struct lruvec *lruvec;
unsigned long flags;
bool locked = false;
struct page *page = NULL, *valid_page = NULL;
+ bool set_unsuitable = true;
+ const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ?
+ ISOLATE_ASYNC_MIGRATE : 0) |
+ (unevictable ? ISOLATE_UNEVICTABLE : 0);
/*
* Ensure that there are not too many pages isolated from the LRU
@@ -474,7 +510,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
*/
while (unlikely(too_many_isolated(zone))) {
/* async migration should just abort */
- if (!cc->sync)
+ if (cc->mode == MIGRATE_ASYNC)
return 0;
congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -483,11 +519,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
return 0;
}
+ if (compact_should_abort(cc))
+ return 0;
+
/* Time to isolate some pages for migration */
- cond_resched();
for (; low_pfn < end_pfn; low_pfn++) {
/* give a chance to irqs before checking need_resched() */
- if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) {
+ if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {
if (should_release_lock(&zone->lru_lock)) {
spin_unlock_irqrestore(&zone->lru_lock, flags);
locked = false;
@@ -526,25 +564,31 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
/* If isolation recently failed, do not retry */
pageblock_nr = low_pfn >> pageblock_order;
- if (!isolation_suitable(cc, page))
- goto next_pageblock;
+ if (last_pageblock_nr != pageblock_nr) {
+ int mt;
+
+ last_pageblock_nr = pageblock_nr;
+ if (!isolation_suitable(cc, page))
+ goto next_pageblock;
+
+ /*
+ * For async migration, also only scan in MOVABLE
+ * blocks. Async migration is optimistic to see if
+ * the minimum amount of work satisfies the allocation
+ */
+ mt = get_pageblock_migratetype(page);
+ if (cc->mode == MIGRATE_ASYNC &&
+ !migrate_async_suitable(mt)) {
+ set_unsuitable = false;
+ goto next_pageblock;
+ }
+ }
/* Skip if free */
if (PageBuddy(page))
continue;
/*
- * For async migration, also only scan in MOVABLE blocks. Async
- * migration is optimistic to see if the minimum amount of work
- * satisfies the allocation
- */
- if (!cc->sync && last_pageblock_nr != pageblock_nr &&
- !migrate_async_suitable(get_pageblock_migratetype(page))) {
- cc->finished_update_migrate = true;
- goto next_pageblock;
- }
-
- /*
* Check may be lockless but that's ok as we recheck later.
* It's possible to migrate LRU pages and balloon pages
* Skip any other type of page
@@ -553,11 +597,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
if (unlikely(balloon_page_movable(page))) {
if (locked && balloon_page_isolate(page)) {
/* Successfully isolated */
- cc->finished_update_migrate = true;
- list_add(&page->lru, migratelist);
- cc->nr_migratepages++;
- nr_isolated++;
- goto check_compact_cluster;
+ goto isolate_success;
}
}
continue;
@@ -580,6 +620,15 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
continue;
}
+ /*
+ * Migration will fail if an anonymous page is pinned in memory,
+ * so avoid taking lru_lock and isolating it unnecessarily in an
+ * admittedly racy check.
+ */
+ if (!page_mapping(page) &&
+ page_count(page) > page_mapcount(page))
+ continue;
+
/* Check if it is ok to still hold the lock */
locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
locked, cc);
@@ -594,12 +643,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
continue;
}
- if (!cc->sync)
- mode |= ISOLATE_ASYNC_MIGRATE;
-
- if (unevictable)
- mode |= ISOLATE_UNEVICTABLE;
-
lruvec = mem_cgroup_page_lruvec(page, zone);
/* Try isolate the page */
@@ -609,13 +652,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
VM_BUG_ON(PageTransCompound(page));
/* Successfully isolated */
- cc->finished_update_migrate = true;
del_page_from_lru_list(page, lruvec, page_lru(page));
+
+isolate_success:
+ cc->finished_update_migrate = true;
list_add(&page->lru, migratelist);
cc->nr_migratepages++;
nr_isolated++;
-check_compact_cluster:
/* Avoid isolating too much */
if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
++low_pfn;
@@ -626,7 +670,6 @@ check_compact_cluster:
next_pageblock:
low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
- last_pageblock_nr = pageblock_nr;
}
acct_isolated(zone, locked, cc);
@@ -634,9 +677,13 @@ next_pageblock:
if (locked)
spin_unlock_irqrestore(&zone->lru_lock, flags);
- /* Update the pageblock-skip if the whole pageblock was scanned */
+ /*
+ * Update the pageblock-skip information and cached scanner pfn,
+ * if the whole pageblock was scanned without isolating any page.
+ */
if (low_pfn == end_pfn)
- update_pageblock_skip(cc, valid_page, nr_isolated, true);
+ update_pageblock_skip(cc, valid_page, nr_isolated,
+ set_unsuitable, true);
trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
@@ -657,7 +704,9 @@ static void isolate_freepages(struct zone *zone,
struct compact_control *cc)
{
struct page *page;
- unsigned long high_pfn, low_pfn, pfn, z_end_pfn;
+ unsigned long block_start_pfn; /* start of current pageblock */
+ unsigned long block_end_pfn; /* end of current pageblock */
+ unsigned long low_pfn; /* lowest pfn scanner is able to scan */
int nr_freepages = cc->nr_freepages;
struct list_head *freelist = &cc->freepages;
@@ -665,41 +714,38 @@ static void isolate_freepages(struct zone *zone,
* Initialise the free scanner. The starting point is where we last
* successfully isolated from, zone-cached value, or the end of the
* zone when isolating for the first time. We need this aligned to
- * the pageblock boundary, because we do pfn -= pageblock_nr_pages
- * in the for loop.
+ * the pageblock boundary, because we do
+ * block_start_pfn -= pageblock_nr_pages in the for loop.
+ * For ending point, take care when isolating in last pageblock of a
+ * a zone which ends in the middle of a pageblock.
* The low boundary is the end of the pageblock the migration scanner
* is using.
*/
- pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
+ block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
+ block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
+ zone_end_pfn(zone));
low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
/*
- * Take care that if the migration scanner is at the end of the zone
- * that the free scanner does not accidentally move to the next zone
- * in the next isolation cycle.
- */
- high_pfn = min(low_pfn, pfn);
-
- z_end_pfn = zone_end_pfn(zone);
-
- /*
* Isolate free pages until enough are available to migrate the
* pages on cc->migratepages. We stop searching if the migrate
* and free page scanners meet or enough free pages are isolated.
*/
- for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
- pfn -= pageblock_nr_pages) {
+ for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
+ block_end_pfn = block_start_pfn,
+ block_start_pfn -= pageblock_nr_pages) {
unsigned long isolated;
- unsigned long end_pfn;
/*
* This can iterate a massively long zone without finding any
* suitable migration targets, so periodically check if we need
- * to schedule.
+ * to schedule, or even abort async compaction.
*/
- cond_resched();
+ if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
+ && compact_should_abort(cc))
+ break;
- if (!pfn_valid(pfn))
+ if (!pfn_valid(block_start_pfn))
continue;
/*
@@ -709,7 +755,7 @@ static void isolate_freepages(struct zone *zone,
* i.e. it's possible that all pages within a zones range of
* pages do not belong to a single zone.
*/
- page = pfn_to_page(pfn);
+ page = pfn_to_page(block_start_pfn);
if (page_zone(page) != zone)
continue;
@@ -722,26 +768,26 @@ static void isolate_freepages(struct zone *zone,
continue;
/* Found a block suitable for isolating free pages from */
- isolated = 0;
+ cc->free_pfn = block_start_pfn;
+ isolated = isolate_freepages_block(cc, block_start_pfn,
+ block_end_pfn, freelist, false);
+ nr_freepages += isolated;
/*
- * Take care when isolating in last pageblock of a zone which
- * ends in the middle of a pageblock.
+ * Set a flag that we successfully isolated in this pageblock.
+ * In the next loop iteration, zone->compact_cached_free_pfn
+ * will not be updated and thus it will effectively contain the
+ * highest pageblock we isolated pages from.
*/
- end_pfn = min(pfn + pageblock_nr_pages, z_end_pfn);
- isolated = isolate_freepages_block(cc, pfn, end_pfn,
- freelist, false);
- nr_freepages += isolated;
+ if (isolated)
+ cc->finished_update_free = true;
/*
- * Record the highest PFN we isolated pages from. When next
- * looking for free pages, the search will restart here as
- * page migration may have returned some pages to the allocator
+ * isolate_freepages_block() might have aborted due to async
+ * compaction being contended
*/
- if (isolated) {
- cc->finished_update_free = true;
- high_pfn = max(high_pfn, pfn);
- }
+ if (cc->contended)
+ break;
}
/* split_free_page does not map the pages */
@@ -751,10 +797,9 @@ static void isolate_freepages(struct zone *zone,
* If we crossed the migrate scanner, we want to keep it that way
* so that compact_finished() may detect this
*/
- if (pfn < low_pfn)
- cc->free_pfn = max(pfn, zone->zone_start_pfn);
- else
- cc->free_pfn = high_pfn;
+ if (block_start_pfn < low_pfn)
+ cc->free_pfn = cc->migrate_pfn;
+
cc->nr_freepages = nr_freepages;
}
@@ -769,9 +814,13 @@ static struct page *compaction_alloc(struct page *migratepage,
struct compact_control *cc = (struct compact_control *)data;
struct page *freepage;
- /* Isolate free pages if necessary */
+ /*
+ * Isolate free pages if necessary, and if we are not aborting due to
+ * contention.
+ */
if (list_empty(&cc->freepages)) {
- isolate_freepages(cc->zone, cc);
+ if (!cc->contended)
+ isolate_freepages(cc->zone, cc);
if (list_empty(&cc->freepages))
return NULL;
@@ -785,23 +834,16 @@ static struct page *compaction_alloc(struct page *migratepage,
}
/*
- * We cannot control nr_migratepages and nr_freepages fully when migration is
- * running as migrate_pages() has no knowledge of compact_control. When
- * migration is complete, we count the number of pages on the lists by hand.
+ * This is a migrate-callback that "frees" freepages back to the isolated
+ * freelist. All pages on the freelist are from the same zone, so there is no
+ * special handling needed for NUMA.
*/
-static void update_nr_listpages(struct compact_control *cc)
+static void compaction_free(struct page *page, unsigned long data)
{
- int nr_migratepages = 0;
- int nr_freepages = 0;
- struct page *page;
-
- list_for_each_entry(page, &cc->migratepages, lru)
- nr_migratepages++;
- list_for_each_entry(page, &cc->freepages, lru)
- nr_freepages++;
+ struct compact_control *cc = (struct compact_control *)data;
- cc->nr_migratepages = nr_migratepages;
- cc->nr_freepages = nr_freepages;
+ list_add(&page->lru, &cc->freepages);
+ cc->nr_freepages++;
}
/* possible outcome of isolate_migratepages */
@@ -848,11 +890,16 @@ static int compact_finished(struct zone *zone,
unsigned int order;
unsigned long watermark;
- if (fatal_signal_pending(current))
+ if (cc->contended || fatal_signal_pending(current))
return COMPACT_PARTIAL;
/* Compaction run completes if the migrate and free scanner meet */
if (cc->free_pfn <= cc->migrate_pfn) {
+ /* Let the next compaction start anew. */
+ zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
+ zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
+ zone->compact_cached_free_pfn = zone_end_pfn(zone);
+
/*
* Mark that the PG_migrate_skip information should be cleared
* by kswapd when it goes to sleep. kswapd does not set the
@@ -950,6 +997,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
int ret;
unsigned long start_pfn = zone->zone_start_pfn;
unsigned long end_pfn = zone_end_pfn(zone);
+ const bool sync = cc->mode != MIGRATE_ASYNC;
ret = compaction_suitable(zone, cc->order);
switch (ret) {
@@ -975,7 +1023,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
* information on where the scanners should start but check that it
* is initialised by ensuring the values are within zone boundaries.
*/
- cc->migrate_pfn = zone->compact_cached_migrate_pfn;
+ cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
cc->free_pfn = zone->compact_cached_free_pfn;
if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
@@ -983,13 +1031,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
}
if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
cc->migrate_pfn = start_pfn;
- zone->compact_cached_migrate_pfn = cc->migrate_pfn;
+ zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
+ zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
}
+ trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn);
+
migrate_prep_local();
while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
- unsigned long nr_migrate, nr_remaining;
int err;
switch (isolate_migratepages(zone, cc)) {
@@ -1004,21 +1054,20 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
;
}
- nr_migrate = cc->nr_migratepages;
+ if (!cc->nr_migratepages)
+ continue;
+
err = migrate_pages(&cc->migratepages, compaction_alloc,
- (unsigned long)cc,
- cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
+ compaction_free, (unsigned long)cc, cc->mode,
MR_COMPACTION);
- update_nr_listpages(cc);
- nr_remaining = cc->nr_migratepages;
- trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
- nr_remaining);
+ trace_mm_compaction_migratepages(cc->nr_migratepages, err,
+ &cc->migratepages);
- /* Release isolated pages not migrated */
+ /* All pages were either migrated or will be released */
+ cc->nr_migratepages = 0;
if (err) {
putback_movable_pages(&cc->migratepages);
- cc->nr_migratepages = 0;
/*
* migrate_pages() may return -ENOMEM when scanners meet
* and we want compact_finished() to detect it
@@ -1035,12 +1084,13 @@ out:
cc->nr_freepages -= release_freepages(&cc->freepages);
VM_BUG_ON(cc->nr_freepages != 0);
+ trace_mm_compaction_end(ret);
+
return ret;
}
-static unsigned long compact_zone_order(struct zone *zone,
- int order, gfp_t gfp_mask,
- bool sync, bool *contended)
+static unsigned long compact_zone_order(struct zone *zone, int order,
+ gfp_t gfp_mask, enum migrate_mode mode, bool *contended)
{
unsigned long ret;
struct compact_control cc = {
@@ -1049,7 +1099,7 @@ static unsigned long compact_zone_order(struct zone *zone,
.order = order,
.migratetype = allocflags_to_migratetype(gfp_mask),
.zone = zone,
- .sync = sync,
+ .mode = mode,
};
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
@@ -1071,7 +1121,7 @@ int sysctl_extfrag_threshold = 500;
* @order: The order of the current allocation
* @gfp_mask: The GFP mask of the current allocation
* @nodemask: The allowed nodes to allocate from
- * @sync: Whether migration is synchronous or not
+ * @mode: The migration mode for async, sync light, or sync migration
* @contended: Return value that is true if compaction was aborted due to lock contention
* @page: Optionally capture a free page of the requested order during compaction
*
@@ -1079,7 +1129,7 @@ int sysctl_extfrag_threshold = 500;
*/
unsigned long try_to_compact_pages(struct zonelist *zonelist,
int order, gfp_t gfp_mask, nodemask_t *nodemask,
- bool sync, bool *contended)
+ enum migrate_mode mode, bool *contended)
{
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
int may_enter_fs = gfp_mask & __GFP_FS;
@@ -1104,7 +1154,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
nodemask) {
int status;
- status = compact_zone_order(zone, order, gfp_mask, sync,
+ status = compact_zone_order(zone, order, gfp_mask, mode,
contended);
rc = max(status, rc);
@@ -1140,13 +1190,9 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
compact_zone(zone, cc);
if (cc->order > 0) {
- int ok = zone_watermark_ok(zone, cc->order,
- low_wmark_pages(zone), 0, 0);
- if (ok && cc->order >= zone->compact_order_failed)
- zone->compact_order_failed = cc->order + 1;
- /* Currently async compaction is never deferred. */
- else if (!ok && cc->sync)
- defer_compaction(zone, cc->order);
+ if (zone_watermark_ok(zone, cc->order,
+ low_wmark_pages(zone), 0, 0))
+ compaction_defer_reset(zone, cc->order, false);
}
VM_BUG_ON(!list_empty(&cc->freepages));
@@ -1158,7 +1204,7 @@ void compact_pgdat(pg_data_t *pgdat, int order)
{
struct compact_control cc = {
.order = order,
- .sync = false,
+ .mode = MIGRATE_ASYNC,
};
if (!order)
@@ -1171,7 +1217,8 @@ static void compact_node(int nid)
{
struct compact_control cc = {
.order = -1,
- .sync = true,
+ .mode = MIGRATE_SYNC,
+ .ignore_skip_hint = true,
};
__compact_pgdat(NODE_DATA(nid), &cc);
diff --git a/mm/filemap.c b/mm/filemap.c
index 3d2d39a5a508..b012daefc2d7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -192,9 +192,11 @@ static int filemap_check_errors(struct address_space *mapping)
{
int ret = 0;
/* Check for outstanding write errors */
- if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
+ if (test_bit(AS_ENOSPC, &mapping->flags) &&
+ test_and_clear_bit(AS_ENOSPC, &mapping->flags))
ret = -ENOSPC;
- if (test_and_clear_bit(AS_EIO, &mapping->flags))
+ if (test_bit(AS_EIO, &mapping->flags) &&
+ test_and_clear_bit(AS_EIO, &mapping->flags))
ret = -EIO;
return ret;
}
@@ -446,6 +448,29 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
}
EXPORT_SYMBOL_GPL(replace_page_cache_page);
+static int page_cache_tree_insert(struct address_space *mapping,
+ struct page *page)
+{
+ void **slot;
+ int error;
+
+ slot = radix_tree_lookup_slot(&mapping->page_tree, page->index);
+ if (slot) {
+ void *p;
+
+ p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+ if (!radix_tree_exceptional_entry(p))
+ return -EEXIST;
+ radix_tree_replace_slot(slot, page);
+ mapping->nrpages++;
+ return 0;
+ }
+ error = radix_tree_insert(&mapping->page_tree, page->index, page);
+ if (!error)
+ mapping->nrpages++;
+ return error;
+}
+
/**
* add_to_page_cache_locked - add a locked page to the pagecache
* @page: page to add
@@ -480,11 +505,10 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
page->index = offset;
spin_lock_irq(&mapping->tree_lock);
- error = radix_tree_insert(&mapping->page_tree, offset, page);
+ error = page_cache_tree_insert(mapping, page);
radix_tree_preload_end();
if (unlikely(error))
goto err_insert;
- mapping->nrpages++;
__inc_zone_page_state(page, NR_FILE_PAGES);
spin_unlock_irq(&mapping->tree_lock);
trace_mm_filemap_add_to_page_cache(page);
@@ -520,10 +544,10 @@ struct page *__page_cache_alloc(gfp_t gfp)
if (cpuset_do_page_mem_spread()) {
unsigned int cpuset_mems_cookie;
do {
- cpuset_mems_cookie = get_mems_allowed();
+ cpuset_mems_cookie = read_mems_allowed_begin();
n = cpuset_mem_spread_node();
page = alloc_pages_exact_node(n, gfp, 0);
- } while (!put_mems_allowed(cpuset_mems_cookie) && !page);
+ } while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
return page;
}
@@ -620,8 +644,17 @@ EXPORT_SYMBOL(unlock_page);
*/
void end_page_writeback(struct page *page)
{
- if (TestClearPageReclaim(page))
+ /*
+ * TestClearPageReclaim could be used here but it is an atomic
+ * operation and overkill in this particular case. Failing to
+ * shuffle a page marked for immediate reclaim is too mild to
+ * justify taking an atomic operation penalty at the end of
+ * ever page writeback.
+ */
+ if (PageReclaim(page)) {
+ ClearPageReclaim(page);
rotate_reclaimable_page(page);
+ }
if (!test_clear_page_writeback(page))
BUG();
@@ -686,14 +719,101 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
}
/**
- * find_get_page - find and get a page reference
+ * page_cache_next_hole - find the next hole (not-present entry)
+ * @mapping: mapping
+ * @index: index
+ * @max_scan: maximum range to search
+ *
+ * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the
+ * lowest indexed hole.
+ *
+ * Returns: the index of the hole if found, otherwise returns an index
+ * outside of the set specified (in which case 'return - index >=
+ * max_scan' will be true). In rare cases of index wrap-around, 0 will
+ * be returned.
+ *
+ * page_cache_next_hole may be called under rcu_read_lock. However,
+ * like radix_tree_gang_lookup, this will not atomically search a
+ * snapshot of the tree at a single point in time. For example, if a
+ * hole is created at index 5, then subsequently a hole is created at
+ * index 10, page_cache_next_hole covering both indexes may return 10
+ * if called under rcu_read_lock.
+ */
+pgoff_t page_cache_next_hole(struct address_space *mapping,
+ pgoff_t index, unsigned long max_scan)
+{
+ unsigned long i;
+
+ for (i = 0; i < max_scan; i++) {
+ struct page *page;
+
+ page = radix_tree_lookup(&mapping->page_tree, index);
+ if (!page || radix_tree_exceptional_entry(page))
+ break;
+ index++;
+ if (index == 0)
+ break;
+ }
+
+ return index;
+}
+EXPORT_SYMBOL(page_cache_next_hole);
+
+/**
+ * page_cache_prev_hole - find the prev hole (not-present entry)
+ * @mapping: mapping
+ * @index: index
+ * @max_scan: maximum range to search
+ *
+ * Search backwards in the range [max(index-max_scan+1, 0), index] for
+ * the first hole.
+ *
+ * Returns: the index of the hole if found, otherwise returns an index
+ * outside of the set specified (in which case 'index - return >=
+ * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX
+ * will be returned.
+ *
+ * page_cache_prev_hole may be called under rcu_read_lock. However,
+ * like radix_tree_gang_lookup, this will not atomically search a
+ * snapshot of the tree at a single point in time. For example, if a
+ * hole is created at index 10, then subsequently a hole is created at
+ * index 5, page_cache_prev_hole covering both indexes may return 5 if
+ * called under rcu_read_lock.
+ */
+pgoff_t page_cache_prev_hole(struct address_space *mapping,
+ pgoff_t index, unsigned long max_scan)
+{
+ unsigned long i;
+
+ for (i = 0; i < max_scan; i++) {
+ struct page *page;
+
+ page = radix_tree_lookup(&mapping->page_tree, index);
+ if (!page || radix_tree_exceptional_entry(page))
+ break;
+ index--;
+ if (index == ULONG_MAX)
+ break;
+ }
+
+ return index;
+}
+EXPORT_SYMBOL(page_cache_prev_hole);
+
+/**
+ * find_get_entry - find and get a page cache entry
* @mapping: the address_space to search
- * @offset: the page index
+ * @offset: the page cache index
+ *
+ * Looks up the page cache slot at @mapping & @offset. If there is a
+ * page cache page, it is returned with an increased refcount.
*
- * Is there a pagecache struct page at the given (mapping, offset) tuple?
- * If yes, increment its refcount and return it; if no, return NULL.
+ * If the slot holds a shadow entry of a previously evicted page, it
+ * is returned.
+ *
+ * Otherwise, %NULL is returned.
*/
-struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
+struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
{
void **pagep;
struct page *page;
@@ -734,24 +854,30 @@ out:
return page;
}
-EXPORT_SYMBOL(find_get_page);
+EXPORT_SYMBOL(find_get_entry);
/**
- * find_lock_page - locate, pin and lock a pagecache page
+ * find_lock_entry - locate, pin and lock a page cache entry
* @mapping: the address_space to search
- * @offset: the page index
+ * @offset: the page cache index
+ *
+ * Looks up the page cache slot at @mapping & @offset. If there is a
+ * page cache page, it is returned locked and with an increased
+ * refcount.
*
- * Locates the desired pagecache page, locks it, increments its reference
- * count and returns its address.
+ * If the slot holds a shadow entry of a previously evicted page, it
+ * is returned.
*
- * Returns zero if the page was not present. find_lock_page() may sleep.
+ * Otherwise, %NULL is returned.
+ *
+ * find_lock_entry() may sleep.
*/
-struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
+struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
{
struct page *page;
repeat:
- page = find_get_page(mapping, offset);
+ page = find_get_entry(mapping, offset);
if (page && !radix_tree_exception(page)) {
lock_page(page);
/* Has the page been truncated? */
@@ -764,44 +890,87 @@ repeat:
}
return page;
}
-EXPORT_SYMBOL(find_lock_page);
+EXPORT_SYMBOL(find_lock_entry);
/**
- * find_or_create_page - locate or add a pagecache page
- * @mapping: the page's address_space
- * @index: the page's index into the mapping
- * @gfp_mask: page allocation mode
+ * pagecache_get_page - find and get a page reference
+ * @mapping: the address_space to search
+ * @offset: the page index
+ * @fgp_flags: PCG flags
+ * @gfp_mask: gfp mask to use if a page is to be allocated
+ *
+ * Looks up the page cache slot at @mapping & @offset.
+ *
+ * PCG flags modify how the page is returned
*
- * Locates a page in the pagecache. If the page is not present, a new page
- * is allocated using @gfp_mask and is added to the pagecache and to the VM's
- * LRU list. The returned page is locked and has its reference count
- * incremented.
+ * FGP_ACCESSED: the page will be marked accessed
+ * FGP_LOCK: Page is return locked
+ * FGP_CREAT: If page is not present then a new page is allocated using
+ * @gfp_mask and added to the page cache and the VM's LRU
+ * list. The page is returned locked and with an increased
+ * refcount. Otherwise, %NULL is returned.
*
- * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic
- * allocation!
+ * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
+ * if the GFP flags specified for FGP_CREAT are atomic.
*
- * find_or_create_page() returns the desired page's address, or zero on
- * memory exhaustion.
+ * If there is a page cache page, it is returned with an increased refcount.
*/
-struct page *find_or_create_page(struct address_space *mapping,
- pgoff_t index, gfp_t gfp_mask)
+struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
+ int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask)
{
struct page *page;
- int err;
+
repeat:
- page = find_lock_page(mapping, index);
- if (!page) {
- page = __page_cache_alloc(gfp_mask);
+ page = find_get_entry(mapping, offset);
+ if (radix_tree_exceptional_entry(page))
+ page = NULL;
+ if (!page)
+ goto no_page;
+
+ if (fgp_flags & FGP_LOCK) {
+ if (fgp_flags & FGP_NOWAIT) {
+ if (!trylock_page(page)) {
+ page_cache_release(page);
+ return NULL;
+ }
+ } else {
+ lock_page(page);
+ }
+
+ /* Has the page been truncated? */
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto repeat;
+ }
+ VM_BUG_ON(page->index != offset);
+ }
+
+ if (page && (fgp_flags & FGP_ACCESSED))
+ mark_page_accessed(page);
+
+no_page:
+ if (!page && (fgp_flags & FGP_CREAT)) {
+ int err;
+ if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping))
+ cache_gfp_mask |= __GFP_WRITE;
+ if (fgp_flags & FGP_NOFS) {
+ cache_gfp_mask &= ~__GFP_FS;
+ radix_gfp_mask &= ~__GFP_FS;
+ }
+
+ page = __page_cache_alloc(cache_gfp_mask);
if (!page)
return NULL;
- /*
- * We want a regular kernel memory (not highmem or DMA etc)
- * allocation for the radix tree nodes, but we need to honour
- * the context-specific requirements the caller has asked for.
- * GFP_RECLAIM_MASK collects those requirements.
- */
- err = add_to_page_cache_lru(page, mapping, index,
- (gfp_mask & GFP_RECLAIM_MASK));
+
+ if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK)))
+ fgp_flags |= FGP_LOCK;
+
+ /* Init accessed so avoit atomic mark_page_accessed later */
+ if (fgp_flags & FGP_ACCESSED)
+ init_page_accessed(page);
+
+ err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask);
if (unlikely(err)) {
page_cache_release(page);
page = NULL;
@@ -809,9 +978,80 @@ repeat:
goto repeat;
}
}
+
return page;
}
-EXPORT_SYMBOL(find_or_create_page);
+EXPORT_SYMBOL(pagecache_get_page);
+
+/**
+ * find_get_entries - gang pagecache lookup
+ * @mapping: The address_space to search
+ * @start: The starting page cache index
+ * @nr_entries: The maximum number of entries
+ * @entries: Where the resulting entries are placed
+ * @indices: The cache indices corresponding to the entries in @entries
+ *
+ * find_get_entries() will search for and return a group of up to
+ * @nr_entries entries in the mapping. The entries are placed at
+ * @entries. find_get_entries() takes a reference against any actual
+ * pages it returns.
+ *
+ * The search returns a group of mapping-contiguous page cache entries
+ * with ascending indexes. There may be holes in the indices due to
+ * not-present pages.
+ *
+ * Any shadow entries of evicted pages are included in the returned
+ * array.
+ *
+ * find_get_entries() returns the number of pages and shadow entries
+ * which were found.
+ */
+unsigned find_get_entries(struct address_space *mapping,
+ pgoff_t start, unsigned int nr_entries,
+ struct page **entries, pgoff_t *indices)
+{
+ void **slot;
+ unsigned int ret = 0;
+ struct radix_tree_iter iter;
+
+ if (!nr_entries)
+ return 0;
+
+ rcu_read_lock();
+restart:
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ struct page *page;
+repeat:
+ page = radix_tree_deref_slot(slot);
+ if (unlikely(!page))
+ continue;
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page))
+ goto restart;
+ /*
+ * Otherwise, we must be storing a swap entry
+ * here as an exceptional entry: so return it
+ * without attempting to raise page count.
+ */
+ goto export;
+ }
+ if (!page_cache_get_speculative(page))
+ goto repeat;
+
+ /* Has the page moved? */
+ if (unlikely(page != *slot)) {
+ page_cache_release(page);
+ goto repeat;
+ }
+export:
+ indices[ret] = iter.index;
+ entries[ret] = page;
+ if (++ret == nr_entries)
+ break;
+ }
+ rcu_read_unlock();
+ return ret;
+}
/**
* find_get_pages - gang pagecache lookup
@@ -1031,39 +1271,6 @@ repeat:
}
EXPORT_SYMBOL(find_get_pages_tag);
-/**
- * grab_cache_page_nowait - returns locked page at given index in given cache
- * @mapping: target address_space
- * @index: the page index
- *
- * Same as grab_cache_page(), but do not wait if the page is unavailable.
- * This is intended for speculative data generators, where the data can
- * be regenerated if the page couldn't be grabbed. This routine should
- * be safe to call while holding the lock for another page.
- *
- * Clear __GFP_FS when allocating the page to avoid recursion into the fs
- * and deadlock against the caller's locked page.
- */
-struct page *
-grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
-{
- struct page *page = find_get_page(mapping, index);
-
- if (page) {
- if (trylock_page(page))
- return page;
- page_cache_release(page);
- return NULL;
- }
- page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
- if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
- page_cache_release(page);
- page = NULL;
- }
- return page;
-}
-EXPORT_SYMBOL(grab_cache_page_nowait);
-
/*
* CD/DVDs are error prone. When a medium error occurs, the driver may fail
* a _large_ part of the i/o request. Imagine the worst scenario:
@@ -1797,6 +2004,18 @@ int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
EXPORT_SYMBOL(generic_file_mmap);
EXPORT_SYMBOL(generic_file_readonly_mmap);
+static struct page *wait_on_page_read(struct page *page)
+{
+ if (!IS_ERR(page)) {
+ wait_on_page_locked(page);
+ if (!PageUptodate(page)) {
+ page_cache_release(page);
+ page = ERR_PTR(-EIO);
+ }
+ }
+ return page;
+}
+
static struct page *__read_cache_page(struct address_space *mapping,
pgoff_t index,
int (*filler)(void *, struct page *),
@@ -1823,6 +2042,8 @@ repeat:
if (err < 0) {
page_cache_release(page);
page = ERR_PTR(err);
+ } else {
+ page = wait_on_page_read(page);
}
}
return page;
@@ -1859,6 +2080,10 @@ retry:
if (err < 0) {
page_cache_release(page);
return ERR_PTR(err);
+ } else {
+ page = wait_on_page_read(page);
+ if (IS_ERR(page))
+ return page;
}
out:
mark_page_accessed(page);
@@ -1866,40 +2091,25 @@ out:
}
/**
- * read_cache_page_async - read into page cache, fill it if needed
+ * read_cache_page - read into page cache, fill it if needed
* @mapping: the page's address_space
* @index: the page index
* @filler: function to perform the read
* @data: first arg to filler(data, page) function, often left as NULL
*
- * Same as read_cache_page, but don't wait for page to become unlocked
- * after submitting it to the filler.
- *
* Read into the page cache. If a page already exists, and PageUptodate() is
- * not set, try to fill the page but don't wait for it to become unlocked.
+ * not set, try to fill the page and wait for it to become unlocked.
*
* If the page does not get brought uptodate, return -EIO.
*/
-struct page *read_cache_page_async(struct address_space *mapping,
+struct page *read_cache_page(struct address_space *mapping,
pgoff_t index,
int (*filler)(void *, struct page *),
void *data)
{
return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
}
-EXPORT_SYMBOL(read_cache_page_async);
-
-static struct page *wait_on_page_read(struct page *page)
-{
- if (!IS_ERR(page)) {
- wait_on_page_locked(page);
- if (!PageUptodate(page)) {
- page_cache_release(page);
- page = ERR_PTR(-EIO);
- }
- }
- return page;
-}
+EXPORT_SYMBOL(read_cache_page);
/**
* read_cache_page_gfp - read into page cache, using specified page allocation flags.
@@ -1918,31 +2128,10 @@ struct page *read_cache_page_gfp(struct address_space *mapping,
{
filler_t *filler = (filler_t *)mapping->a_ops->readpage;
- return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp));
+ return do_read_cache_page(mapping, index, filler, NULL, gfp);
}
EXPORT_SYMBOL(read_cache_page_gfp);
-/**
- * read_cache_page - read into page cache, fill it if needed
- * @mapping: the page's address_space
- * @index: the page index
- * @filler: function to perform the read
- * @data: first arg to filler(data, page) function, often left as NULL
- *
- * Read into the page cache. If a page already exists, and PageUptodate() is
- * not set, try to fill the page then wait for it to become unlocked.
- *
- * If the page does not get brought uptodate, return -EIO.
- */
-struct page *read_cache_page(struct address_space *mapping,
- pgoff_t index,
- int (*filler)(void *, struct page *),
- void *data)
-{
- return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
-}
-EXPORT_SYMBOL(read_cache_page);
-
static size_t __iovec_copy_from_user_inatomic(char *vaddr,
const struct iovec *iov, size_t base, size_t bytes)
{
@@ -1976,7 +2165,6 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
char *kaddr;
size_t copied;
- BUG_ON(!pagefault_disabled());
kaddr = kmap_atomic(page);
if (likely(i->nr_segs == 1)) {
int left;
@@ -2186,7 +2374,6 @@ int pagecache_write_end(struct file *file, struct address_space *mapping,
{
const struct address_space_operations *aops = mapping->a_ops;
- mark_page_accessed(page);
return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
}
EXPORT_SYMBOL(pagecache_write_end);
@@ -2268,34 +2455,18 @@ EXPORT_SYMBOL(generic_file_direct_write);
struct page *grab_cache_page_write_begin(struct address_space *mapping,
pgoff_t index, unsigned flags)
{
- int status;
- gfp_t gfp_mask;
struct page *page;
- gfp_t gfp_notmask = 0;
+ int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT;
- gfp_mask = mapping_gfp_mask(mapping);
- if (mapping_cap_account_dirty(mapping))
- gfp_mask |= __GFP_WRITE;
if (flags & AOP_FLAG_NOFS)
- gfp_notmask = __GFP_FS;
-repeat:
- page = find_lock_page(mapping, index);
+ fgp_flags |= FGP_NOFS;
+
+ page = pagecache_get_page(mapping, index, fgp_flags,
+ mapping_gfp_mask(mapping),
+ GFP_KERNEL);
if (page)
- goto found;
+ wait_for_stable_page(page);
- page = __page_cache_alloc(gfp_mask & ~gfp_notmask);
- if (!page)
- return NULL;
- status = add_to_page_cache_lru(page, mapping, index,
- GFP_KERNEL & ~gfp_notmask);
- if (unlikely(status)) {
- page_cache_release(page);
- if (status == -EEXIST)
- goto repeat;
- return NULL;
- }
-found:
- wait_for_stable_page(page);
return page;
}
EXPORT_SYMBOL(grab_cache_page_write_begin);
@@ -2344,18 +2515,15 @@ again:
status = a_ops->write_begin(file, mapping, pos, bytes, flags,
&page, &fsdata);
- if (unlikely(status))
+ if (unlikely(status < 0))
break;
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
- pagefault_disable();
copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
- pagefault_enable();
flush_dcache_page(page);
- mark_page_accessed(page);
status = a_ops->write_end(file, mapping, pos, bytes, copied,
page, fsdata);
if (unlikely(status < 0))
diff --git a/mm/fremap.c b/mm/fremap.c
index bbc4d660221a..34feba60a17e 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -23,28 +23,44 @@
#include "internal.h"
+static int mm_counter(struct page *page)
+{
+ return PageAnon(page) ? MM_ANONPAGES : MM_FILEPAGES;
+}
+
static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
pte_t pte = *ptep;
+ struct page *page;
+ swp_entry_t entry;
if (pte_present(pte)) {
- struct page *page;
-
flush_cache_page(vma, addr, pte_pfn(pte));
pte = ptep_clear_flush(vma, addr, ptep);
page = vm_normal_page(vma, addr, pte);
if (page) {
if (pte_dirty(pte))
set_page_dirty(page);
+ update_hiwater_rss(mm);
+ dec_mm_counter(mm, mm_counter(page));
page_remove_rmap(page);
page_cache_release(page);
+ }
+ } else { /* zap_pte() is not called when pte_none() */
+ if (!pte_file(pte)) {
update_hiwater_rss(mm);
- dec_mm_counter(mm, MM_FILEPAGES);
+ entry = pte_to_swp_entry(pte);
+ if (non_swap_entry(entry)) {
+ if (is_migration_entry(entry)) {
+ page = migration_entry_to_page(entry);
+ dec_mm_counter(mm, mm_counter(page));
+ }
+ } else {
+ free_swap_and_cache(entry);
+ dec_mm_counter(mm, MM_SWAPENTS);
+ }
}
- } else {
- if (!pte_file(pte))
- free_swap_and_cache(pte_to_swp_entry(pte));
pte_clear_not_present_full(mm, addr, ptep, 0);
}
}
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 1b24bdcb3197..c30eec536f03 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -327,15 +327,12 @@ EXPORT_SYMBOL(__frontswap_invalidate_area);
static unsigned long __frontswap_curr_pages(void)
{
- int type;
unsigned long totalpages = 0;
struct swap_info_struct *si = NULL;
assert_spin_locked(&swap_lock);
- for (type = swap_list.head; type >= 0; type = si->next) {
- si = swap_info[type];
+ plist_for_each_entry(si, &swap_active_head, list)
totalpages += atomic_read(&si->frontswap_pages);
- }
return totalpages;
}
@@ -347,11 +344,9 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
int si_frontswap_pages;
unsigned long total_pages_to_unuse = total;
unsigned long pages = 0, pages_to_unuse = 0;
- int type;
assert_spin_locked(&swap_lock);
- for (type = swap_list.head; type >= 0; type = si->next) {
- si = swap_info[type];
+ plist_for_each_entry(si, &swap_active_head, list) {
si_frontswap_pages = atomic_read(&si->frontswap_pages);
if (total_pages_to_unuse < si_frontswap_pages) {
pages = pages_to_unuse = total_pages_to_unuse;
@@ -366,7 +361,7 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
}
vm_unacct_memory(pages);
*unused = pages_to_unuse;
- *swapid = type;
+ *swapid = si->type;
ret = 0;
break;
}
@@ -413,7 +408,7 @@ void frontswap_shrink(unsigned long target_pages)
/*
* we don't want to hold swap_lock while doing a very
* lengthy try_to_unuse, but swap_list may change
- * so restart scan from swap_list.head each time
+ * so restart scan from swap_active_head each time
*/
spin_lock(&swap_lock);
ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 389973fd6bb7..2ee53749eb48 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -758,14 +758,6 @@ static inline struct page *alloc_hugepage_vma(int defrag,
HPAGE_PMD_ORDER, vma, haddr, nd);
}
-#ifndef CONFIG_NUMA
-static inline struct page *alloc_hugepage(int defrag)
-{
- return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
- HPAGE_PMD_ORDER);
-}
-#endif
-
static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
struct page *zero_page)
@@ -2197,7 +2189,58 @@ static void khugepaged_alloc_sleep(void)
msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
}
+static int khugepaged_node_load[MAX_NUMNODES];
+
+static bool khugepaged_scan_abort(int nid)
+{
+ int i;
+
+ /*
+ * If zone_reclaim_mode is disabled, then no extra effort is made to
+ * allocate memory locally.
+ */
+ if (!zone_reclaim_mode)
+ return false;
+
+ /* If there is a count for this node already, it must be acceptable */
+ if (khugepaged_node_load[nid])
+ return false;
+
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ if (!khugepaged_node_load[i])
+ continue;
+ if (node_distance(nid, i) > RECLAIM_DISTANCE)
+ return true;
+ }
+ return false;
+}
+
#ifdef CONFIG_NUMA
+static int khugepaged_find_target_node(void)
+{
+ static int last_khugepaged_target_node = NUMA_NO_NODE;
+ int nid, target_node = 0, max_value = 0;
+
+ /* find first node with max normal pages hit */
+ for (nid = 0; nid < MAX_NUMNODES; nid++)
+ if (khugepaged_node_load[nid] > max_value) {
+ max_value = khugepaged_node_load[nid];
+ target_node = nid;
+ }
+
+ /* do some balance if several nodes have the same hit record */
+ if (target_node <= last_khugepaged_target_node)
+ for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
+ nid++)
+ if (max_value == khugepaged_node_load[nid]) {
+ target_node = nid;
+ break;
+ }
+
+ last_khugepaged_target_node = target_node;
+ return target_node;
+}
+
static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
{
if (IS_ERR(*hpage)) {
@@ -2231,9 +2274,8 @@ static struct page
* mmap_sem in read mode is good idea also to allow greater
* scalability.
*/
- *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
- node, __GFP_OTHER_NODE);
-
+ *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask(
+ khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
/*
* After allocating the hugepage, release the mmap_sem read lock in
* preparation for taking it in write mode.
@@ -2249,6 +2291,17 @@ static struct page
return *hpage;
}
#else
+static int khugepaged_find_target_node(void)
+{
+ return 0;
+}
+
+static inline struct page *alloc_hugepage(int defrag)
+{
+ return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
+ HPAGE_PMD_ORDER);
+}
+
static struct page *khugepaged_alloc_hugepage(bool *wait)
{
struct page *hpage;
@@ -2455,6 +2508,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
if (pmd_trans_huge(*pmd))
goto out;
+ memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, _address += PAGE_SIZE) {
@@ -2471,12 +2525,15 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
if (unlikely(!page))
goto out_unmap;
/*
- * Chose the node of the first page. This could
- * be more sophisticated and look at more pages,
- * but isn't for now.
+ * Record which node the original page is from and save this
+ * information to khugepaged_node_load[].
+ * Khupaged will allocate hugepage from the node has the max
+ * hit record.
*/
- if (node == NUMA_NO_NODE)
- node = page_to_nid(page);
+ node = page_to_nid(page);
+ if (khugepaged_scan_abort(node))
+ goto out_unmap;
+ khugepaged_node_load[node]++;
VM_BUG_ON(PageCompound(page));
if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
goto out_unmap;
@@ -2491,9 +2548,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
ret = 1;
out_unmap:
pte_unmap_unlock(pte, ptl);
- if (ret)
+ if (ret) {
+ node = khugepaged_find_target_node();
/* collapse_huge_page will return with the mmap_sem released */
collapse_huge_page(mm, address, hpage, vma, node);
+ }
out:
return ret;
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f80b17106d24..c33d8a65298c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -574,7 +574,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
goto err;
retry_cpuset:
- cpuset_mems_cookie = get_mems_allowed();
+ cpuset_mems_cookie = read_mems_allowed_begin();
zonelist = huge_zonelist(vma, address,
htlb_alloc_mask(h), &mpol, &nodemask);
@@ -596,7 +596,7 @@ retry_cpuset:
}
mpol_cond_put(mpol);
- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
return page;
@@ -2114,6 +2114,9 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
unsigned long tmp;
int ret;
+ if (!hugepages_supported())
+ return -ENOTSUPP;
+
tmp = h->max_huge_pages;
if (write && h->order >= MAX_ORDER)
@@ -2167,6 +2170,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
unsigned long tmp;
int ret;
+ if (!hugepages_supported())
+ return -ENOTSUPP;
+
tmp = h->nr_overcommit_huge_pages;
if (write && h->order >= MAX_ORDER)
@@ -2192,6 +2198,8 @@ out:
void hugetlb_report_meminfo(struct seq_file *m)
{
struct hstate *h = &default_hstate;
+ if (!hugepages_supported())
+ return;
seq_printf(m,
"HugePages_Total: %5lu\n"
"HugePages_Free: %5lu\n"
@@ -2208,6 +2216,8 @@ void hugetlb_report_meminfo(struct seq_file *m)
int hugetlb_report_node_meminfo(int nid, char *buf)
{
struct hstate *h = &default_hstate;
+ if (!hugepages_supported())
+ return 0;
return sprintf(buf,
"Node %d HugePages_Total: %5u\n"
"Node %d HugePages_Free: %5u\n"
@@ -2222,6 +2232,9 @@ void hugetlb_show_meminfo(void)
struct hstate *h;
int nid;
+ if (!hugepages_supported())
+ return;
+
for_each_node_state(nid, N_MEMORY)
for_each_hstate(h)
pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
diff --git a/mm/internal.h b/mm/internal.h
index fdddbc83ac5f..d610f7ce4e9c 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -11,6 +11,7 @@
#ifndef __MM_INTERNAL_H
#define __MM_INTERNAL_H
+#include <linux/fs.h>
#include <linux/mm.h>
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
@@ -21,6 +22,20 @@ static inline void set_page_count(struct page *page, int v)
atomic_set(&page->_count, v);
}
+extern int __do_page_cache_readahead(struct address_space *mapping,
+ struct file *filp, pgoff_t offset, unsigned long nr_to_read,
+ unsigned long lookahead_size);
+
+/*
+ * Submit IO for the read-ahead request in file_ra_state.
+ */
+static inline unsigned long ra_submit(struct file_ra_state *ra,
+ struct address_space *mapping, struct file *filp)
+{
+ return __do_page_cache_readahead(mapping, filp,
+ ra->start, ra->size, ra->async_size);
+}
+
/*
* Turn a non-refcounted page (->_count == 0) into refcounted with
* a count of one.
@@ -120,7 +135,7 @@ struct compact_control {
unsigned long nr_migratepages; /* Number of pages to migrate */
unsigned long free_pfn; /* isolate_freepages search base */
unsigned long migrate_pfn; /* isolate_migratepages search base */
- bool sync; /* Synchronous migration */
+ enum migrate_mode mode; /* Async or sync migration mode */
bool ignore_skip_hint; /* Scan blocks even if marked skip */
bool finished_update_free; /* True when the zone cached pfns are
* no longer being updated
@@ -130,7 +145,10 @@ struct compact_control {
int order; /* order a direct compactor needs */
int migratetype; /* MOVABLE, RECLAIMABLE etc */
struct zone *zone;
- bool contended; /* True if a lock was contended */
+ bool contended; /* True if a lock was contended, or
+ * need_resched() true during async
+ * compaction
+ */
};
unsigned long
diff --git a/mm/madvise.c b/mm/madvise.c
index 539eeb96b323..a402f8fdc68e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -195,7 +195,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
for (; start < end; start += PAGE_SIZE) {
index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
- page = find_get_page(mapping, index);
+ page = find_get_entry(mapping, index);
if (!radix_tree_exceptional_entry(page)) {
if (page)
page_cache_release(page);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6e3f9c39bc22..4ab233d4714a 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1554,7 +1554,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
/* Keep page count to indicate a given hugepage is isolated. */
list_move(&hpage->lru, &pagelist);
- ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+ ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
MIGRATE_SYNC, MR_MEMORY_FAILURE);
if (ret) {
pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
@@ -1635,7 +1635,7 @@ static int __soft_offline_page(struct page *page, int flags)
inc_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
list_add(&page->lru, &pagelist);
- ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+ ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
MIGRATE_SYNC, MR_MEMORY_FAILURE);
if (ret) {
putback_lru_pages(&pagelist);
diff --git a/mm/memory.c b/mm/memory.c
index f37b609503bb..4e1d48e8d900 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -878,7 +878,7 @@ out_set_pte:
return 0;
}
-int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
@@ -3698,7 +3698,7 @@ static int handle_pte_fault(struct mm_struct *mm,
pte_t entry;
spinlock_t *ptl;
- entry = *pte;
+ entry = ACCESS_ONCE(*pte);
if (!pte_present(entry)) {
if (pte_none(entry)) {
if (vma->vm_ops) {
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ed85fe3870e2..d31730564617 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1321,7 +1321,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
* alloc_migrate_target should be improooooved!!
* migrate_pages returns # of failed pages.
*/
- ret = migrate_pages(&source, alloc_migrate_target, 0,
+ ret = migrate_pages(&source, alloc_migrate_target, NULL, 0,
MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
if (ret)
putback_movable_pages(&source);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0437f3595b32..cc61c7a7d6a1 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1060,7 +1060,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
if (!list_empty(&pagelist)) {
- err = migrate_pages(&pagelist, new_node_page, dest,
+ err = migrate_pages(&pagelist, new_node_page, NULL, dest,
MIGRATE_SYNC, MR_SYSCALL);
if (err)
putback_movable_pages(&pagelist);
@@ -1306,7 +1306,7 @@ static long do_mbind(unsigned long start, unsigned long len,
if (!list_empty(&pagelist)) {
WARN_ON_ONCE(flags & MPOL_MF_LAZY);
- nr_failed = migrate_pages(&pagelist, new_page,
+ nr_failed = migrate_pages(&pagelist, new_page, NULL,
start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
if (nr_failed)
putback_movable_pages(&pagelist);
@@ -1873,7 +1873,7 @@ int node_random(const nodemask_t *maskp)
* If the effective policy is 'BIND, returns a pointer to the mempolicy's
* @nodemask for filtering the zonelist.
*
- * Must be protected by get_mems_allowed()
+ * Must be protected by read_mems_allowed_begin()
*/
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
gfp_t gfp_flags, struct mempolicy **mpol,
@@ -2037,7 +2037,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
retry_cpuset:
pol = get_vma_policy(current, vma, addr);
- cpuset_mems_cookie = get_mems_allowed();
+ cpuset_mems_cookie = read_mems_allowed_begin();
if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
unsigned nid;
@@ -2045,7 +2045,7 @@ retry_cpuset:
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
mpol_cond_put(pol);
page = alloc_page_interleave(gfp, order, nid);
- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
return page;
@@ -2055,7 +2055,7 @@ retry_cpuset:
policy_nodemask(gfp, pol));
if (unlikely(mpol_needs_cond_ref(pol)))
__mpol_put(pol);
- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
return page;
}
@@ -2089,7 +2089,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
pol = &default_policy;
retry_cpuset:
- cpuset_mems_cookie = get_mems_allowed();
+ cpuset_mems_cookie = read_mems_allowed_begin();
/*
* No reference counting needed for current->mempolicy
@@ -2102,7 +2102,7 @@ retry_cpuset:
policy_zonelist(gfp, pol, numa_node_id()),
policy_nodemask(gfp, pol));
- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
return page;
diff --git a/mm/migrate.c b/mm/migrate.c
index e3cf71dd1288..96d4d814ae2f 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -867,8 +867,9 @@ out:
* Obtain the lock on page, remove all ptes and migrate the page
* to the newly allocated page in newpage.
*/
-static int unmap_and_move(new_page_t get_new_page, unsigned long private,
- struct page *page, int force, enum migrate_mode mode)
+static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page,
+ unsigned long private, struct page *page, int force,
+ enum migrate_mode mode)
{
int rc = 0;
int *result = NULL;
@@ -912,11 +913,18 @@ out:
page_is_file_cache(page));
putback_lru_page(page);
}
+
/*
- * Move the new page to the LRU. If migration was not successful
- * then this will free the page.
+ * If migration was not successful and there's a freeing callback, use
+ * it. Otherwise, putback_lru_page() will drop the reference grabbed
+ * during isolation.
*/
- putback_lru_page(newpage);
+ if (rc != MIGRATEPAGE_SUCCESS && put_new_page) {
+ ClearPageSwapBacked(newpage);
+ put_new_page(newpage, private);
+ } else
+ putback_lru_page(newpage);
+
if (result) {
if (rc)
*result = rc;
@@ -945,8 +953,9 @@ out:
* will wait in the page fault for migration to complete.
*/
static int unmap_and_move_huge_page(new_page_t get_new_page,
- unsigned long private, struct page *hpage,
- int force, enum migrate_mode mode)
+ free_page_t put_new_page, unsigned long private,
+ struct page *hpage, int force,
+ enum migrate_mode mode)
{
int rc = 0;
int *result = NULL;
@@ -982,20 +991,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
if (!page_mapped(hpage))
rc = move_to_new_page(new_hpage, hpage, 1, mode);
- if (rc)
+ if (rc != MIGRATEPAGE_SUCCESS)
remove_migration_ptes(hpage, hpage);
if (anon_vma)
put_anon_vma(anon_vma);
- if (!rc)
+ if (rc == MIGRATEPAGE_SUCCESS)
hugetlb_cgroup_migrate(hpage, new_hpage);
unlock_page(hpage);
out:
if (rc != -EAGAIN)
putback_active_hugepage(hpage);
- put_page(new_hpage);
+
+ /*
+ * If migration was not successful and there's a freeing callback, use
+ * it. Otherwise, put_page() will drop the reference grabbed during
+ * isolation.
+ */
+ if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
+ put_new_page(new_hpage, private);
+ else
+ put_page(new_hpage);
+
if (result) {
if (rc)
*result = rc;
@@ -1012,6 +1031,8 @@ out:
* @from: The list of pages to be migrated.
* @get_new_page: The function used to allocate free pages to be used
* as the target of the page migration.
+ * @put_new_page: The function used to free target pages if migration
+ * fails, or NULL if no special handling is necessary.
* @private: Private data to be passed on to get_new_page()
* @mode: The migration mode that specifies the constraints for
* page migration, if any.
@@ -1025,7 +1046,8 @@ out:
* Returns the number of pages that were not migrated, or an error code.
*/
int migrate_pages(struct list_head *from, new_page_t get_new_page,
- unsigned long private, enum migrate_mode mode, int reason)
+ free_page_t put_new_page, unsigned long private,
+ enum migrate_mode mode, int reason)
{
int retry = 1;
int nr_failed = 0;
@@ -1047,10 +1069,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
if (PageHuge(page))
rc = unmap_and_move_huge_page(get_new_page,
- private, page, pass > 2, mode);
+ put_new_page, private, page,
+ pass > 2, mode);
else
- rc = unmap_and_move(get_new_page, private,
- page, pass > 2, mode);
+ rc = unmap_and_move(get_new_page, put_new_page,
+ private, page, pass > 2, mode);
switch(rc) {
case -ENOMEM:
@@ -1194,7 +1217,7 @@ set_status:
err = 0;
if (!list_empty(&pagelist)) {
- err = migrate_pages(&pagelist, new_page_node,
+ err = migrate_pages(&pagelist, new_page_node, NULL,
(unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
if (err)
putback_movable_pages(&pagelist);
@@ -1643,7 +1666,8 @@ int migrate_misplaced_page(struct page *page, int node)
list_add(&page->lru, &migratepages);
nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
- node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
+ NULL, node, MIGRATE_ASYNC,
+ MR_NUMA_MISPLACED);
if (nr_remaining) {
putback_lru_pages(&migratepages);
isolated = 0;
diff --git a/mm/mincore.c b/mm/mincore.c
index da2be56a7b8f..06cb81005c77 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -70,13 +70,21 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
* any other file mapping (ie. marked !present and faulted in with
* tmpfs's .fault). So swapped out tmpfs mappings are tested here.
*/
- page = find_get_page(mapping, pgoff);
#ifdef CONFIG_SWAP
- /* shmem/tmpfs may return swap: account for swapcache page too. */
- if (radix_tree_exceptional_entry(page)) {
- swp_entry_t swap = radix_to_swp_entry(page);
- page = find_get_page(swap_address_space(swap), swap.val);
- }
+ if (shmem_mapping(mapping)) {
+ page = find_get_entry(mapping, pgoff);
+ /*
+ * shmem/tmpfs may return swap: account for swapcache
+ * page too.
+ */
+ if (radix_tree_exceptional_entry(page)) {
+ swp_entry_t swp = radix_to_swp_entry(page);
+ page = find_get_page(swap_address_space(swp), swp.val);
+ }
+ } else
+ page = find_get_page(mapping, pgoff);
+#else
+ page = find_get_page(mapping, pgoff);
#endif
if (page) {
present = PageUptodate(page);
diff --git a/mm/mmap.c b/mm/mmap.c
index af99b9ed2007..c1249cb7dc15 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -10,6 +10,7 @@
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
@@ -682,8 +683,9 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
prev->vm_next = next = vma->vm_next;
if (next)
next->vm_prev = prev;
- if (mm->mmap_cache == vma)
- mm->mmap_cache = prev;
+
+ /* Kill the cache */
+ vmacache_invalidate(mm);
}
/*
@@ -1980,34 +1982,33 @@ EXPORT_SYMBOL(get_unmapped_area);
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
- struct vm_area_struct *vma = NULL;
+ struct rb_node *rb_node;
+ struct vm_area_struct *vma;
/* Check the cache first. */
- /* (Cache hit rate is typically around 35%.) */
- vma = ACCESS_ONCE(mm->mmap_cache);
- if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
- struct rb_node *rb_node;
+ vma = vmacache_find(mm, addr);
+ if (likely(vma))
+ return vma;
- rb_node = mm->mm_rb.rb_node;
- vma = NULL;
+ rb_node = mm->mm_rb.rb_node;
+ vma = NULL;
- while (rb_node) {
- struct vm_area_struct *vma_tmp;
-
- vma_tmp = rb_entry(rb_node,
- struct vm_area_struct, vm_rb);
-
- if (vma_tmp->vm_end > addr) {
- vma = vma_tmp;
- if (vma_tmp->vm_start <= addr)
- break;
- rb_node = rb_node->rb_left;
- } else
- rb_node = rb_node->rb_right;
- }
- if (vma)
- mm->mmap_cache = vma;
+ while (rb_node) {
+ struct vm_area_struct *tmp;
+
+ tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
+
+ if (tmp->vm_end > addr) {
+ vma = tmp;
+ if (tmp->vm_start <= addr)
+ break;
+ rb_node = rb_node->rb_left;
+ } else
+ rb_node = rb_node->rb_right;
}
+
+ if (vma)
+ vmacache_update(addr, vma);
return vma;
}
@@ -2379,7 +2380,9 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
} else
mm->highest_vm_end = prev ? prev->vm_end : 0;
tail_vma->vm_next = NULL;
- mm->mmap_cache = NULL; /* Kill the cache. */
+
+ /* Kill the cache */
+ vmacache_invalidate(mm);
}
/*
diff --git a/mm/nommu.c b/mm/nommu.c
index ecd1f158548e..1221d2b66e97 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -15,6 +15,7 @@
#include <linux/export.h>
#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/file.h>
@@ -767,16 +768,23 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
*/
static void delete_vma_from_mm(struct vm_area_struct *vma)
{
+ int i;
struct address_space *mapping;
struct mm_struct *mm = vma->vm_mm;
+ struct task_struct *curr = current;
kenter("%p", vma);
protect_vma(vma, 0);
mm->map_count--;
- if (mm->mmap_cache == vma)
- mm->mmap_cache = NULL;
+ for (i = 0; i < VMACACHE_SIZE; i++) {
+ /* if the vma is cached, invalidate the entire cache */
+ if (curr->vmacache[i] == vma) {
+ vmacache_invalidate(curr->mm);
+ break;
+ }
+ }
/* remove the VMA from the mapping */
if (vma->vm_file) {
@@ -824,8 +832,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
struct vm_area_struct *vma;
/* check the cache first */
- vma = ACCESS_ONCE(mm->mmap_cache);
- if (vma && vma->vm_start <= addr && vma->vm_end > addr)
+ vma = vmacache_find(mm, addr);
+ if (likely(vma))
return vma;
/* trawl the list (there may be multiple mappings in which addr
@@ -834,7 +842,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
if (vma->vm_start > addr)
return NULL;
if (vma->vm_end > addr) {
- mm->mmap_cache = vma;
+ vmacache_update(addr, vma);
return vma;
}
}
@@ -873,8 +881,8 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
unsigned long end = addr + len;
/* check the cache first */
- vma = mm->mmap_cache;
- if (vma && vma->vm_start == addr && vma->vm_end == end)
+ vma = vmacache_find_exact(mm, addr, end);
+ if (vma)
return vma;
/* trawl the list (there may be multiple mappings in which addr
@@ -885,7 +893,7 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
if (vma->vm_start > addr)
return NULL;
if (vma->vm_end == end) {
- mm->mmap_cache = vma;
+ vmacache_update(addr, vma);
return vma;
}
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 32fbe4d7577c..b43aa10fd623 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -418,7 +418,8 @@ static int destroy_compound_page(struct page *page, unsigned long order)
return bad;
}
-static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
+static inline void prep_zero_page(struct page *page, unsigned int order,
+ gfp_t gfp_flags)
{
int i;
@@ -462,7 +463,7 @@ static inline void set_page_guard_flag(struct page *page) { }
static inline void clear_page_guard_flag(struct page *page) { }
#endif
-static inline void set_page_order(struct page *page, int order)
+static inline void set_page_order(struct page *page, unsigned int order)
{
set_page_private(page, order);
__SetPageBuddy(page);
@@ -513,21 +514,31 @@ __find_buddy_index(unsigned long page_idx, unsigned int order)
* For recording page's order, we use page_private(page).
*/
static inline int page_is_buddy(struct page *page, struct page *buddy,
- int order)
+ unsigned int order)
{
if (!pfn_valid_within(page_to_pfn(buddy)))
return 0;
- if (page_zone_id(page) != page_zone_id(buddy))
- return 0;
-
if (page_is_guard(buddy) && page_order(buddy) == order) {
VM_BUG_ON(page_count(buddy) != 0);
+
+ if (page_zone_id(page) != page_zone_id(buddy))
+ return 0;
+
return 1;
}
if (PageBuddy(buddy) && page_order(buddy) == order) {
VM_BUG_ON(page_count(buddy) != 0);
+
+ /*
+ * zone check is done late to avoid uselessly
+ * calculating zone/node ids for pages that could
+ * never merge.
+ */
+ if (page_zone_id(page) != page_zone_id(buddy))
+ return 0;
+
return 1;
}
return 0;
@@ -559,6 +570,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
*/
static inline void __free_one_page(struct page *page,
+ unsigned long pfn,
struct zone *zone, unsigned int order,
int migratetype)
{
@@ -575,7 +587,7 @@ static inline void __free_one_page(struct page *page,
VM_BUG_ON(migratetype == -1);
- page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
+ page_idx = pfn & ((1 << MAX_ORDER) - 1);
VM_BUG_ON(page_idx & ((1 << order) - 1));
VM_BUG_ON(bad_range(zone, page));
@@ -663,10 +675,13 @@ static void free_pcppages_bulk(struct zone *zone, int count,
struct list_head *list)
{
int to_free = count;
+ unsigned long nr_scanned;
unsigned long flags;
spin_lock_irqsave(&zone->lock, flags);
- zone->pages_scanned = 0;
+ nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
+ if (nr_scanned)
+ __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
while (!list_empty(list)) {
struct page *page = list_first_entry(list, struct page, lru);
@@ -677,7 +692,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
mt = get_freepage_migratetype(page);
/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
- __free_one_page(page, zone, 0, mt);
+ __free_one_page(page, page_to_pfn(page), zone, 0, mt);
trace_mm_page_pcpu_drain(page, 0, mt);
if (likely(!is_migrate_isolate_page(page))) {
__mod_zone_page_state(zone, NR_FREE_PAGES, 1);
@@ -733,15 +748,20 @@ static void isolate_pcp_pages(int to_free, struct per_cpu_pages *src,
}
}
-static void free_one_page(struct zone *zone, struct page *page, int order,
+static void free_one_page(struct zone *zone,
+ struct page *page, unsigned long pfn,
+ unsigned int order,
int migratetype)
{
+ unsigned long nr_scanned;
unsigned long flags;
spin_lock_irqsave(&zone->lock, flags);
- zone->pages_scanned = 0;
+ nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
+ if (nr_scanned)
+ __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
- __free_one_page(page, zone, order, migratetype);
+ __free_one_page(page, pfn, zone, order, migratetype);
if (unlikely(!is_migrate_isolate(migratetype)))
__mod_zone_freepage_state(zone, 1 << order, migratetype);
spin_unlock_irqrestore(&zone->lock, flags);
@@ -778,15 +798,16 @@ static void __free_pages_ok(struct page *page, unsigned int order)
{
unsigned long flags;
int migratetype;
+ unsigned long pfn = page_to_pfn(page);
if (!free_pages_prepare(page, order))
return;
+ migratetype = get_pfnblock_migratetype(page, pfn);
local_lock_irqsave(pa_lock, flags);
__count_vm_events(PGFREE, 1 << order);
- migratetype = get_pageblock_migratetype(page);
set_freepage_migratetype(page, migratetype);
- free_one_page(page_zone(page), page, order, migratetype);
+ free_one_page(page_zone(page), page, pfn, order, migratetype);
local_unlock_irqrestore(pa_lock, flags);
}
@@ -906,7 +927,7 @@ static inline int check_new_page(struct page *page)
return 0;
}
-static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
+static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
{
int i;
@@ -955,6 +976,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
rmv_page_order(page);
area->nr_free--;
expand(zone, page, order, current_order, area, migratetype);
+ set_freepage_migratetype(page, migratetype);
return page;
}
@@ -1079,6 +1101,12 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,
{
int current_order = page_order(page);
+ /*
+ * When borrowing from MIGRATE_CMA, we need to release the excess
+ * buddy pages to CMA itself. We also ensure the freepage_migratetype
+ * is set to CMA so it is returned to the correct freelist in case
+ * the page ends up being not actually allocated from the pcp lists.
+ */
if (is_migrate_cma(fallback_type))
return fallback_type;
@@ -1110,16 +1138,17 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,
/* Remove an element from the buddy allocator from the fallback list */
static inline struct page *
-__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
+__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
{
struct free_area *area;
- int current_order;
+ unsigned int current_order;
struct page *page;
int migratetype, new_type, i;
/* Find the largest possible block of pages in the other list */
- for (current_order = MAX_ORDER-1; current_order >= order;
- --current_order) {
+ for (current_order = MAX_ORDER-1;
+ current_order >= order && current_order <= MAX_ORDER-1;
+ --current_order) {
for (i = 0;; i++) {
migratetype = fallbacks[start_migratetype][i];
@@ -1143,21 +1172,17 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
list_del(&page->lru);
rmv_page_order(page);
- /*
- * Borrow the excess buddy pages as well, irrespective
- * of whether we stole freepages, or took ownership of
- * the pageblock or not.
- *
- * Exception: When borrowing from MIGRATE_CMA, release
- * the excess buddy pages to CMA itself.
- */
expand(zone, page, order, current_order, area,
- is_migrate_cma(migratetype)
- ? migratetype : start_migratetype);
+ new_type);
+ /* The freepage_migratetype may differ from pageblock's
+ * migratetype depending on the decisions in
+ * try_to_steal_freepages. This is OK as long as it does
+ * not differ for MIGRATE_CMA type.
+ */
+ set_freepage_migratetype(page, new_type);
- trace_mm_page_alloc_extfrag(page, order,
- current_order, start_migratetype, migratetype,
- new_type == start_migratetype);
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, migratetype, new_type);
return page;
}
@@ -1203,9 +1228,9 @@ retry_reserve:
*/
static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
- int migratetype, int cold)
+ int migratetype, bool cold)
{
- int mt = migratetype, i;
+ int i;
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
@@ -1222,18 +1247,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
* merge IO requests if the physical pages are ordered
* properly.
*/
- if (likely(cold == 0))
+ if (likely(!cold))
list_add(&page->lru, list);
else
list_add_tail(&page->lru, list);
- if (IS_ENABLED(CONFIG_CMA)) {
- mt = get_pageblock_migratetype(page);
- if (!is_migrate_cma(mt) && !is_migrate_isolate(mt))
- mt = migratetype;
- }
- set_freepage_migratetype(page, mt);
list = &page->lru;
- if (is_migrate_cma(mt))
+ if (is_migrate_cma(get_freepage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
-(1 << order));
}
@@ -1369,7 +1388,7 @@ void mark_free_pages(struct zone *zone)
{
unsigned long pfn, max_zone_pfn;
unsigned long flags;
- int order, t;
+ unsigned int order, t;
struct list_head *curr;
if (zone_is_empty(zone))
@@ -1401,19 +1420,20 @@ void mark_free_pages(struct zone *zone)
/*
* Free a 0-order page
- * cold == 1 ? free a cold page : free a hot page
+ * cold == true ? free a cold page : free a hot page
*/
-void free_hot_cold_page(struct page *page, int cold)
+void free_hot_cold_page(struct page *page, bool cold)
{
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
unsigned long flags;
+ unsigned long pfn = page_to_pfn(page);
int migratetype;
if (!free_pages_prepare(page, 0))
return;
- migratetype = get_pageblock_migratetype(page);
+ migratetype = get_pfnblock_migratetype(page, pfn);
set_freepage_migratetype(page, migratetype);
local_lock_irqsave(pa_lock, flags);
__count_vm_event(PGFREE);
@@ -1427,17 +1447,17 @@ void free_hot_cold_page(struct page *page, int cold)
*/
if (migratetype >= MIGRATE_PCPTYPES) {
if (unlikely(is_migrate_isolate(migratetype))) {
- free_one_page(zone, page, 0, migratetype);
+ free_one_page(zone, page, pfn, 0, migratetype);
goto out;
}
migratetype = MIGRATE_MOVABLE;
}
pcp = &this_cpu_ptr(zone->pageset)->pcp;
- if (cold)
- list_add_tail(&page->lru, &pcp->lists[migratetype]);
- else
+ if (!cold)
list_add(&page->lru, &pcp->lists[migratetype]);
+ else
+ list_add_tail(&page->lru, &pcp->lists[migratetype]);
pcp->count++;
if (pcp->count >= pcp->high) {
unsigned long batch = ACCESS_ONCE(pcp->batch);
@@ -1457,7 +1477,7 @@ out:
/*
* Free a list of 0-order pages
*/
-void free_hot_cold_page_list(struct list_head *list, int cold)
+void free_hot_cold_page_list(struct list_head *list, bool cold)
{
struct page *page, *next;
@@ -1569,12 +1589,12 @@ int split_free_page(struct page *page)
*/
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
- struct zone *zone, int order, gfp_t gfp_flags,
- int migratetype)
+ struct zone *zone, unsigned int order,
+ gfp_t gfp_flags, int migratetype)
{
unsigned long flags;
struct page *page;
- int cold = !!(gfp_flags & __GFP_COLD);
+ bool cold = ((gfp_flags & __GFP_COLD) != 0);
again:
if (likely(order == 0)) {
@@ -1620,11 +1640,14 @@ again:
goto failed;
}
__mod_zone_freepage_state(zone, -(1 << order),
- get_pageblock_migratetype(page));
+ get_freepage_migratetype(page));
spin_unlock(&zone->lock);
}
__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+ if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 &&
+ !zone_is_fair_depleted(zone))
+ zone_set_flag(zone, ZONE_FAIR_DEPLETED);
__count_zone_vm_events(PGALLOC, zone, 1 << order);
zone_statistics(preferred_zone, zone, gfp_flags);
@@ -1721,12 +1744,12 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
* Return true if free pages are above 'mark'. This takes into account the order
* of the allocation.
*/
-static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
- int classzone_idx, int alloc_flags, long free_pages)
+static bool __zone_watermark_ok(struct zone *z, unsigned int order,
+ unsigned long mark, int classzone_idx, int alloc_flags,
+ long free_pages)
{
/* free_pages my go negative - that's OK */
long min = mark;
- long lowmem_reserve = z->lowmem_reserve[classzone_idx];
int o;
long free_cma = 0;
@@ -1741,7 +1764,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
#endif
- if (free_pages - free_cma <= min + lowmem_reserve)
+ if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])
return false;
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
@@ -1756,15 +1779,15 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
return true;
}
-bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
int classzone_idx, int alloc_flags)
{
return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
zone_page_state(z, NR_FREE_PAGES));
}
-bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
- int classzone_idx, int alloc_flags)
+bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
+ unsigned long mark, int classzone_idx, int alloc_flags)
{
long free_pages = zone_page_state(z, NR_FREE_PAGES);
@@ -1906,7 +1929,7 @@ static void __paginginit init_zone_allows_reclaim(int nid)
{
int i;
- for_each_online_node(i)
+ for_each_node_state(i, N_MEMORY)
if (node_distance(nid, i) <= RECLAIM_DISTANCE)
node_set(i, NODE_DATA(nid)->reclaim_nodes);
else
@@ -1949,6 +1972,18 @@ static inline void init_zone_allows_reclaim(int nid)
}
#endif /* CONFIG_NUMA */
+static void reset_alloc_batches(struct zone *preferred_zone)
+{
+ struct zone *zone = preferred_zone->zone_pgdat->node_zones;
+
+ do {
+ mod_zone_page_state(zone, NR_ALLOC_BATCH,
+ high_wmark_pages(zone) - low_wmark_pages(zone) -
+ atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
+ zone_clear_flag(zone, ZONE_FAIR_DEPLETED);
+ } while (zone++ != preferred_zone);
+}
+
/*
* get_page_from_freelist goes through the zonelist trying to allocate
* a page.
@@ -1956,18 +1991,22 @@ static inline void init_zone_allows_reclaim(int nid)
static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
- struct zone *preferred_zone, int migratetype)
+ struct zone *preferred_zone, int classzone_idx, int migratetype)
{
struct zoneref *z;
struct page *page = NULL;
- int classzone_idx;
struct zone *zone;
nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
int zlc_active = 0; /* set if using zonelist_cache */
int did_zlc_setup = 0; /* just call zlc_setup() one time */
+ bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
+ (gfp_mask & __GFP_WRITE);
+ int nr_fair_skipped = 0;
+ bool zonelist_rescan;
- classzone_idx = zone_idx(preferred_zone);
zonelist_scan:
+ zonelist_rescan = false;
+
/*
* Scan zonelist, looking for a zone with enough free.
* See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
@@ -1979,12 +2018,10 @@ zonelist_scan:
if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
- if ((alloc_flags & ALLOC_CPUSET) &&
+ if (cpusets_enabled() &&
+ (alloc_flags & ALLOC_CPUSET) &&
!cpuset_zone_allowed_softwall(zone, gfp_mask))
continue;
- BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
- if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS))
- goto try_this_zone;
/*
* Distribute pages in proportion to the individual
* zone size to ensure fair page aging. The zone a
@@ -1993,9 +2030,11 @@ zonelist_scan:
*/
if (alloc_flags & ALLOC_FAIR) {
if (!zone_local(preferred_zone, zone))
+ break;
+ if (zone_is_fair_depleted(zone)) {
+ nr_fair_skipped++;
continue;
- if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
- continue;
+ }
}
/*
* When allocating a page cache page for writing, we
@@ -2023,15 +2062,19 @@ zonelist_scan:
* will require awareness of zones in the
* dirty-throttling and the flusher threads.
*/
- if ((alloc_flags & ALLOC_WMARK_LOW) &&
- (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
- goto this_zone_full;
+ if (consider_zone_dirty && !zone_dirty_ok(zone))
+ continue;
mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
if (!zone_watermark_ok(zone, order, mark,
classzone_idx, alloc_flags)) {
int ret;
+ /* Checked here to keep the fast path fast */
+ BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
+ if (alloc_flags & ALLOC_NO_WATERMARKS)
+ goto try_this_zone;
+
if (IS_ENABLED(CONFIG_NUMA) &&
!did_zlc_setup && nr_online_nodes > 1) {
/*
@@ -2093,17 +2136,11 @@ try_this_zone:
if (page)
break;
this_zone_full:
- if (IS_ENABLED(CONFIG_NUMA))
+ if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
zlc_mark_zone_full(zonelist, z);
}
- if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
- /* Disable zlc cache for second zonelist scan */
- zlc_active = 0;
- goto zonelist_scan;
- }
-
- if (page)
+ if (page) {
/*
* page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
* necessary to allocate the page. The expectation is
@@ -2112,8 +2149,37 @@ this_zone_full:
* for !PFMEMALLOC purposes.
*/
page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
+ return page;
+ }
- return page;
+ /*
+ * The first pass makes sure allocations are spread fairly within the
+ * local node. However, the local node might have free pages left
+ * after the fairness batches are exhausted, and remote zones haven't
+ * even been considered yet. Try once more without fairness, and
+ * include remote zones now, before entering the slowpath and waking
+ * kswapd: prefer spilling to a remote zone over swapping locally.
+ */
+ if (alloc_flags & ALLOC_FAIR) {
+ alloc_flags &= ~ALLOC_FAIR;
+ if (nr_fair_skipped) {
+ zonelist_rescan = true;
+ reset_alloc_batches(preferred_zone);
+ }
+ if (nr_online_nodes > 1)
+ zonelist_rescan = true;
+ }
+
+ if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
+ /* Disable zlc cache for second zonelist scan */
+ zlc_active = 0;
+ zonelist_rescan = true;
+ }
+
+ if (zonelist_rescan)
+ goto zonelist_scan;
+
+ return NULL;
}
/*
@@ -2229,7 +2295,7 @@ static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, struct zone *preferred_zone,
- int migratetype)
+ int classzone_idx, int migratetype)
{
struct page *page;
@@ -2247,7 +2313,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
order, zonelist, high_zoneidx,
ALLOC_WMARK_HIGH|ALLOC_CPUSET,
- preferred_zone, migratetype);
+ preferred_zone, classzone_idx, migratetype);
if (page)
goto out;
@@ -2282,7 +2348,7 @@ static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, bool sync_migration,
+ int classzone_idx, int migratetype, enum migrate_mode mode,
bool *contended_compaction, bool *deferred_compaction,
unsigned long *did_some_progress)
{
@@ -2296,7 +2362,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
current->flags |= PF_MEMALLOC;
*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
- nodemask, sync_migration,
+ nodemask, mode,
contended_compaction);
current->flags &= ~PF_MEMALLOC;
@@ -2310,13 +2376,10 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
page = get_page_from_freelist(gfp_mask, nodemask,
order, zonelist, high_zoneidx,
alloc_flags & ~ALLOC_NO_WATERMARKS,
- preferred_zone, migratetype);
+ preferred_zone, classzone_idx, migratetype);
if (page) {
preferred_zone->compact_blockskip_flush = false;
- preferred_zone->compact_considered = 0;
- preferred_zone->compact_defer_shift = 0;
- if (order >= preferred_zone->compact_order_failed)
- preferred_zone->compact_order_failed = order + 1;
+ compaction_defer_reset(preferred_zone, order, true);
count_vm_event(COMPACTSUCCESS);
return page;
}
@@ -2332,7 +2395,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
* As async compaction considers a subset of pageblocks, only
* defer if the failure was a sync compaction failure.
*/
- if (sync_migration)
+ if (mode != MIGRATE_ASYNC)
defer_compaction(preferred_zone, order);
cond_resched();
@@ -2345,9 +2408,9 @@ static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, bool sync_migration,
- bool *contended_compaction, bool *deferred_compaction,
- unsigned long *did_some_progress)
+ int classzone_idx, int migratetype,
+ enum migrate_mode mode, bool *contended_compaction,
+ bool *deferred_compaction, unsigned long *did_some_progress)
{
return NULL;
}
@@ -2386,7 +2449,7 @@ static inline struct page *
__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, unsigned long *did_some_progress)
+ int classzone_idx, int migratetype, unsigned long *did_some_progress)
{
struct page *page = NULL;
bool drained = false;
@@ -2404,7 +2467,8 @@ retry:
page = get_page_from_freelist(gfp_mask, nodemask, order,
zonelist, high_zoneidx,
alloc_flags & ~ALLOC_NO_WATERMARKS,
- preferred_zone, migratetype);
+ preferred_zone, classzone_idx,
+ migratetype);
/*
* If an allocation failed after direct reclaim, it could be because
@@ -2427,14 +2491,14 @@ static inline struct page *
__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, struct zone *preferred_zone,
- int migratetype)
+ int classzone_idx, int migratetype)
{
struct page *page;
do {
page = get_page_from_freelist(gfp_mask, nodemask, order,
zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
- preferred_zone, migratetype);
+ preferred_zone, classzone_idx, migratetype);
if (!page && gfp_mask & __GFP_NOFAIL)
wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
@@ -2443,28 +2507,6 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
return page;
}
-static void reset_alloc_batches(struct zonelist *zonelist,
- enum zone_type high_zoneidx,
- struct zone *preferred_zone)
-{
- struct zoneref *z;
- struct zone *zone;
-
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
- /*
- * Only reset the batches of zones that were actually
- * considered in the fairness pass, we don't want to
- * trash fairness information for zones that are not
- * actually part of this zonelist's round-robin cycle.
- */
- if (!zone_local(preferred_zone, zone))
- continue;
- mod_zone_page_state(zone, NR_ALLOC_BATCH,
- high_wmark_pages(zone) - low_wmark_pages(zone) -
- atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
- }
-}
-
static void wake_all_kswapds(unsigned int order,
struct zonelist *zonelist,
enum zone_type high_zoneidx,
@@ -2535,14 +2577,14 @@ static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, struct zone *preferred_zone,
- int migratetype)
+ int classzone_idx, int migratetype)
{
const gfp_t wait = gfp_mask & __GFP_WAIT;
struct page *page = NULL;
int alloc_flags;
unsigned long pages_reclaimed = 0;
unsigned long did_some_progress;
- bool sync_migration = false;
+ enum migrate_mode migration_mode = MIGRATE_ASYNC;
bool deferred_compaction = false;
bool contended_compaction = false;
@@ -2584,15 +2626,19 @@ restart:
* Find the true preferred zone if the allocation is unconstrained by
* cpusets.
*/
- if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
- first_zones_zonelist(zonelist, high_zoneidx, NULL,
- &preferred_zone);
+ if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) {
+ struct zoneref *preferred_zoneref;
+ preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
+ NULL,
+ &preferred_zone);
+ classzone_idx = zonelist_zone_idx(preferred_zoneref);
+ }
rebalance:
/* This is the last chance, in general, before the goto nopage. */
page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
- preferred_zone, migratetype);
+ preferred_zone, classzone_idx, migratetype);
if (page)
goto got_pg;
@@ -2607,7 +2653,7 @@ rebalance:
page = __alloc_pages_high_priority(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
- preferred_zone, migratetype);
+ preferred_zone, classzone_idx, migratetype);
if (page) {
goto got_pg;
}
@@ -2629,17 +2675,16 @@ rebalance:
* Try direct compaction. The first pass is asynchronous. Subsequent
* attempts after direct reclaim are synchronous
*/
- page = __alloc_pages_direct_compact(gfp_mask, order,
- zonelist, high_zoneidx,
- nodemask,
- alloc_flags, preferred_zone,
- migratetype, sync_migration,
- &contended_compaction,
+ page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
+ high_zoneidx, nodemask, alloc_flags,
+ preferred_zone,
+ classzone_idx, migratetype,
+ migration_mode, &contended_compaction,
&deferred_compaction,
&did_some_progress);
if (page)
goto got_pg;
- sync_migration = true;
+ migration_mode = MIGRATE_SYNC_LIGHT;
/*
* If compaction is deferred for high-order allocations, it is because
@@ -2656,7 +2701,8 @@ rebalance:
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
- migratetype, &did_some_progress);
+ classzone_idx, migratetype,
+ &did_some_progress);
if (page)
goto got_pg;
@@ -2675,7 +2721,7 @@ rebalance:
page = __alloc_pages_may_oom(gfp_mask, order,
zonelist, high_zoneidx,
nodemask, preferred_zone,
- migratetype);
+ classzone_idx, migratetype);
if (page)
goto got_pg;
@@ -2714,12 +2760,11 @@ rebalance:
* direct reclaim and reclaim/compaction depends on compaction
* being called after reclaim so call directly if necessary
*/
- page = __alloc_pages_direct_compact(gfp_mask, order,
- zonelist, high_zoneidx,
- nodemask,
- alloc_flags, preferred_zone,
- migratetype, sync_migration,
- &contended_compaction,
+ page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
+ high_zoneidx, nodemask, alloc_flags,
+ preferred_zone,
+ classzone_idx, migratetype,
+ migration_mode, &contended_compaction,
&deferred_compaction,
&did_some_progress);
if (page)
@@ -2745,11 +2790,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
{
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
struct zone *preferred_zone;
+ struct zoneref *preferred_zoneref;
struct page *page = NULL;
int migratetype = allocflags_to_migratetype(gfp_mask);
unsigned int cpuset_mems_cookie;
int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
struct mem_cgroup *memcg = NULL;
+ int classzone_idx;
gfp_mask &= gfp_allowed_mask;
@@ -2776,42 +2823,26 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
return NULL;
retry_cpuset:
- cpuset_mems_cookie = get_mems_allowed();
+ cpuset_mems_cookie = read_mems_allowed_begin();
/* The preferred zone is used for statistics later */
- first_zones_zonelist(zonelist, high_zoneidx,
+ preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
nodemask ? : &cpuset_current_mems_allowed,
&preferred_zone);
if (!preferred_zone)
goto out;
+ classzone_idx = zonelist_zone_idx(preferred_zoneref);
#ifdef CONFIG_CMA
if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
#endif
-retry:
/* First allocation attempt */
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
zonelist, high_zoneidx, alloc_flags,
- preferred_zone, migratetype);
+ preferred_zone, classzone_idx, migratetype);
if (unlikely(!page)) {
/*
- * The first pass makes sure allocations are spread
- * fairly within the local node. However, the local
- * node might have free pages left after the fairness
- * batches are exhausted, and remote zones haven't
- * even been considered yet. Try once more without
- * fairness, and include remote zones now, before
- * entering the slowpath and waking kswapd: prefer
- * spilling to a remote zone over swapping locally.
- */
- if (alloc_flags & ALLOC_FAIR) {
- reset_alloc_batches(zonelist, high_zoneidx,
- preferred_zone);
- alloc_flags &= ~ALLOC_FAIR;
- goto retry;
- }
- /*
* Runtime PM, block IO and its error handling path
* can deadlock because I/O on the device might not
* complete.
@@ -2819,7 +2850,7 @@ retry:
gfp_mask = memalloc_noio_flags(gfp_mask);
page = __alloc_pages_slowpath(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
- preferred_zone, migratetype);
+ preferred_zone, classzone_idx, migratetype);
}
trace_mm_page_alloc(page, order, gfp_mask, migratetype);
@@ -2831,7 +2862,7 @@ out:
* the mask is being updated. If a page allocation is about to fail,
* check if the cpuset changed during allocation and if so, retry.
*/
- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
memcg_kmem_commit_charge(page, memcg, order);
@@ -2870,7 +2901,7 @@ void __free_pages(struct page *page, unsigned int order)
{
if (put_page_testzero(page)) {
if (order == 0)
- free_hot_cold_page(page, 0);
+ free_hot_cold_page(page, false);
else
__free_pages_ok(page, order);
}
@@ -3099,9 +3130,9 @@ bool skip_free_areas_node(unsigned int flags, int nid)
goto out;
do {
- cpuset_mems_cookie = get_mems_allowed();
+ cpuset_mems_cookie = read_mems_allowed_begin();
ret = !node_isset(nid, cpuset_current_mems_allowed);
- } while (!put_mems_allowed(cpuset_mems_cookie));
+ } while (read_mems_allowed_retry(cpuset_mems_cookie));
out:
return ret;
}
@@ -3254,12 +3285,12 @@ void show_free_areas(unsigned int filter)
K(zone_page_state(zone, NR_BOUNCE)),
K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
- zone->pages_scanned,
+ K(zone_page_state(zone, NR_PAGES_SCANNED)),
(!zone_reclaimable(zone) ? "yes" : "no")
);
printk("lowmem_reserve[]:");
for (i = 0; i < MAX_NR_ZONES; i++)
- printk(" %lu", zone->lowmem_reserve[i]);
+ printk(" %ld", zone->lowmem_reserve[i]);
printk("\n");
}
@@ -3999,6 +4030,7 @@ static void setup_zone_migrate_reserve(struct zone *zone)
struct page *page;
unsigned long block_migratetype;
int reserve;
+ int old_reserve;
/*
* Get the start pfn, end pfn and the number of blocks to reserve
@@ -4020,6 +4052,12 @@ static void setup_zone_migrate_reserve(struct zone *zone)
* future allocation of hugepages at runtime.
*/
reserve = min(2, reserve);
+ old_reserve = zone->nr_migrate_reserve_block;
+
+ /* When memory hot-add, we almost always need to do nothing */
+ if (reserve == old_reserve)
+ return;
+ zone->nr_migrate_reserve_block = reserve;
for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
if (!pfn_valid(pfn))
@@ -4057,6 +4095,12 @@ static void setup_zone_migrate_reserve(struct zone *zone)
reserve--;
continue;
}
+ } else if (!old_reserve) {
+ /*
+ * At boot time we don't need to scan the whole zone
+ * for turning off MIGRATE_RESERVE.
+ */
+ break;
}
/*
@@ -4136,7 +4180,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
static void __meminit zone_init_free_lists(struct zone *zone)
{
- int order, t;
+ unsigned int order, t;
for_each_migratetype_order(order, t) {
INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
zone->free_area[order].nr_free = 0;
@@ -4959,7 +5003,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
- init_zone_allows_reclaim(nid);
+ if (node_state(nid, N_MEMORY))
+ init_zone_allows_reclaim(nid);
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
#endif
@@ -5549,7 +5594,7 @@ static void calculate_totalreserve_pages(void)
for_each_online_pgdat(pgdat) {
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
- unsigned long max = 0;
+ long max = 0;
/* Find valid and maximum lowmem_reserve in the zone */
for (j = i; j < MAX_NR_ZONES; j++) {
@@ -5791,7 +5836,12 @@ module_init(init_per_zone_wmark_min)
int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
- proc_dointvec(table, write, buffer, length, ppos);
+ int rc;
+
+ rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+ if (rc)
+ return rc;
+
if (write) {
user_min_free_kbytes = min_free_kbytes;
setup_per_zone_wmarks();
@@ -6033,17 +6083,16 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
* @end_bitidx: The last bit of interest
* returns pageblock_bits flags
*/
-unsigned long get_pageblock_flags_mask(struct page *page,
+unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
unsigned long end_bitidx,
unsigned long mask)
{
struct zone *zone;
unsigned long *bitmap;
- unsigned long pfn, bitidx, word_bitidx;
+ unsigned long bitidx, word_bitidx;
unsigned long word;
zone = page_zone(page);
- pfn = page_to_pfn(page);
bitmap = get_pageblock_bitmap(zone, pfn);
bitidx = pfn_to_bitidx(zone, pfn);
word_bitidx = bitidx / BITS_PER_LONG;
@@ -6055,25 +6104,25 @@ unsigned long get_pageblock_flags_mask(struct page *page,
}
/**
- * set_pageblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
+ * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
* @page: The page within the block of interest
* @start_bitidx: The first bit of interest
* @end_bitidx: The last bit of interest
* @flags: The flags to set
*/
-void set_pageblock_flags_mask(struct page *page, unsigned long flags,
+void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
+ unsigned long pfn,
unsigned long end_bitidx,
unsigned long mask)
{
struct zone *zone;
unsigned long *bitmap;
- unsigned long pfn, bitidx, word_bitidx;
+ unsigned long bitidx, word_bitidx;
unsigned long old_word, word;
BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
zone = page_zone(page);
- pfn = page_to_pfn(page);
bitmap = get_pageblock_bitmap(zone, pfn);
bitidx = pfn_to_bitidx(zone, pfn);
word_bitidx = bitidx / BITS_PER_LONG;
@@ -6251,7 +6300,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
cc->nr_migratepages -= nr_reclaimed;
ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
- 0, MIGRATE_SYNC, MR_CMA);
+ NULL, 0, cc->mode, MR_CMA);
}
if (ret < 0) {
putback_movable_pages(&cc->migratepages);
@@ -6290,7 +6339,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
.nr_migratepages = 0,
.order = -1,
.zone = page_zone(pfn_to_page(start)),
- .sync = true,
+ .mode = MIGRATE_SYNC,
.ignore_skip_hint = true,
};
INIT_LIST_HEAD(&cc.migratepages);
@@ -6445,7 +6494,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
struct page *page;
struct zone *zone;
- int order, i;
+ unsigned int order, i;
unsigned long pfn;
unsigned long flags;
/* find the first valid pfn */
@@ -6497,7 +6546,7 @@ bool is_free_buddy_page(struct page *page)
struct zone *zone = page_zone(page);
unsigned long pfn = page_to_pfn(page);
unsigned long flags;
- int order;
+ unsigned int order;
spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
diff --git a/mm/readahead.c b/mm/readahead.c
index e4ed04149785..0f35e983bffb 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -8,9 +8,7 @@
*/
#include <linux/kernel.h>
-#include <linux/fs.h>
#include <linux/gfp.h>
-#include <linux/mm.h>
#include <linux/export.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
@@ -20,6 +18,8 @@
#include <linux/syscalls.h>
#include <linux/file.h>
+#include "internal.h"
+
/*
* Initialise a struct file's readahead state. Assumes that the caller has
* memset *ra to zero.
@@ -149,8 +149,7 @@ out:
*
* Returns the number of pages requested, or the maximum amount of I/O allowed.
*/
-static int
-__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
+int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
pgoff_t offset, unsigned long nr_to_read,
unsigned long lookahead_size)
{
@@ -179,7 +178,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
rcu_read_lock();
page = radix_tree_lookup(&mapping->page_tree, page_offset);
rcu_read_unlock();
- if (page)
+ if (page && !radix_tree_exceptional_entry(page))
continue;
page = page_cache_alloc_readahead(mapping);
@@ -237,28 +236,14 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
return ret;
}
+#define MAX_READAHEAD ((512*4096)/PAGE_CACHE_SIZE)
/*
* Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
* sensible upper limit.
*/
unsigned long max_sane_readahead(unsigned long nr)
{
- return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE)
- + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
-}
-
-/*
- * Submit IO for the read-ahead request in file_ra_state.
- */
-unsigned long ra_submit(struct file_ra_state *ra,
- struct address_space *mapping, struct file *filp)
-{
- int actual;
-
- actual = __do_page_cache_readahead(mapping, filp,
- ra->start, ra->size, ra->async_size);
-
- return actual;
+ return min(nr, MAX_READAHEAD);
}
/*
@@ -351,7 +336,7 @@ static pgoff_t count_history_pages(struct address_space *mapping,
pgoff_t head;
rcu_read_lock();
- head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max);
+ head = page_cache_prev_hole(mapping, offset - 1, max);
rcu_read_unlock();
return offset - 1 - head;
@@ -401,6 +386,7 @@ ondemand_readahead(struct address_space *mapping,
unsigned long req_size)
{
unsigned long max = max_sane_readahead(ra->ra_pages);
+ pgoff_t prev_offset;
/*
* start of file
@@ -430,7 +416,7 @@ ondemand_readahead(struct address_space *mapping,
pgoff_t start;
rcu_read_lock();
- start = radix_tree_next_hole(&mapping->page_tree, offset+1,max);
+ start = page_cache_next_hole(mapping, offset + 1, max);
rcu_read_unlock();
if (!start || start - offset > max)
@@ -452,8 +438,11 @@ ondemand_readahead(struct address_space *mapping,
/*
* sequential cache miss
+ * trivial case: (offset - prev_offset) == 1
+ * unaligned reads: (offset - prev_offset) == 0
*/
- if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL)
+ prev_offset = (unsigned long long)ra->prev_pos >> PAGE_CACHE_SHIFT;
+ if (offset - prev_offset <= 1UL)
goto initial_readahead;
/*
diff --git a/mm/shmem.c b/mm/shmem.c
index 0da81aaeb4cc..ab05681f41cd 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -243,19 +243,17 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
pgoff_t index, void *expected, void *replacement)
{
void **pslot;
- void *item = NULL;
+ void *item;
VM_BUG_ON(!expected);
+ VM_BUG_ON(!replacement);
pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
- if (pslot)
- item = radix_tree_deref_slot_protected(pslot,
- &mapping->tree_lock);
+ if (!pslot)
+ return -ENOENT;
+ item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock);
if (item != expected)
return -ENOENT;
- if (replacement)
- radix_tree_replace_slot(pslot, replacement);
- else
- radix_tree_delete(&mapping->page_tree, index);
+ radix_tree_replace_slot(pslot, replacement);
return 0;
}
@@ -332,84 +330,20 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap)
}
/*
- * Like find_get_pages, but collecting swap entries as well as pages.
- */
-static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
- pgoff_t start, unsigned int nr_pages,
- struct page **pages, pgoff_t *indices)
-{
- void **slot;
- unsigned int ret = 0;
- struct radix_tree_iter iter;
-
- if (!nr_pages)
- return 0;
-
- rcu_read_lock();
-restart:
- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
- struct page *page;
-repeat:
- page = radix_tree_deref_slot(slot);
- if (unlikely(!page))
- continue;
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page))
- goto restart;
- /*
- * Otherwise, we must be storing a swap entry
- * here as an exceptional entry: so return it
- * without attempting to raise page count.
- */
- goto export;
- }
- if (!page_cache_get_speculative(page))
- goto repeat;
-
- /* Has the page moved? */
- if (unlikely(page != *slot)) {
- page_cache_release(page);
- goto repeat;
- }
-export:
- indices[ret] = iter.index;
- pages[ret] = page;
- if (++ret == nr_pages)
- break;
- }
- rcu_read_unlock();
- return ret;
-}
-
-/*
* Remove swap entry from radix tree, free the swap and its page cache.
*/
static int shmem_free_swap(struct address_space *mapping,
pgoff_t index, void *radswap)
{
- int error;
+ void *old;
spin_lock_irq(&mapping->tree_lock);
- error = shmem_radix_tree_replace(mapping, index, radswap, NULL);
+ old = radix_tree_delete_item(&mapping->page_tree, index, radswap);
spin_unlock_irq(&mapping->tree_lock);
- if (!error)
- free_swap_and_cache(radix_to_swp_entry(radswap));
- return error;
-}
-
-/*
- * Pagevec may contain swap entries, so shuffle up pages before releasing.
- */
-static void shmem_deswap_pagevec(struct pagevec *pvec)
-{
- int i, j;
-
- for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
- struct page *page = pvec->pages[i];
- if (!radix_tree_exceptional_entry(page))
- pvec->pages[j++] = page;
- }
- pvec->nr = j;
+ if (old != radswap)
+ return -ENOENT;
+ free_swap_and_cache(radix_to_swp_entry(radswap));
+ return 0;
}
/*
@@ -430,12 +364,12 @@ void shmem_unlock_mapping(struct address_space *mapping)
* Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
* has finished, if it hits a row of PAGEVEC_SIZE swap entries.
*/
- pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
- PAGEVEC_SIZE, pvec.pages, indices);
+ pvec.nr = find_get_entries(mapping, index,
+ PAGEVEC_SIZE, pvec.pages, indices);
if (!pvec.nr)
break;
index = indices[pvec.nr - 1] + 1;
- shmem_deswap_pagevec(&pvec);
+ pagevec_remove_exceptionals(&pvec);
check_move_unevictable_pages(pvec.pages, pvec.nr);
pagevec_release(&pvec);
cond_resched();
@@ -467,9 +401,9 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
pagevec_init(&pvec, 0);
index = start;
while (index < end) {
- pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE),
- pvec.pages, indices);
+ pvec.nr = find_get_entries(mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE),
+ pvec.pages, indices);
if (!pvec.nr)
break;
mem_cgroup_uncharge_start();
@@ -498,7 +432,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
}
unlock_page(page);
}
- shmem_deswap_pagevec(&pvec);
+ pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
mem_cgroup_uncharge_end();
cond_resched();
@@ -536,9 +470,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
index = start;
while (index < end) {
cond_resched();
- pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
+
+ pvec.nr = find_get_entries(mapping, index,
min(end - index, (pgoff_t)PAGEVEC_SIZE),
- pvec.pages, indices);
+ pvec.pages, indices);
if (!pvec.nr) {
/* If all gone or hole-punch or unfalloc, we're done */
if (index == start || end != -1)
@@ -581,7 +516,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
}
unlock_page(page);
}
- shmem_deswap_pagevec(&pvec);
+ pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
mem_cgroup_uncharge_end();
index++;
@@ -1090,7 +1025,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
return -EFBIG;
repeat:
swap.val = 0;
- page = find_lock_page(mapping, index);
+ page = find_lock_entry(mapping, index);
if (radix_tree_exceptional_entry(page)) {
swap = radix_to_swp_entry(page);
page = NULL;
@@ -1102,6 +1037,9 @@ repeat:
goto failed;
}
+ if (page && sgp == SGP_WRITE)
+ mark_page_accessed(page);
+
/* fallocated page? */
if (page && !PageUptodate(page)) {
if (sgp != SGP_READ)
@@ -1183,6 +1121,9 @@ repeat:
shmem_recalc_inode(inode);
spin_unlock(&info->lock);
+ if (sgp == SGP_WRITE)
+ mark_page_accessed(page);
+
delete_from_swap_cache(page);
set_page_dirty(page);
swap_free(swap);
@@ -1207,8 +1148,11 @@ repeat:
goto decused;
}
- SetPageSwapBacked(page);
+ __SetPageSwapBacked(page);
__set_page_locked(page);
+ if (sgp == SGP_WRITE)
+ init_page_accessed(page);
+
error = mem_cgroup_cache_charge(page, current->mm,
gfp & GFP_RECLAIM_MASK);
if (error)
@@ -1485,6 +1429,11 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
return inode;
}
+bool shmem_mapping(struct address_space *mapping)
+{
+ return mapping->backing_dev_info == &shmem_backing_dev_info;
+}
+
#ifdef CONFIG_TMPFS
static const struct inode_operations shmem_symlink_inode_operations;
static const struct inode_operations shmem_short_symlink_operations;
@@ -1797,7 +1746,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
pagevec_init(&pvec, 0);
pvec.nr = 1; /* start small: we may be there already */
while (!done) {
- pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
+ pvec.nr = find_get_entries(mapping, index,
pvec.nr, pvec.pages, indices);
if (!pvec.nr) {
if (whence == SEEK_DATA)
@@ -1824,7 +1773,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
break;
}
}
- shmem_deswap_pagevec(&pvec);
+ pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
pvec.nr = PAGEVEC_SIZE;
cond_resched();
diff --git a/mm/slab.c b/mm/slab.c
index 2580db062df9..eb4078c7d183 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -930,7 +930,8 @@ static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
{
if (unlikely(pfmemalloc_active)) {
/* Some pfmemalloc slabs exist, check if this is one */
- struct page *page = virt_to_head_page(objp);
+ struct slab *slabp = virt_to_slab(objp);
+ struct page *page = virt_to_head_page(slabp->s_mem);
if (PageSlabPfmemalloc(page))
set_obj_pfmemalloc(&objp);
}
@@ -1776,7 +1777,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
__SetPageSlab(page + i);
if (page->pfmemalloc)
- SetPageSlabPfmemalloc(page + i);
+ SetPageSlabPfmemalloc(page);
}
memcg_bind_pages(cachep, cachep->gfporder);
@@ -1809,9 +1810,10 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
else
sub_zone_page_state(page_zone(page),
NR_SLAB_UNRECLAIMABLE, nr_freed);
+
+ __ClearPageSlabPfmemalloc(page);
while (i--) {
BUG_ON(!PageSlab(page));
- __ClearPageSlabPfmemalloc(page);
__ClearPageSlab(page);
page++;
}
@@ -3220,7 +3222,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
retry_cpuset:
- cpuset_mems_cookie = get_mems_allowed();
+ cpuset_mems_cookie = read_mems_allowed_begin();
zonelist = node_zonelist(slab_node(), flags);
retry:
@@ -3276,7 +3278,7 @@ retry:
}
}
- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
+ if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
return obj;
}
diff --git a/mm/slub.c b/mm/slub.c
index a1646484f76e..90154580b304 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1664,7 +1664,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
return NULL;
do {
- cpuset_mems_cookie = get_mems_allowed();
+ cpuset_mems_cookie = read_mems_allowed_begin();
zonelist = node_zonelist(slab_node(), flags);
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
struct kmem_cache_node *n;
@@ -1676,19 +1676,17 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
object = get_partial_node(s, n, c, flags);
if (object) {
/*
- * Return the object even if
- * put_mems_allowed indicated that
- * the cpuset mems_allowed was
- * updated in parallel. It's a
- * harmless race between the alloc
- * and the cpuset update.
+ * Don't check read_mems_allowed_retry()
+ * here - if mems_allowed was updated in
+ * parallel, that was a harmless race
+ * between allocation and the cpuset
+ * update
*/
- put_mems_allowed(cpuset_mems_cookie);
return object;
}
}
}
- } while (!put_mems_allowed(cpuset_mems_cookie));
+ } while (read_mems_allowed_retry(cpuset_mems_cookie));
#endif
return NULL;
}
diff --git a/mm/swap.c b/mm/swap.c
index 05a695114c43..8ab73ba62a68 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -72,7 +72,7 @@ static void __page_cache_release(struct page *page)
static void __put_single_page(struct page *page)
{
__page_cache_release(page);
- free_hot_cold_page(page, 0);
+ free_hot_cold_page(page, false);
}
static void __put_compound_page(struct page *page)
@@ -441,7 +441,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec,
SetPageActive(page);
lru += LRU_ACTIVE;
add_page_to_lru_list(page, lruvec, lru);
- trace_mm_lru_activate(page, page_to_pfn(page));
+ trace_mm_lru_activate(page);
__count_vm_event(PGACTIVATE);
update_page_reclaim_stat(lruvec, file, 1);
@@ -554,12 +554,17 @@ void mark_page_accessed(struct page *page)
EXPORT_SYMBOL(mark_page_accessed);
/*
- * Queue the page for addition to the LRU via pagevec. The decision on whether
- * to add the page to the [in]active [file|anon] list is deferred until the
- * pagevec is drained. This gives a chance for the caller of __lru_cache_add()
- * have the page added to the active list using mark_page_accessed().
+ * Used to mark_page_accessed(page) that is not visible yet and when it is
+ * still safe to use non-atomic ops
*/
-void __lru_cache_add(struct page *page)
+void init_page_accessed(struct page *page)
+{
+ if (!PageReferenced(page))
+ __SetPageReferenced(page);
+}
+EXPORT_SYMBOL(init_page_accessed);
+
+static void __lru_cache_add(struct page *page)
{
struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
@@ -569,11 +574,34 @@ void __lru_cache_add(struct page *page)
pagevec_add(pvec, page);
put_locked_var(swapvec_lock, lru_add_pvec);
}
-EXPORT_SYMBOL(__lru_cache_add);
+
+/**
+ * lru_cache_add: add a page to the page lists
+ * @page: the page to add
+ */
+void lru_cache_add_anon(struct page *page)
+{
+ if (PageActive(page))
+ ClearPageActive(page);
+ __lru_cache_add(page);
+}
+
+void lru_cache_add_file(struct page *page)
+{
+ if (PageActive(page))
+ ClearPageActive(page);
+ __lru_cache_add(page);
+}
+EXPORT_SYMBOL(lru_cache_add_file);
/**
* lru_cache_add - add a page to a page list
* @page: the page to be added to the LRU.
+ *
+ * Queue the page for addition to the LRU via pagevec. The decision on whether
+ * to add the page to the [in]active [file|anon] list is deferred until the
+ * pagevec is drained. This gives a chance for the caller of lru_cache_add()
+ * have the page added to the active list using mark_page_accessed().
*/
void lru_cache_add(struct page *page)
{
@@ -785,7 +813,7 @@ void lru_add_drain_all(void)
* grabbed the page via the LRU. If it did, give up: shrink_inactive_list()
* will free it.
*/
-void release_pages(struct page **pages, int nr, int cold)
+void release_pages(struct page **pages, int nr, bool cold)
{
int i;
LIST_HEAD(pages_to_free);
@@ -826,7 +854,7 @@ void release_pages(struct page **pages, int nr, int cold)
}
/* Clear Active bit in case of parallel mark_page_accessed */
- ClearPageActive(page);
+ __ClearPageActive(page);
list_add(&page->lru, &pages_to_free);
}
@@ -908,7 +936,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
SetPageLRU(page);
add_page_to_lru_list(page, lruvec, lru);
update_page_reclaim_stat(lruvec, file, active);
- trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page));
+ trace_mm_lru_insertion(page, lru);
}
/*
@@ -922,6 +950,57 @@ void __pagevec_lru_add(struct pagevec *pvec)
EXPORT_SYMBOL(__pagevec_lru_add);
/**
+ * pagevec_lookup_entries - gang pagecache lookup
+ * @pvec: Where the resulting entries are placed
+ * @mapping: The address_space to search
+ * @start: The starting entry index
+ * @nr_entries: The maximum number of entries
+ * @indices: The cache indices corresponding to the entries in @pvec
+ *
+ * pagevec_lookup_entries() will search for and return a group of up
+ * to @nr_entries pages and shadow entries in the mapping. All
+ * entries are placed in @pvec. pagevec_lookup_entries() takes a
+ * reference against actual pages in @pvec.
+ *
+ * The search returns a group of mapping-contiguous entries with
+ * ascending indexes. There may be holes in the indices due to
+ * not-present entries.
+ *
+ * pagevec_lookup_entries() returns the number of entries which were
+ * found.
+ */
+unsigned pagevec_lookup_entries(struct pagevec *pvec,
+ struct address_space *mapping,
+ pgoff_t start, unsigned nr_pages,
+ pgoff_t *indices)
+{
+ pvec->nr = find_get_entries(mapping, start, nr_pages,
+ pvec->pages, indices);
+ return pagevec_count(pvec);
+}
+
+/**
+ * pagevec_remove_exceptionals - pagevec exceptionals pruning
+ * @pvec: The pagevec to prune
+ *
+ * pagevec_lookup_entries() fills both pages and exceptional radix
+ * tree entries into the pagevec. This function prunes all
+ * exceptionals from @pvec without leaving holes, so that it can be
+ * passed on to page-only pagevec operations.
+ */
+void pagevec_remove_exceptionals(struct pagevec *pvec)
+{
+ int i, j;
+
+ for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
+ struct page *page = pvec->pages[i];
+ if (!radix_tree_exceptional_entry(page))
+ pvec->pages[j++] = page;
+ }
+ pvec->nr = j;
+}
+
+/**
* pagevec_lookup - gang pagecache lookup
* @pvec: Where the resulting pages are placed
* @mapping: The address_space to search
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e6f15f8ca2af..4079edfff2cc 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -63,6 +63,8 @@ unsigned long total_swapcache_pages(void)
return ret;
}
+static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);
+
void show_swap_cache_info(void)
{
printk("%lu pages in swap cache\n", total_swapcache_pages());
@@ -268,7 +270,7 @@ void free_pages_and_swap_cache(struct page **pages, int nr)
for (i = 0; i < todo; i++)
free_swap_cache(pagep[i]);
- release_pages(pagep, todo, 0);
+ release_pages(pagep, todo, false);
pagep += todo;
nr -= todo;
}
@@ -286,8 +288,11 @@ struct page * lookup_swap_cache(swp_entry_t entry)
page = find_get_page(swap_address_space(entry), entry.val);
- if (page)
+ if (page) {
INC_CACHE_INFO(find_success);
+ if (TestClearPageReadahead(page))
+ atomic_inc(&swapin_readahead_hits);
+ }
INC_CACHE_INFO(find_total);
return page;
@@ -389,6 +394,50 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
return found_page;
}
+static unsigned long swapin_nr_pages(unsigned long offset)
+{
+ static unsigned long prev_offset;
+ unsigned int pages, max_pages, last_ra;
+ static atomic_t last_readahead_pages;
+
+ max_pages = 1 << ACCESS_ONCE(page_cluster);
+ if (max_pages <= 1)
+ return 1;
+
+ /*
+ * This heuristic has been found to work well on both sequential and
+ * random loads, swapping to hard disk or to SSD: please don't ask
+ * what the "+ 2" means, it just happens to work well, that's all.
+ */
+ pages = atomic_xchg(&swapin_readahead_hits, 0) + 2;
+ if (pages == 2) {
+ /*
+ * We can have no readahead hits to judge by: but must not get
+ * stuck here forever, so check for an adjacent offset instead
+ * (and don't even bother to check whether swap type is same).
+ */
+ if (offset != prev_offset + 1 && offset != prev_offset - 1)
+ pages = 1;
+ prev_offset = offset;
+ } else {
+ unsigned int roundup = 4;
+ while (roundup < pages)
+ roundup <<= 1;
+ pages = roundup;
+ }
+
+ if (pages > max_pages)
+ pages = max_pages;
+
+ /* Don't shrink readahead too fast */
+ last_ra = atomic_read(&last_readahead_pages) / 2;
+ if (pages < last_ra)
+ pages = last_ra;
+ atomic_set(&last_readahead_pages, pages);
+
+ return pages;
+}
+
/**
* swapin_readahead - swap in pages in hope we need them soon
* @entry: swap entry of this memory
@@ -412,11 +461,16 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr)
{
struct page *page;
- unsigned long offset = swp_offset(entry);
+ unsigned long entry_offset = swp_offset(entry);
+ unsigned long offset = entry_offset;
unsigned long start_offset, end_offset;
- unsigned long mask = (1UL << page_cluster) - 1;
+ unsigned long mask;
struct blk_plug plug;
+ mask = swapin_nr_pages(offset) - 1;
+ if (!mask)
+ goto skip;
+
/* Read a page_cluster sized and aligned cluster around offset. */
start_offset = offset & ~mask;
end_offset = offset | mask;
@@ -430,10 +484,13 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
gfp_mask, vma, addr);
if (!page)
continue;
+ if (offset != entry_offset)
+ SetPageReadahead(page);
page_cache_release(page);
}
blk_finish_plug(&plug);
lru_add_drain(); /* Push any new pages onto the LRU now */
+skip:
return read_swap_cache_async(entry, gfp_mask, vma, addr);
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 0ec2eaf3ccfd..660b9c0e2e40 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -51,14 +51,32 @@ atomic_long_t nr_swap_pages;
/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
long total_swap_pages;
static int least_priority;
-static atomic_t highest_priority_index = ATOMIC_INIT(-1);
static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";
-struct swap_list_t swap_list = {-1, -1};
+/*
+ * all active swap_info_structs
+ * protected with swap_lock, and ordered by priority.
+ */
+PLIST_HEAD(swap_active_head);
+
+/*
+ * all available (active, not full) swap_info_structs
+ * protected with swap_avail_lock, ordered by priority.
+ * This is used by get_swap_page() instead of swap_active_head
+ * because swap_active_head includes all swap_info_structs,
+ * but get_swap_page() doesn't need to look at full ones.
+ * This uses its own lock instead of swap_lock because when a
+ * swap_info_struct changes between not-full/full, it needs to
+ * add/remove itself to/from this list, but the swap_info_struct->lock
+ * is held and the locking order requires swap_lock to be taken
+ * before any swap_info_struct->lock.
+ */
+static PLIST_HEAD(swap_avail_head);
+static DEFINE_SPINLOCK(swap_avail_lock);
struct swap_info_struct *swap_info[MAX_SWAPFILES];
@@ -591,6 +609,9 @@ checks:
if (si->inuse_pages == si->pages) {
si->lowest_bit = si->max;
si->highest_bit = 0;
+ spin_lock(&swap_avail_lock);
+ plist_del(&si->avail_list, &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
}
si->swap_map[offset] = usage;
inc_cluster_info_page(si, si->cluster_info, offset);
@@ -639,71 +660,65 @@ no_page:
swp_entry_t get_swap_page(void)
{
- struct swap_info_struct *si;
+ struct swap_info_struct *si, *next;
pgoff_t offset;
- int type, next;
- int wrapped = 0;
- int hp_index;
- spin_lock(&swap_lock);
if (atomic_long_read(&nr_swap_pages) <= 0)
goto noswap;
atomic_long_dec(&nr_swap_pages);
- for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
- hp_index = atomic_xchg(&highest_priority_index, -1);
- /*
- * highest_priority_index records current highest priority swap
- * type which just frees swap entries. If its priority is
- * higher than that of swap_list.next swap type, we use it. It
- * isn't protected by swap_lock, so it can be an invalid value
- * if the corresponding swap type is swapoff. We double check
- * the flags here. It's even possible the swap type is swapoff
- * and swapon again and its priority is changed. In such rare
- * case, low prority swap type might be used, but eventually
- * high priority swap will be used after several rounds of
- * swap.
- */
- if (hp_index != -1 && hp_index != type &&
- swap_info[type]->prio < swap_info[hp_index]->prio &&
- (swap_info[hp_index]->flags & SWP_WRITEOK)) {
- type = hp_index;
- swap_list.next = type;
- }
-
- si = swap_info[type];
- next = si->next;
- if (next < 0 ||
- (!wrapped && si->prio != swap_info[next]->prio)) {
- next = swap_list.head;
- wrapped++;
- }
+ spin_lock(&swap_avail_lock);
+start_over:
+ plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
+ /* requeue si to after same-priority siblings */
+ plist_requeue(&si->avail_list, &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
spin_lock(&si->lock);
- if (!si->highest_bit) {
+ if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
+ spin_lock(&swap_avail_lock);
+ if (plist_node_empty(&si->avail_list)) {
+ spin_unlock(&si->lock);
+ goto nextsi;
+ }
+ WARN(!si->highest_bit,
+ "swap_info %d in list but !highest_bit\n",
+ si->type);
+ WARN(!(si->flags & SWP_WRITEOK),
+ "swap_info %d in list but !SWP_WRITEOK\n",
+ si->type);
+ plist_del(&si->avail_list, &swap_avail_head);
spin_unlock(&si->lock);
- continue;
+ goto nextsi;
}
- if (!(si->flags & SWP_WRITEOK)) {
- spin_unlock(&si->lock);
- continue;
- }
-
- swap_list.next = next;
- spin_unlock(&swap_lock);
/* This is called for allocating swap entry for cache */
offset = scan_swap_map(si, SWAP_HAS_CACHE);
spin_unlock(&si->lock);
if (offset)
- return swp_entry(type, offset);
- spin_lock(&swap_lock);
- next = swap_list.next;
+ return swp_entry(si->type, offset);
+ pr_debug("scan_swap_map of si %d failed to find offset\n",
+ si->type);
+ spin_lock(&swap_avail_lock);
+nextsi:
+ /*
+ * if we got here, it's likely that si was almost full before,
+ * and since scan_swap_map() can drop the si->lock, multiple
+ * callers probably all tried to get a page from the same si
+ * and it filled up before we could get one; or, the si filled
+ * up between us dropping swap_avail_lock and taking si->lock.
+ * Since we dropped the swap_avail_lock, the swap_avail_head
+ * list may have been modified; so if next is still in the
+ * swap_avail_head list then try it, otherwise start over.
+ */
+ if (plist_node_empty(&next->avail_list))
+ goto start_over;
}
+ spin_unlock(&swap_avail_lock);
+
atomic_long_inc(&nr_swap_pages);
noswap:
- spin_unlock(&swap_lock);
return (swp_entry_t) {0};
}
@@ -765,27 +780,6 @@ out:
return NULL;
}
-/*
- * This swap type frees swap entry, check if it is the highest priority swap
- * type which just frees swap entry. get_swap_page() uses
- * highest_priority_index to search highest priority swap type. The
- * swap_info_struct.lock can't protect us if there are multiple swap types
- * active, so we use atomic_cmpxchg.
- */
-static void set_highest_priority_index(int type)
-{
- int old_hp_index, new_hp_index;
-
- do {
- old_hp_index = atomic_read(&highest_priority_index);
- if (old_hp_index != -1 &&
- swap_info[old_hp_index]->prio >= swap_info[type]->prio)
- break;
- new_hp_index = type;
- } while (atomic_cmpxchg(&highest_priority_index,
- old_hp_index, new_hp_index) != old_hp_index);
-}
-
static unsigned char swap_entry_free(struct swap_info_struct *p,
swp_entry_t entry, unsigned char usage)
{
@@ -827,9 +821,18 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
dec_cluster_info_page(p, p->cluster_info, offset);
if (offset < p->lowest_bit)
p->lowest_bit = offset;
- if (offset > p->highest_bit)
+ if (offset > p->highest_bit) {
+ bool was_full = !p->highest_bit;
p->highest_bit = offset;
- set_highest_priority_index(p->type);
+ if (was_full && (p->flags & SWP_WRITEOK)) {
+ spin_lock(&swap_avail_lock);
+ WARN_ON(!plist_node_empty(&p->avail_list));
+ if (plist_node_empty(&p->avail_list))
+ plist_add(&p->avail_list,
+ &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
+ }
+ }
atomic_long_inc(&nr_swap_pages);
p->inuse_pages--;
frontswap_invalidate_page(p->type, offset);
@@ -1764,30 +1767,37 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
unsigned char *swap_map,
struct swap_cluster_info *cluster_info)
{
- int i, prev;
-
if (prio >= 0)
p->prio = prio;
else
p->prio = --least_priority;
+ /*
+ * the plist prio is negated because plist ordering is
+ * low-to-high, while swap ordering is high-to-low
+ */
+ p->list.prio = -p->prio;
+ p->avail_list.prio = -p->prio;
p->swap_map = swap_map;
p->cluster_info = cluster_info;
p->flags |= SWP_WRITEOK;
atomic_long_add(p->pages, &nr_swap_pages);
total_swap_pages += p->pages;
- /* insert swap space into swap_list: */
- prev = -1;
- for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
- if (p->prio >= swap_info[i]->prio)
- break;
- prev = i;
- }
- p->next = i;
- if (prev < 0)
- swap_list.head = swap_list.next = p->type;
- else
- swap_info[prev]->next = p->type;
+ assert_spin_locked(&swap_lock);
+ /*
+ * both lists are plists, and thus priority ordered.
+ * swap_active_head needs to be priority ordered for swapoff(),
+ * which on removal of any swap_info_struct with an auto-assigned
+ * (i.e. negative) priority increments the auto-assigned priority
+ * of any lower-priority swap_info_structs.
+ * swap_avail_head needs to be priority ordered for get_swap_page(),
+ * which allocates swap pages from the highest available priority
+ * swap_info_struct.
+ */
+ plist_add(&p->list, &swap_active_head);
+ spin_lock(&swap_avail_lock);
+ plist_add(&p->avail_list, &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
}
static void enable_swap_info(struct swap_info_struct *p, int prio,
@@ -1822,8 +1832,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
struct address_space *mapping;
struct inode *inode;
struct filename *pathname;
- int i, type, prev;
- int err;
+ int err, found = 0;
unsigned int old_block_size;
if (!capable(CAP_SYS_ADMIN))
@@ -1841,17 +1850,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
goto out;
mapping = victim->f_mapping;
- prev = -1;
spin_lock(&swap_lock);
- for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
- p = swap_info[type];
+ plist_for_each_entry(p, &swap_active_head, list) {
if (p->flags & SWP_WRITEOK) {
- if (p->swap_file->f_mapping == mapping)
+ if (p->swap_file->f_mapping == mapping) {
+ found = 1;
break;
+ }
}
- prev = type;
}
- if (type < 0) {
+ if (!found) {
err = -EINVAL;
spin_unlock(&swap_lock);
goto out_dput;
@@ -1863,20 +1871,21 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_unlock(&swap_lock);
goto out_dput;
}
- if (prev < 0)
- swap_list.head = p->next;
- else
- swap_info[prev]->next = p->next;
- if (type == swap_list.next) {
- /* just pick something that's safe... */
- swap_list.next = swap_list.head;
- }
+ spin_lock(&swap_avail_lock);
+ plist_del(&p->avail_list, &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
spin_lock(&p->lock);
if (p->prio < 0) {
- for (i = p->next; i >= 0; i = swap_info[i]->next)
- swap_info[i]->prio = p->prio--;
+ struct swap_info_struct *si = p;
+
+ plist_for_each_entry_continue(si, &swap_active_head, list) {
+ si->prio++;
+ si->list.prio--;
+ si->avail_list.prio--;
+ }
least_priority++;
}
+ plist_del(&p->list, &swap_active_head);
atomic_long_sub(p->pages, &nr_swap_pages);
total_swap_pages -= p->pages;
p->flags &= ~SWP_WRITEOK;
@@ -1884,7 +1893,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_unlock(&swap_lock);
set_current_oom_origin();
- err = try_to_unuse(type, false, 0); /* force all pages to be unused */
+ err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
clear_current_oom_origin();
if (err) {
@@ -1926,7 +1935,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
frontswap_map_set(p, NULL);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
- frontswap_invalidate_area(type);
+ frontswap_invalidate_area(p->type);
mutex_unlock(&swapon_mutex);
free_percpu(p->percpu_cluster);
p->percpu_cluster = NULL;
@@ -1934,7 +1943,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
vfree(cluster_info);
vfree(frontswap_map);
/* Destroy swap account informatin */
- swap_cgroup_swapoff(type);
+ swap_cgroup_swapoff(p->type);
inode = mapping->host;
if (S_ISBLK(inode->i_mode)) {
@@ -2141,8 +2150,9 @@ static struct swap_info_struct *alloc_swap_info(void)
*/
}
INIT_LIST_HEAD(&p->first_swap_extent.list);
+ plist_node_init(&p->list, 0);
+ plist_node_init(&p->avail_list, 0);
p->flags = SWP_USED;
- p->next = -1;
spin_unlock(&swap_lock);
spin_lock_init(&p->lock);
diff --git a/mm/truncate.c b/mm/truncate.c
index 353b683afd6e..2e84fe59190b 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -22,6 +22,22 @@
#include <linux/cleancache.h>
#include "internal.h"
+static void clear_exceptional_entry(struct address_space *mapping,
+ pgoff_t index, void *entry)
+{
+ /* Handled by shmem itself */
+ if (shmem_mapping(mapping))
+ return;
+
+ spin_lock_irq(&mapping->tree_lock);
+ /*
+ * Regular page slots are stabilized by the page lock even
+ * without the tree itself locked. These unlocked entries
+ * need verification under the tree lock.
+ */
+ radix_tree_delete_item(&mapping->page_tree, index, entry);
+ spin_unlock_irq(&mapping->tree_lock);
+}
/**
* do_invalidatepage - invalidate part or all of a page
@@ -208,6 +224,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
unsigned int partial_start; /* inclusive */
unsigned int partial_end; /* exclusive */
struct pagevec pvec;
+ pgoff_t indices[PAGEVEC_SIZE];
pgoff_t index;
int i;
@@ -238,17 +255,23 @@ void truncate_inode_pages_range(struct address_space *mapping,
pagevec_init(&pvec, 0);
index = start;
- while (index < end && pagevec_lookup(&pvec, mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
+ while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE),
+ indices)) {
mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
/* We rely upon deletion not changing page->index */
- index = page->index;
+ index = indices[i];
if (index >= end)
break;
+ if (radix_tree_exceptional_entry(page)) {
+ clear_exceptional_entry(mapping, index, page);
+ continue;
+ }
+
if (!trylock_page(page))
continue;
WARN_ON(page->index != index);
@@ -259,6 +282,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
truncate_inode_page(mapping, page);
unlock_page(page);
}
+ pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
mem_cgroup_uncharge_end();
cond_resched();
@@ -307,14 +331,16 @@ void truncate_inode_pages_range(struct address_space *mapping,
index = start;
for ( ; ; ) {
cond_resched();
- if (!pagevec_lookup(&pvec, mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
+ if (!pagevec_lookup_entries(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE),
+ indices)) {
if (index == start)
break;
index = start;
continue;
}
- if (index == start && pvec.pages[0]->index >= end) {
+ if (index == start && indices[0] >= end) {
+ pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
break;
}
@@ -323,16 +349,22 @@ void truncate_inode_pages_range(struct address_space *mapping,
struct page *page = pvec.pages[i];
/* We rely upon deletion not changing page->index */
- index = page->index;
+ index = indices[i];
if (index >= end)
break;
+ if (radix_tree_exceptional_entry(page)) {
+ clear_exceptional_entry(mapping, index, page);
+ continue;
+ }
+
lock_page(page);
WARN_ON(page->index != index);
wait_on_page_writeback(page);
truncate_inode_page(mapping, page);
unlock_page(page);
}
+ pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
mem_cgroup_uncharge_end();
index++;
@@ -375,6 +407,7 @@ EXPORT_SYMBOL(truncate_inode_pages);
unsigned long invalidate_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
+ pgoff_t indices[PAGEVEC_SIZE];
struct pagevec pvec;
pgoff_t index = start;
unsigned long ret;
@@ -390,17 +423,23 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
*/
pagevec_init(&pvec, 0);
- while (index <= end && pagevec_lookup(&pvec, mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+ while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
+ indices)) {
mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
/* We rely upon deletion not changing page->index */
- index = page->index;
+ index = indices[i];
if (index > end)
break;
+ if (radix_tree_exceptional_entry(page)) {
+ clear_exceptional_entry(mapping, index, page);
+ continue;
+ }
+
if (!trylock_page(page))
continue;
WARN_ON(page->index != index);
@@ -414,6 +453,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
deactivate_page(page);
count += ret;
}
+ pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
mem_cgroup_uncharge_end();
cond_resched();
@@ -481,6 +521,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page)
int invalidate_inode_pages2_range(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
+ pgoff_t indices[PAGEVEC_SIZE];
struct pagevec pvec;
pgoff_t index;
int i;
@@ -491,17 +532,23 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
cleancache_invalidate_inode(mapping);
pagevec_init(&pvec, 0);
index = start;
- while (index <= end && pagevec_lookup(&pvec, mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+ while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
+ indices)) {
mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
/* We rely upon deletion not changing page->index */
- index = page->index;
+ index = indices[i];
if (index > end)
break;
+ if (radix_tree_exceptional_entry(page)) {
+ clear_exceptional_entry(mapping, index, page);
+ continue;
+ }
+
lock_page(page);
WARN_ON(page->index != index);
if (page->mapping != mapping) {
@@ -539,6 +586,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
ret = ret2;
unlock_page(page);
}
+ pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
mem_cgroup_uncharge_end();
cond_resched();
diff --git a/mm/vmacache.c b/mm/vmacache.c
new file mode 100644
index 000000000000..1037a3bab505
--- /dev/null
+++ b/mm/vmacache.c
@@ -0,0 +1,114 @@
+/*
+ * Copyright (C) 2014 Davidlohr Bueso.
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmacache.h>
+
+/*
+ * Flush vma caches for threads that share a given mm.
+ *
+ * The operation is safe because the caller holds the mmap_sem
+ * exclusively and other threads accessing the vma cache will
+ * have mmap_sem held at least for read, so no extra locking
+ * is required to maintain the vma cache.
+ */
+void vmacache_flush_all(struct mm_struct *mm)
+{
+ struct task_struct *g, *p;
+
+ rcu_read_lock();
+ for_each_process_thread(g, p) {
+ /*
+ * Only flush the vmacache pointers as the
+ * mm seqnum is already set and curr's will
+ * be set upon invalidation when the next
+ * lookup is done.
+ */
+ if (mm == p->mm)
+ vmacache_flush(p);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * This task may be accessing a foreign mm via (for example)
+ * get_user_pages()->find_vma(). The vmacache is task-local and this
+ * task's vmacache pertains to a different mm (ie, its own). There is
+ * nothing we can do here.
+ *
+ * Also handle the case where a kernel thread has adopted this mm via use_mm().
+ * That kernel thread's vmacache is not applicable to this mm.
+ */
+static bool vmacache_valid_mm(struct mm_struct *mm)
+{
+ return current->mm == mm && !(current->flags & PF_KTHREAD);
+}
+
+void vmacache_update(unsigned long addr, struct vm_area_struct *newvma)
+{
+ if (vmacache_valid_mm(newvma->vm_mm))
+ current->vmacache[VMACACHE_HASH(addr)] = newvma;
+}
+
+static bool vmacache_valid(struct mm_struct *mm)
+{
+ struct task_struct *curr;
+
+ if (!vmacache_valid_mm(mm))
+ return false;
+
+ curr = current;
+ if (mm->vmacache_seqnum != curr->vmacache_seqnum) {
+ /*
+ * First attempt will always be invalid, initialize
+ * the new cache for this task here.
+ */
+ curr->vmacache_seqnum = mm->vmacache_seqnum;
+ vmacache_flush(curr);
+ return false;
+ }
+ return true;
+}
+
+struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
+{
+ int i;
+
+ if (!vmacache_valid(mm))
+ return NULL;
+
+ for (i = 0; i < VMACACHE_SIZE; i++) {
+ struct vm_area_struct *vma = current->vmacache[i];
+
+ if (!vma)
+ continue;
+ if (WARN_ON_ONCE(vma->vm_mm != mm))
+ break;
+ if (vma->vm_start <= addr && vma->vm_end > addr)
+ return vma;
+ }
+
+ return NULL;
+}
+
+#ifndef CONFIG_MMU
+struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ int i;
+
+ if (!vmacache_valid(mm))
+ return NULL;
+
+ for (i = 0; i < VMACACHE_SIZE; i++) {
+ struct vm_area_struct *vma = current->vmacache[i];
+
+ if (vma && vma->vm_start == start && vma->vm_end == end)
+ return vma;
+ }
+
+ return NULL;
+}
+#endif
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 4283488e9b1c..83e33b8c2178 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2688,14 +2688,14 @@ void get_vmalloc_info(struct vmalloc_info *vmi)
prev_end = VMALLOC_START;
- spin_lock(&vmap_area_lock);
+ rcu_read_lock();
if (list_empty(&vmap_area_list)) {
vmi->largest_chunk = VMALLOC_TOTAL;
goto out;
}
- list_for_each_entry(va, &vmap_area_list, list) {
+ list_for_each_entry_rcu(va, &vmap_area_list, list) {
unsigned long addr = va->va_start;
/*
@@ -2722,7 +2722,7 @@ void get_vmalloc_info(struct vmalloc_info *vmi)
vmi->largest_chunk = VMALLOC_END - prev_end;
out:
- spin_unlock(&vmap_area_lock);
+ rcu_read_unlock();
}
#endif
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5ad29b2925a0..5461d02ea718 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -163,7 +163,8 @@ static unsigned long zone_reclaimable_pages(struct zone *zone)
bool zone_reclaimable(struct zone *zone)
{
- return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
+ return zone_page_state(zone, NR_PAGES_SCANNED) <
+ zone_reclaimable_pages(zone) * 6;
}
static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
@@ -224,15 +225,15 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
unsigned long freed = 0;
unsigned long long delta;
long total_scan;
- long max_pass;
+ long freeable;
long nr;
long new_nr;
int nid = shrinkctl->nid;
long batch_size = shrinker->batch ? shrinker->batch
: SHRINK_BATCH;
- max_pass = shrinker->count_objects(shrinker, shrinkctl);
- if (max_pass == 0)
+ freeable = shrinker->count_objects(shrinker, shrinkctl);
+ if (freeable == 0)
return 0;
/*
@@ -244,14 +245,14 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
total_scan = nr;
delta = (4 * nr_pages_scanned) / shrinker->seeks;
- delta *= max_pass;
+ delta *= freeable;
do_div(delta, lru_pages + 1);
total_scan += delta;
if (total_scan < 0) {
printk(KERN_ERR
"shrink_slab: %pF negative objects to delete nr=%ld\n",
shrinker->scan_objects, total_scan);
- total_scan = max_pass;
+ total_scan = freeable;
}
/*
@@ -260,38 +261,55 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
* shrinkers to return -1 all the time. This results in a large
* nr being built up so when a shrink that can do some work
* comes along it empties the entire cache due to nr >>>
- * max_pass. This is bad for sustaining a working set in
+ * freeable. This is bad for sustaining a working set in
* memory.
*
* Hence only allow the shrinker to scan the entire cache when
* a large delta change is calculated directly.
*/
- if (delta < max_pass / 4)
- total_scan = min(total_scan, max_pass / 2);
+ if (delta < freeable / 4)
+ total_scan = min(total_scan, freeable / 2);
/*
* Avoid risking looping forever due to too large nr value:
* never try to free more than twice the estimate number of
* freeable entries.
*/
- if (total_scan > max_pass * 2)
- total_scan = max_pass * 2;
+ if (total_scan > freeable * 2)
+ total_scan = freeable * 2;
trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
nr_pages_scanned, lru_pages,
- max_pass, delta, total_scan);
+ freeable, delta, total_scan);
- while (total_scan >= batch_size) {
+ /*
+ * Normally, we should not scan less than batch_size objects in one
+ * pass to avoid too frequent shrinker calls, but if the slab has less
+ * than batch_size objects in total and we are really tight on memory,
+ * we will try to reclaim all available objects, otherwise we can end
+ * up failing allocations although there are plenty of reclaimable
+ * objects spread over several slabs with usage less than the
+ * batch_size.
+ *
+ * We detect the "tight on memory" situations by looking at the total
+ * number of objects we want to scan (total_scan). If it is greater
+ * than the total number of objects on slab (freeable), we must be
+ * scanning at high prio and therefore should try to reclaim as much as
+ * possible.
+ */
+ while (total_scan >= batch_size ||
+ total_scan >= freeable) {
unsigned long ret;
+ unsigned long nr_to_scan = min(batch_size, total_scan);
- shrinkctl->nr_to_scan = batch_size;
+ shrinkctl->nr_to_scan = nr_to_scan;
ret = shrinker->scan_objects(shrinker, shrinkctl);
if (ret == SHRINK_STOP)
break;
freed += ret;
- count_vm_events(SLABS_SCANNED, batch_size);
- total_scan -= batch_size;
+ count_vm_events(SLABS_SCANNED, nr_to_scan);
+ total_scan -= nr_to_scan;
cond_resched();
}
@@ -352,16 +370,17 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl,
}
list_for_each_entry(shrinker, &shrinker_list, list) {
- for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
- if (!node_online(shrinkctl->nid))
- continue;
-
- if (!(shrinker->flags & SHRINKER_NUMA_AWARE) &&
- (shrinkctl->nid != 0))
- break;
-
+ if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) {
+ shrinkctl->nid = 0;
freed += shrink_slab_node(shrinkctl, shrinker,
- nr_pages_scanned, lru_pages);
+ nr_pages_scanned, lru_pages);
+ continue;
+ }
+
+ for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
+ if (node_online(shrinkctl->nid))
+ freed += shrink_slab_node(shrinkctl, shrinker,
+ nr_pages_scanned, lru_pages);
}
}
@@ -1089,7 +1108,7 @@ keep:
VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
}
- free_hot_cold_page_list(&free_pages, 1);
+ free_hot_cold_page_list(&free_pages, true);
list_splice(&ret_pages, page_list);
count_vm_events(PGACTIVATE, pgactivate);
@@ -1126,7 +1145,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
TTU_UNMAP|TTU_IGNORE_ACCESS,
&dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
list_splice(&clean_pages, page_list);
- __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
+ mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
return ret;
}
@@ -1452,7 +1471,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
if (global_reclaim(sc)) {
- zone->pages_scanned += nr_scanned;
+ __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
if (current_is_kswapd())
__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
else
@@ -1487,7 +1506,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
spin_unlock_irq(&zone->lru_lock);
- free_hot_cold_page_list(&page_list, 1);
+ free_hot_cold_page_list(&page_list, true);
/*
* If reclaim is isolating dirty pages under writeback, it implies
@@ -1641,7 +1660,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
&nr_scanned, sc, isolate_mode, lru);
if (global_reclaim(sc))
- zone->pages_scanned += nr_scanned;
+ __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
reclaim_stat->recent_scanned[file] += nr_taken;
@@ -1707,7 +1726,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&zone->lru_lock);
- free_hot_cold_page_list(&l_hold, 1);
+ free_hot_cold_page_list(&l_hold, true);
}
#ifdef CONFIG_SWAP
@@ -1829,7 +1848,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
struct zone *zone = lruvec_zone(lruvec);
unsigned long anon_prio, file_prio;
enum scan_balance scan_balance;
- unsigned long anon, file, free;
+ unsigned long anon, file;
bool force_scan = false;
unsigned long ap, fp;
enum lru_list lru;
@@ -1877,11 +1896,6 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
goto out;
}
- anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
- get_lru_size(lruvec, LRU_INACTIVE_ANON);
- file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
- get_lru_size(lruvec, LRU_INACTIVE_FILE);
-
/*
* If it's foreseeable that reclaiming the file cache won't be
* enough to get the zone back into a desirable shape, we have
@@ -1889,8 +1903,14 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
* thrashing - remaining file pages alone.
*/
if (global_reclaim(sc)) {
- free = zone_page_state(zone, NR_FREE_PAGES);
- if (unlikely(file + free <= high_wmark_pages(zone))) {
+ unsigned long zonefile;
+ unsigned long zonefree;
+
+ zonefree = zone_page_state(zone, NR_FREE_PAGES);
+ zonefile = zone_page_state(zone, NR_ACTIVE_FILE) +
+ zone_page_state(zone, NR_INACTIVE_FILE);
+
+ if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) {
scan_balance = SCAN_ANON;
goto out;
}
@@ -1925,6 +1945,12 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
*
* anon in [0], file in [1]
*/
+
+ anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
+ get_lru_size(lruvec, LRU_INACTIVE_ANON);
+ file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
+ get_lru_size(lruvec, LRU_INACTIVE_FILE);
+
spin_lock_irq(&zone->lru_lock);
if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
reclaim_stat->recent_scanned[0] /= 2;
@@ -2000,13 +2026,27 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
unsigned long nr_reclaimed = 0;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
struct blk_plug plug;
- bool scan_adjusted = false;
+ bool scan_adjusted;
get_scan_count(lruvec, sc, nr);
/* Record the original scan target for proportional adjustments later */
memcpy(targets, nr, sizeof(nr));
+ /*
+ * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
+ * event that can occur when there is little memory pressure e.g.
+ * multiple streaming readers/writers. Hence, we do not abort scanning
+ * when the requested number of pages are reclaimed when scanning at
+ * DEF_PRIORITY on the assumption that the fact we are direct
+ * reclaiming implies that kswapd is not keeping up and it is best to
+ * do a batch of work at once. For memcg reclaim one check is made to
+ * abort proportional reclaim if either the file or anon lru has already
+ * dropped to zero at the first pass.
+ */
+ scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
+ sc->priority == DEF_PRIORITY);
+
blk_start_plug(&plug);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
nr[LRU_INACTIVE_FILE]) {
@@ -2027,17 +2067,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
continue;
/*
- * For global direct reclaim, reclaim only the number of pages
- * requested. Less care is taken to scan proportionally as it
- * is more important to minimise direct reclaim stall latency
- * than it is to properly age the LRU lists.
- */
- if (global_reclaim(sc) && !current_is_kswapd())
- break;
-
- /*
* For kswapd and memcg, reclaim at least the number of pages
- * requested. Ensure that the anon and file LRUs shrink
+ * requested. Ensure that the anon and file LRUs are scanned
* proportionally what was requested by get_scan_count(). We
* stop reclaiming one LRU and reduce the amount scanning
* proportional to the original scan target.
@@ -2045,6 +2076,15 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
+ /*
+ * It's just vindictive to attack the larger once the smaller
+ * has gone to zero. And given the way we stop scanning the
+ * smaller below, this makes sure that we only make one nudge
+ * towards proportionality once we've got nr_to_reclaim.
+ */
+ if (!nr_file || !nr_anon)
+ break;
+
if (nr_file > nr_anon) {
unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
targets[LRU_ACTIVE_ANON] + 1;
@@ -2406,8 +2446,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
unsigned long lru_pages = 0;
nodes_clear(shrink->nodes_to_scan);
- for_each_zone_zonelist(zone, z, zonelist,
- gfp_zone(sc->gfp_mask)) {
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ gfp_zone(sc->gfp_mask), sc->nodemask) {
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index efea337fb16e..9f16fd3ee575 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -200,7 +200,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat,
continue;
threshold = (*calculate_pressure)(zone);
- for_each_possible_cpu(cpu)
+ for_each_online_cpu(cpu)
per_cpu_ptr(zone->pageset, cpu)->stat_threshold
= threshold;
}
@@ -767,6 +767,7 @@ const char * const vmstat_text[] = {
"nr_shmem",
"nr_dirtied",
"nr_written",
+ "nr_pages_scanned",
#ifdef CONFIG_NUMA
"numa_hit",
@@ -857,12 +858,14 @@ const char * const vmstat_text[] = {
"thp_zero_page_alloc",
"thp_zero_page_alloc_failed",
#endif
+#ifdef CONFIG_DEBUG_TLBFLUSH
#ifdef CONFIG_SMP
"nr_tlb_remote_flush",
"nr_tlb_remote_flush_received",
-#endif
+#endif /* CONFIG_SMP */
"nr_tlb_local_flush_all",
"nr_tlb_local_flush_one",
+#endif /* CONFIG_DEBUG_TLBFLUSH */
#endif /* CONFIG_VM_EVENTS_COUNTERS */
};
@@ -1059,7 +1062,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
min_wmark_pages(zone),
low_wmark_pages(zone),
high_wmark_pages(zone),
- zone->pages_scanned,
+ zone_page_state(zone, NR_PAGES_SCANNED),
zone->spanned_pages,
zone->present_pages,
zone->managed_pages);
@@ -1069,10 +1072,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
zone_page_state(zone, i));
seq_printf(m,
- "\n protection: (%lu",
+ "\n protection: (%ld",
zone->lowmem_reserve[0]);
for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
- seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
+ seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
seq_printf(m,
")"
"\n pagesets");