aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2018-09-11 14:12:08 +1000
committerStephen Rothwell <sfr@canb.auug.org.au>2018-09-11 14:12:08 +1000
commit2ba32c008bb7c184aebcf7cadbec8fa6ad189fb7 (patch)
treefbbec2d2f7e4e525a0efc62a6859326b8ebff79d
parenta03fdfa1177d53b927a4f810cae55178dab9ae55 (diff)
parentcc021c7b2954e853fb45634909be67524243ffd2 (diff)
Merge branch 'akpm-current/current'
-rw-r--r--Documentation/ABI/testing/sysfs-class-bdi8
-rw-r--r--Documentation/x86/pat.txt4
-rw-r--r--arch/alpha/Kconfig2
-rw-r--r--arch/alpha/kernel/core_irongate.c4
-rw-r--r--arch/alpha/kernel/setup.c98
-rw-r--r--arch/alpha/mm/numa.c113
-rw-r--r--arch/arm/Kconfig4
-rw-r--r--arch/arm/include/asm/page.h2
-rw-r--r--arch/arm64/Kconfig4
-rw-r--r--arch/hexagon/Kconfig3
-rw-r--r--arch/hexagon/mm/init.c20
-rw-r--r--arch/ia64/include/asm/pgtable.h1
-rw-r--r--arch/nios2/Kconfig3
-rw-r--r--arch/nios2/kernel/prom.c17
-rw-r--r--arch/nios2/kernel/setup.c39
-rw-r--r--arch/um/Kconfig2
-rw-r--r--arch/um/kernel/physmem.c22
-rw-r--r--arch/unicore32/Kconfig1
-rw-r--r--arch/unicore32/mm/init.c54
-rw-r--r--arch/x86/entry/vdso/vma.c24
-rw-r--r--block/genhd.c2
-rw-r--r--drivers/infiniband/hw/hfi1/mmu_rb.c1
-rw-r--r--drivers/iommu/amd_iommu_v2.c1
-rw-r--r--drivers/iommu/intel-svm.c1
-rw-r--r--drivers/misc/sgi-gru/grutlbpurge.c1
-rw-r--r--drivers/of/fdt.c11
-rw-r--r--drivers/xen/gntdev.c26
-rw-r--r--fs/bfs/inode.c9
-rw-r--r--fs/buffer.c50
-rw-r--r--fs/cramfs/inode.c5
-rw-r--r--fs/hfs/brec.c5
-rw-r--r--fs/hfs/btree.c39
-rw-r--r--fs/hfs/btree.h1
-rw-r--r--fs/hfs/catalog.c16
-rw-r--r--fs/hfs/extent.c4
-rw-r--r--fs/hfsplus/attributes.c10
-rw-r--r--fs/hfsplus/brec.c5
-rw-r--r--fs/hfsplus/btree.c42
-rw-r--r--fs/hfsplus/catalog.c24
-rw-r--r--fs/hfsplus/extents.c4
-rw-r--r--fs/hfsplus/hfsplus_fs.h2
-rw-r--r--fs/iomap.c2
-rw-r--r--fs/ocfs2/buffer_head_io.c77
-rw-r--r--fs/ocfs2/dir.c3
-rw-r--r--fs/ocfs2/dlmglue.c28
-rw-r--r--fs/ocfs2/file.c4
-rw-r--r--fs/ocfs2/journal.c51
-rw-r--r--fs/ocfs2/move_extents.c57
-rw-r--r--fs/ocfs2/refcounttree.c16
-rw-r--r--fs/ocfs2/stackglue.c6
-rw-r--r--fs/ocfs2/stackglue.h3
-rw-r--r--fs/proc/kcore.c3
-rw-r--r--fs/reiserfs/xattr.c7
-rw-r--r--include/asm-generic/pgtable.h4
-rw-r--r--include/linux/bitmap.h37
-rw-r--r--include/linux/hmm.h2
-rw-r--r--include/linux/hugetlb.h14
-rw-r--r--include/linux/iomap.h4
-rw-r--r--include/linux/math64.h2
-rw-r--r--include/linux/memblock.h2
-rw-r--r--include/linux/memcontrol.h13
-rw-r--r--include/linux/mm.h32
-rw-r--r--include/linux/mmu_notifier.h27
-rw-r--r--include/linux/mmzone.h16
-rw-r--r--include/linux/sched.h7
-rw-r--r--include/linux/swap.h13
-rw-r--r--init/do_mounts.c31
-rw-r--r--kernel/fork.c55
-rw-r--r--kernel/pid.c2
-rw-r--r--lib/Kconfig.debug6
-rw-r--r--lib/bitmap.c22
-rw-r--r--lib/parser.c11
-rw-r--r--mm/Kconfig4
-rw-r--r--mm/backing-dev.c35
-rw-r--r--mm/filemap.c4
-rw-r--r--mm/hugetlb.c93
-rw-r--r--mm/kmemleak.c42
-rw-r--r--mm/list_lru.c7
-rw-r--r--mm/memblock.c81
-rw-r--r--mm/memcontrol.c2
-rw-r--r--mm/memory.c126
-rw-r--r--mm/mempolicy.c24
-rw-r--r--mm/mincore.c12
-rw-r--r--mm/mmu_notifier.c31
-rw-r--r--mm/page_alloc.c162
-rw-r--r--mm/page_owner.c4
-rw-r--r--mm/rmap.c42
-rw-r--r--mm/slub.c20
-rw-r--r--mm/swap_state.c16
-rw-r--r--mm/swapfile.c225
-rw-r--r--mm/vmscan.c6
-rw-r--r--mm/z3fold.c8
-rwxr-xr-xscripts/checkpatch.pl11
-rw-r--r--virt/kvm/kvm_main.c1
94 files changed, 1296 insertions, 901 deletions
diff --git a/Documentation/ABI/testing/sysfs-class-bdi b/Documentation/ABI/testing/sysfs-class-bdi
index d773d5697cf5..3187a18af6da 100644
--- a/Documentation/ABI/testing/sysfs-class-bdi
+++ b/Documentation/ABI/testing/sysfs-class-bdi
@@ -53,3 +53,11 @@ stable_pages_required (read-only)
If set, the backing device requires that all pages comprising a write
request must not be changed until writeout is complete.
+
+strictlimit (read-write)
+
+ Forces per-BDI checks for the share of given device in the write-back
+ cache even before the global background dirty limit is reached. This
+ is useful in situations where the global limit is much higher than
+ affordable for given relatively slow (or untrusted) device. Turning
+ strictlimit on has no visible effect if max_ratio is equal to 100%.
diff --git a/Documentation/x86/pat.txt b/Documentation/x86/pat.txt
index 2a4ee6302122..481d8d8536ac 100644
--- a/Documentation/x86/pat.txt
+++ b/Documentation/x86/pat.txt
@@ -90,12 +90,12 @@ pci proc | -- | -- | WC |
Advanced APIs for drivers
-------------------------
A. Exporting pages to users with remap_pfn_range, io_remap_pfn_range,
-vm_insert_pfn
+vmf_insert_pfn
Drivers wanting to export some pages to userspace do it by using mmap
interface and a combination of
1) pgprot_noncached()
-2) io_remap_pfn_range() or remap_pfn_range() or vm_insert_pfn()
+2) io_remap_pfn_range() or remap_pfn_range() or vmf_insert_pfn()
With PAT support, a new API pgprot_writecombine is being added. So, drivers can
continue to use the above sequence, with either pgprot_noncached() or
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 5b4f88363453..620b0a711ee4 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -31,6 +31,8 @@ config ALPHA
select ODD_RT_SIGACTION
select OLD_SIGSUSPEND
select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67
+ select HAVE_MEMBLOCK
+ select NO_BOOTMEM
help
The Alpha is a 64-bit general-purpose processor designed and
marketed by the Digital Equipment Corporation of blessed memory,
diff --git a/arch/alpha/kernel/core_irongate.c b/arch/alpha/kernel/core_irongate.c
index aec757250e07..f70986683fc6 100644
--- a/arch/alpha/kernel/core_irongate.c
+++ b/arch/alpha/kernel/core_irongate.c
@@ -21,6 +21,7 @@
#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <asm/ptrace.h>
#include <asm/cacheflush.h>
@@ -241,8 +242,7 @@ albacore_init_arch(void)
size / 1024);
}
#endif
- reserve_bootmem_node(NODE_DATA(0), pci_mem, memtop -
- pci_mem, BOOTMEM_DEFAULT);
+ memblock_reserve(pci_mem, memtop - pci_mem);
printk("irongate_init_arch: temporarily reserving "
"region %08lx-%08lx for PCI\n", pci_mem, memtop - 1);
}
diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c
index 5576f7646fb6..4f0d94471bc9 100644
--- a/arch/alpha/kernel/setup.c
+++ b/arch/alpha/kernel/setup.c
@@ -30,6 +30,7 @@
#include <linux/ioport.h>
#include <linux/platform_device.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/pci.h>
#include <linux/seq_file.h>
#include <linux/root_dev.h>
@@ -312,9 +313,7 @@ setup_memory(void *kernel_end)
{
struct memclust_struct * cluster;
struct memdesc_struct * memdesc;
- unsigned long start_kernel_pfn, end_kernel_pfn;
- unsigned long bootmap_size, bootmap_pages, bootmap_start;
- unsigned long start, end;
+ unsigned long kernel_size;
unsigned long i;
/* Find free clusters, and init and free the bootmem accordingly. */
@@ -322,6 +321,8 @@ setup_memory(void *kernel_end)
(hwrpb->mddt_offset + (unsigned long) hwrpb);
for_each_mem_cluster(memdesc, cluster, i) {
+ unsigned long end;
+
printk("memcluster %lu, usage %01lx, start %8lu, end %8lu\n",
i, cluster->usage, cluster->start_pfn,
cluster->start_pfn + cluster->numpages);
@@ -335,6 +336,9 @@ setup_memory(void *kernel_end)
end = cluster->start_pfn + cluster->numpages;
if (end > max_low_pfn)
max_low_pfn = end;
+
+ memblock_add(PFN_PHYS(cluster->start_pfn),
+ cluster->numpages << PAGE_SHIFT);
}
/*
@@ -363,87 +367,9 @@ setup_memory(void *kernel_end)
max_low_pfn = mem_size_limit;
}
- /* Find the bounds of kernel memory. */
- start_kernel_pfn = PFN_DOWN(KERNEL_START_PHYS);
- end_kernel_pfn = PFN_UP(virt_to_phys(kernel_end));
- bootmap_start = -1;
-
- try_again:
- if (max_low_pfn <= end_kernel_pfn)
- panic("not enough memory to boot");
-
- /* We need to know how many physically contiguous pages
- we'll need for the bootmap. */
- bootmap_pages = bootmem_bootmap_pages(max_low_pfn);
-
- /* Now find a good region where to allocate the bootmap. */
- for_each_mem_cluster(memdesc, cluster, i) {
- if (cluster->usage & 3)
- continue;
-
- start = cluster->start_pfn;
- end = start + cluster->numpages;
- if (start >= max_low_pfn)
- continue;
- if (end > max_low_pfn)
- end = max_low_pfn;
- if (start < start_kernel_pfn) {
- if (end > end_kernel_pfn
- && end - end_kernel_pfn >= bootmap_pages) {
- bootmap_start = end_kernel_pfn;
- break;
- } else if (end > start_kernel_pfn)
- end = start_kernel_pfn;
- } else if (start < end_kernel_pfn)
- start = end_kernel_pfn;
- if (end - start >= bootmap_pages) {
- bootmap_start = start;
- break;
- }
- }
-
- if (bootmap_start == ~0UL) {
- max_low_pfn >>= 1;
- goto try_again;
- }
-
- /* Allocate the bootmap and mark the whole MM as reserved. */
- bootmap_size = init_bootmem(bootmap_start, max_low_pfn);
-
- /* Mark the free regions. */
- for_each_mem_cluster(memdesc, cluster, i) {
- if (cluster->usage & 3)
- continue;
-
- start = cluster->start_pfn;
- end = cluster->start_pfn + cluster->numpages;
- if (start >= max_low_pfn)
- continue;
- if (end > max_low_pfn)
- end = max_low_pfn;
- if (start < start_kernel_pfn) {
- if (end > end_kernel_pfn) {
- free_bootmem(PFN_PHYS(start),
- (PFN_PHYS(start_kernel_pfn)
- - PFN_PHYS(start)));
- printk("freeing pages %ld:%ld\n",
- start, start_kernel_pfn);
- start = end_kernel_pfn;
- } else if (end > start_kernel_pfn)
- end = start_kernel_pfn;
- } else if (start < end_kernel_pfn)
- start = end_kernel_pfn;
- if (start >= end)
- continue;
-
- free_bootmem(PFN_PHYS(start), PFN_PHYS(end) - PFN_PHYS(start));
- printk("freeing pages %ld:%ld\n", start, end);
- }
-
- /* Reserve the bootmap memory. */
- reserve_bootmem(PFN_PHYS(bootmap_start), bootmap_size,
- BOOTMEM_DEFAULT);
- printk("reserving pages %ld:%ld\n", bootmap_start, bootmap_start+PFN_UP(bootmap_size));
+ /* Reserve the kernel memory. */
+ kernel_size = virt_to_phys(kernel_end) - KERNEL_START_PHYS;
+ memblock_reserve(KERNEL_START_PHYS, kernel_size);
#ifdef CONFIG_BLK_DEV_INITRD
initrd_start = INITRD_START;
@@ -459,8 +385,8 @@ setup_memory(void *kernel_end)
initrd_end,
phys_to_virt(PFN_PHYS(max_low_pfn)));
} else {
- reserve_bootmem(virt_to_phys((void *)initrd_start),
- INITRD_SIZE, BOOTMEM_DEFAULT);
+ memblock_reserve(virt_to_phys((void *)initrd_start),
+ INITRD_SIZE);
}
}
#endif /* CONFIG_BLK_DEV_INITRD */
diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c
index a9e86475f169..26cd925d19b1 100644
--- a/arch/alpha/mm/numa.c
+++ b/arch/alpha/mm/numa.c
@@ -11,6 +11,7 @@
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/swap.h>
#include <linux/initrd.h>
#include <linux/pfn.h>
@@ -59,12 +60,10 @@ setup_memory_node(int nid, void *kernel_end)
struct memclust_struct * cluster;
struct memdesc_struct * memdesc;
unsigned long start_kernel_pfn, end_kernel_pfn;
- unsigned long bootmap_size, bootmap_pages, bootmap_start;
unsigned long start, end;
unsigned long node_pfn_start, node_pfn_end;
unsigned long node_min_pfn, node_max_pfn;
int i;
- unsigned long node_datasz = PFN_UP(sizeof(pg_data_t));
int show_init = 0;
/* Find the bounds of current node */
@@ -134,24 +133,14 @@ setup_memory_node(int nid, void *kernel_end)
/* Cute trick to make sure our local node data is on local memory */
node_data[nid] = (pg_data_t *)(__va(node_min_pfn << PAGE_SHIFT));
#endif
- /* Quasi-mark the pg_data_t as in-use */
- node_min_pfn += node_datasz;
- if (node_min_pfn >= node_max_pfn) {
- printk(" not enough mem to reserve NODE_DATA");
- return;
- }
- NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
-
printk(" Detected node memory: start %8lu, end %8lu\n",
node_min_pfn, node_max_pfn);
DBGDCONT(" DISCONTIG: node_data[%d] is at 0x%p\n", nid, NODE_DATA(nid));
- DBGDCONT(" DISCONTIG: NODE_DATA(%d)->bdata is at 0x%p\n", nid, NODE_DATA(nid)->bdata);
/* Find the bounds of kernel memory. */
start_kernel_pfn = PFN_DOWN(KERNEL_START_PHYS);
end_kernel_pfn = PFN_UP(virt_to_phys(kernel_end));
- bootmap_start = -1;
if (!nid && (node_max_pfn < end_kernel_pfn || node_min_pfn > start_kernel_pfn))
panic("kernel loaded out of ram");
@@ -161,89 +150,11 @@ setup_memory_node(int nid, void *kernel_end)
has much larger alignment than 8Mb, so it's safe. */
node_min_pfn &= ~((1UL << (MAX_ORDER-1))-1);
- /* We need to know how many physically contiguous pages
- we'll need for the bootmap. */
- bootmap_pages = bootmem_bootmap_pages(node_max_pfn-node_min_pfn);
-
- /* Now find a good region where to allocate the bootmap. */
- for_each_mem_cluster(memdesc, cluster, i) {
- if (cluster->usage & 3)
- continue;
-
- start = cluster->start_pfn;
- end = start + cluster->numpages;
-
- if (start >= node_max_pfn || end <= node_min_pfn)
- continue;
-
- if (end > node_max_pfn)
- end = node_max_pfn;
- if (start < node_min_pfn)
- start = node_min_pfn;
-
- if (start < start_kernel_pfn) {
- if (end > end_kernel_pfn
- && end - end_kernel_pfn >= bootmap_pages) {
- bootmap_start = end_kernel_pfn;
- break;
- } else if (end > start_kernel_pfn)
- end = start_kernel_pfn;
- } else if (start < end_kernel_pfn)
- start = end_kernel_pfn;
- if (end - start >= bootmap_pages) {
- bootmap_start = start;
- break;
- }
- }
-
- if (bootmap_start == -1)
- panic("couldn't find a contiguous place for the bootmap");
-
- /* Allocate the bootmap and mark the whole MM as reserved. */
- bootmap_size = init_bootmem_node(NODE_DATA(nid), bootmap_start,
- node_min_pfn, node_max_pfn);
- DBGDCONT(" bootmap_start %lu, bootmap_size %lu, bootmap_pages %lu\n",
- bootmap_start, bootmap_size, bootmap_pages);
+ memblock_add(PFN_PHYS(node_min_pfn),
+ (node_max_pfn - node_min_pfn) << PAGE_SHIFT);
- /* Mark the free regions. */
- for_each_mem_cluster(memdesc, cluster, i) {
- if (cluster->usage & 3)
- continue;
-
- start = cluster->start_pfn;
- end = cluster->start_pfn + cluster->numpages;
-
- if (start >= node_max_pfn || end <= node_min_pfn)
- continue;
-
- if (end > node_max_pfn)
- end = node_max_pfn;
- if (start < node_min_pfn)
- start = node_min_pfn;
-
- if (start < start_kernel_pfn) {
- if (end > end_kernel_pfn) {
- free_bootmem_node(NODE_DATA(nid), PFN_PHYS(start),
- (PFN_PHYS(start_kernel_pfn)
- - PFN_PHYS(start)));
- printk(" freeing pages %ld:%ld\n",
- start, start_kernel_pfn);
- start = end_kernel_pfn;
- } else if (end > start_kernel_pfn)
- end = start_kernel_pfn;
- } else if (start < end_kernel_pfn)
- start = end_kernel_pfn;
- if (start >= end)
- continue;
-
- free_bootmem_node(NODE_DATA(nid), PFN_PHYS(start), PFN_PHYS(end) - PFN_PHYS(start));
- printk(" freeing pages %ld:%ld\n", start, end);
- }
-
- /* Reserve the bootmap memory. */
- reserve_bootmem_node(NODE_DATA(nid), PFN_PHYS(bootmap_start),
- bootmap_size, BOOTMEM_DEFAULT);
- printk(" reserving pages %ld:%ld\n", bootmap_start, bootmap_start+PFN_UP(bootmap_size));
+ NODE_DATA(nid)->node_start_pfn = node_min_pfn;
+ NODE_DATA(nid)->node_present_pages = node_max_pfn - node_min_pfn;
node_set_online(nid);
}
@@ -251,6 +162,7 @@ setup_memory_node(int nid, void *kernel_end)
void __init
setup_memory(void *kernel_end)
{
+ unsigned long kernel_size;
int nid;
show_mem_layout();
@@ -262,6 +174,9 @@ setup_memory(void *kernel_end)
for (nid = 0; nid < MAX_NUMNODES; nid++)
setup_memory_node(nid, kernel_end);
+ kernel_size = virt_to_phys(kernel_end) - KERNEL_START_PHYS;
+ memblock_reserve(KERNEL_START_PHYS, kernel_size);
+
#ifdef CONFIG_BLK_DEV_INITRD
initrd_start = INITRD_START;
if (initrd_start) {
@@ -279,9 +194,8 @@ setup_memory(void *kernel_end)
phys_to_virt(PFN_PHYS(max_low_pfn)));
} else {
nid = kvaddr_to_nid(initrd_start);
- reserve_bootmem_node(NODE_DATA(nid),
- virt_to_phys((void *)initrd_start),
- INITRD_SIZE, BOOTMEM_DEFAULT);
+ memblock_reserve(virt_to_phys((void *)initrd_start),
+ INITRD_SIZE);
}
}
#endif /* CONFIG_BLK_DEV_INITRD */
@@ -303,9 +217,8 @@ void __init paging_init(void)
dma_local_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
for_each_online_node(nid) {
- bootmem_data_t *bdata = &bootmem_node_data[nid];
- unsigned long start_pfn = bdata->node_min_pfn;
- unsigned long end_pfn = bdata->node_low_pfn;
+ unsigned long start_pfn = NODE_DATA(nid)->node_start_pfn;
+ unsigned long end_pfn = start_pfn + NODE_DATA(nid)->node_present_pages;
if (dma_local_pfn >= end_pfn - start_pfn)
zones_size[ZONE_DMA] = end_pfn - start_pfn;
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index e8cd55a5b04c..ed74be437e32 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1635,6 +1635,10 @@ config ARCH_SELECT_MEMORY_MODEL
config HAVE_ARCH_PFN_VALID
def_bool ARCH_HAS_HOLES_MEMORYMODEL || !SPARSEMEM
+config HAVE_MEMBLOCK_PFN_VALID
+ def_bool y
+ depends on HAVE_ARCH_PFN_VALID
+
config HAVE_GENERIC_GUP
def_bool y
depends on ARM_LPAE
diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h
index 4355f0ec44d6..f98baaec0a15 100644
--- a/arch/arm/include/asm/page.h
+++ b/arch/arm/include/asm/page.h
@@ -17,6 +17,8 @@
#ifndef __ASSEMBLY__
+#include <linux/personality.h> /* For READ_IMPLIES_EXEC */
+
#ifndef CONFIG_MMU
#include <asm/page-nommu.h>
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 1b1a0e95c751..c05ab9ec4f19 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -788,6 +788,10 @@ config ARCH_FLATMEM_ENABLE
config HAVE_ARCH_PFN_VALID
def_bool ARCH_HAS_HOLES_MEMORYMODEL || !SPARSEMEM
+config HAVE_MEMBLOCK_PFN_VALID
+ def_bool y
+ depends on HAVE_ARCH_PFN_VALID
+
config HW_PERF_EVENTS
def_bool y
depends on ARM_PMU
diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig
index 6cee842a9b44..dc82567c57e0 100644
--- a/arch/hexagon/Kconfig
+++ b/arch/hexagon/Kconfig
@@ -31,6 +31,9 @@ config HEXAGON
select MODULES_USE_ELF_RELA
select GENERIC_CPU_DEVICES
select DMA_NONCOHERENT_OPS
+ select HAVE_MEMBLOCK
+ select ARCH_DISCARD_MEMBLOCK
+ select NO_BOOTMEM
---help---
Qualcomm Hexagon is a processor architecture designed for high
performance and low power across a wide variety of applications.
diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c
index 1495d45e472d..d789b9cc0189 100644
--- a/arch/hexagon/mm/init.c
+++ b/arch/hexagon/mm/init.c
@@ -21,6 +21,7 @@
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <asm/atomic.h>
#include <linux/highmem.h>
#include <asm/tlb.h>
@@ -176,7 +177,6 @@ size_t hexagon_coherent_pool_size = (size_t) (DMA_RESERVE << 22);
void __init setup_arch_memory(void)
{
- int bootmap_size;
/* XXX Todo: this probably should be cleaned up */
u32 *segtable = (u32 *) &swapper_pg_dir[0];
u32 *segtable_end;
@@ -195,18 +195,22 @@ void __init setup_arch_memory(void)
bootmem_lastpg = PFN_DOWN((bootmem_lastpg << PAGE_SHIFT) &
~((BIG_KERNEL_PAGE_SIZE) - 1));
+ memblock_add(PHYS_OFFSET,
+ (bootmem_lastpg - ARCH_PFN_OFFSET) << PAGE_SHIFT);
+
+ /* Reserve kernel text/data/bss */
+ memblock_reserve(PHYS_OFFSET,
+ (bootmem_startpg - ARCH_PFN_OFFSET) << PAGE_SHIFT);
/*
* Reserve the top DMA_RESERVE bytes of RAM for DMA (uncached)
* memory allocation
*/
-
max_low_pfn = bootmem_lastpg - PFN_DOWN(DMA_RESERVED_BYTES);
min_low_pfn = ARCH_PFN_OFFSET;
- bootmap_size = init_bootmem_node(NODE_DATA(0), bootmem_startpg, min_low_pfn, max_low_pfn);
+ memblock_reserve(PFN_PHYS(max_low_pfn), DMA_RESERVED_BYTES);
printk(KERN_INFO "bootmem_startpg: 0x%08lx\n", bootmem_startpg);
printk(KERN_INFO "bootmem_lastpg: 0x%08lx\n", bootmem_lastpg);
- printk(KERN_INFO "bootmap_size: %d\n", bootmap_size);
printk(KERN_INFO "min_low_pfn: 0x%08lx\n", min_low_pfn);
printk(KERN_INFO "max_low_pfn: 0x%08lx\n", max_low_pfn);
@@ -257,14 +261,6 @@ void __init setup_arch_memory(void)
#endif
/*
- * Free all the memory that wasn't taken up by the bootmap, the DMA
- * reserve, or kernel itself.
- */
- free_bootmem(PFN_PHYS(bootmem_startpg) + bootmap_size,
- PFN_PHYS(bootmem_lastpg - bootmem_startpg) - bootmap_size -
- DMA_RESERVED_BYTES);
-
- /*
* The bootmem allocator seemingly just lives to feed memory
* to the paging system
*/
diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h
index 165827774bea..b1e7468eb65a 100644
--- a/arch/ia64/include/asm/pgtable.h
+++ b/arch/ia64/include/asm/pgtable.h
@@ -544,7 +544,6 @@ extern struct page *zero_page_memmap_ptr;
# ifdef CONFIG_VIRTUAL_MEM_MAP
/* arch mem_map init routine is needed due to holes in a virtual mem_map */
-# define __HAVE_ARCH_MEMMAP_INIT
extern void memmap_init (unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn);
# endif /* CONFIG_VIRTUAL_MEM_MAP */
diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig
index f4ad1138e6b9..5ddf272b6c81 100644
--- a/arch/nios2/Kconfig
+++ b/arch/nios2/Kconfig
@@ -23,6 +23,9 @@ config NIOS2
select SPARSE_IRQ
select USB_ARCH_HAS_HCD if USB_SUPPORT
select CPU_NO_EFFICIENT_FFS
+ select HAVE_MEMBLOCK
+ select ARCH_DISCARD_MEMBLOCK
+ select NO_BOOTMEM
config GENERIC_CSUM
def_bool y
diff --git a/arch/nios2/kernel/prom.c b/arch/nios2/kernel/prom.c
index 8d7446a4b475..a6d4f7530247 100644
--- a/arch/nios2/kernel/prom.c
+++ b/arch/nios2/kernel/prom.c
@@ -32,23 +32,6 @@
#include <asm/sections.h>
-void __init early_init_dt_add_memory_arch(u64 base, u64 size)
-{
- u64 kernel_start = (u64)virt_to_phys(_text);
-
- if (!memory_size &&
- (kernel_start >= base) && (kernel_start < (base + size)))
- memory_size = size;
-
-}
-
-int __init early_init_dt_reserve_memory_arch(phys_addr_t base, phys_addr_t size,
- bool nomap)
-{
- reserve_bootmem(base, size, BOOTMEM_DEFAULT);
- return 0;
-}
-
void __init early_init_devtree(void *params)
{
__be32 *dtb = (u32 *)__dtb_start;
diff --git a/arch/nios2/kernel/setup.c b/arch/nios2/kernel/setup.c
index 926a02b17b31..2d0011ddd4d5 100644
--- a/arch/nios2/kernel/setup.c
+++ b/arch/nios2/kernel/setup.c
@@ -17,6 +17,7 @@
#include <linux/sched/task.h>
#include <linux/console.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/initrd.h>
#include <linux/of_fdt.h>
#include <linux/screen_info.h>
@@ -143,10 +144,12 @@ asmlinkage void __init nios2_boot_init(unsigned r4, unsigned r5, unsigned r6,
void __init setup_arch(char **cmdline_p)
{
- int bootmap_size;
+ int dram_start;
console_verbose();
+ dram_start = memblock_start_of_DRAM();
+ memory_size = memblock_phys_mem_size();
memory_start = PAGE_ALIGN((unsigned long)__pa(_end));
memory_end = (unsigned long) CONFIG_NIOS2_MEM_BASE + memory_size;
@@ -163,39 +166,11 @@ void __init setup_arch(char **cmdline_p)
max_low_pfn = PFN_DOWN(memory_end);
max_mapnr = max_low_pfn;
- /*
- * give all the memory to the bootmap allocator, tell it to put the
- * boot mem_map at the start of memory
- */
- pr_debug("init_bootmem_node(?,%#lx, %#x, %#lx)\n",
- min_low_pfn, PFN_DOWN(PHYS_OFFSET), max_low_pfn);
- bootmap_size = init_bootmem_node(NODE_DATA(0),
- min_low_pfn, PFN_DOWN(PHYS_OFFSET),
- max_low_pfn);
-
- /*
- * free the usable memory, we have to make sure we do not free
- * the bootmem bitmap so we then reserve it after freeing it :-)
- */
- pr_debug("free_bootmem(%#lx, %#lx)\n",
- memory_start, memory_end - memory_start);
- free_bootmem(memory_start, memory_end - memory_start);
-
- /*
- * Reserve the bootmem bitmap itself as well. We do this in two
- * steps (first step was init_bootmem()) because this catches
- * the (very unlikely) case of us accidentally initializing the
- * bootmem allocator with an invalid RAM area.
- *
- * Arguments are start, size
- */
- pr_debug("reserve_bootmem(%#lx, %#x)\n", memory_start, bootmap_size);
- reserve_bootmem(memory_start, bootmap_size, BOOTMEM_DEFAULT);
-
+ memblock_reserve(dram_start, memory_start - dram_start);
#ifdef CONFIG_BLK_DEV_INITRD
if (initrd_start) {
- reserve_bootmem(virt_to_phys((void *)initrd_start),
- initrd_end - initrd_start, BOOTMEM_DEFAULT);
+ memblock_reserve(virt_to_phys((void *)initrd_start),
+ initrd_end - initrd_start);
}
#endif /* CONFIG_BLK_DEV_INITRD */
diff --git a/arch/um/Kconfig b/arch/um/Kconfig
index 6b9938919f0b..10c15b8853ae 100644
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -12,6 +12,8 @@ config UML
select HAVE_UID16
select HAVE_FUTEX_CMPXCHG if FUTEX
select HAVE_DEBUG_KMEMLEAK
+ select HAVE_MEMBLOCK
+ select NO_BOOTMEM
select GENERIC_IRQ_SHOW
select GENERIC_CPU_DEVICES
select GENERIC_CLOCKEVENTS
diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c
index f02596e9931d..296a91a04598 100644
--- a/arch/um/kernel/physmem.c
+++ b/arch/um/kernel/physmem.c
@@ -5,6 +5,7 @@
#include <linux/module.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/mm.h>
#include <linux/pfn.h>
#include <asm/page.h>
@@ -80,28 +81,23 @@ void __init setup_physmem(unsigned long start, unsigned long reserve_end,
unsigned long len, unsigned long long highmem)
{
unsigned long reserve = reserve_end - start;
- unsigned long pfn = PFN_UP(__pa(reserve_end));
- unsigned long delta = (len - reserve) >> PAGE_SHIFT;
- unsigned long offset, bootmap_size;
- long map_size;
+ long map_size = len - reserve;
int err;
- offset = uml_reserved - uml_physmem;
- map_size = len - offset;
if(map_size <= 0) {
os_warn("Too few physical memory! Needed=%lu, given=%lu\n",
- offset, len);
+ reserve, len);
exit(1);
}
physmem_fd = create_mem_file(len + highmem);
- err = os_map_memory((void *) uml_reserved, physmem_fd, offset,
+ err = os_map_memory((void *) reserve_end, physmem_fd, reserve,
map_size, 1, 1, 1);
if (err < 0) {
os_warn("setup_physmem - mapping %ld bytes of memory at 0x%p "
"failed - errno = %d\n", map_size,
- (void *) uml_reserved, err);
+ (void *) reserve_end, err);
exit(1);
}
@@ -113,9 +109,11 @@ void __init setup_physmem(unsigned long start, unsigned long reserve_end,
os_write_file(physmem_fd, __syscall_stub_start, PAGE_SIZE);
os_fsync_file(physmem_fd);
- bootmap_size = init_bootmem(pfn, pfn + delta);
- free_bootmem(__pa(reserve_end) + bootmap_size,
- len - bootmap_size - reserve);
+ memblock_add(__pa(start), len + highmem);
+ memblock_reserve(__pa(start), reserve);
+
+ min_low_pfn = PFN_UP(__pa(reserve_end));
+ max_low_pfn = min_low_pfn + (map_size >> PAGE_SHIFT);
}
int phys_mapping(unsigned long phys, unsigned long long *offset_out)
diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig
index 60eae744d8fd..6f38f7f28156 100644
--- a/arch/unicore32/Kconfig
+++ b/arch/unicore32/Kconfig
@@ -5,6 +5,7 @@ config UNICORE32
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
select HAVE_MEMBLOCK
+ select NO_BOOTMEM
select HAVE_GENERIC_DMA_COHERENT
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_BZIP2
diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c
index f4950fbfe574..44ccc156f63c 100644
--- a/arch/unicore32/mm/init.c
+++ b/arch/unicore32/mm/init.c
@@ -84,58 +84,6 @@ static void __init find_limits(unsigned long *min, unsigned long *max_low,
}
}
-static void __init uc32_bootmem_init(unsigned long start_pfn,
- unsigned long end_pfn)
-{
- struct memblock_region *reg;
- unsigned int boot_pages;
- phys_addr_t bitmap;
- pg_data_t *pgdat;
-
- /*
- * Allocate the bootmem bitmap page. This must be in a region
- * of memory which has already been mapped.
- */
- boot_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
- bitmap = memblock_alloc_base(boot_pages << PAGE_SHIFT, L1_CACHE_BYTES,
- __pfn_to_phys(end_pfn));
-
- /*
- * Initialise the bootmem allocator, handing the
- * memory banks over to bootmem.
- */
- node_set_online(0);
- pgdat = NODE_DATA(0);
- init_bootmem_node(pgdat, __phys_to_pfn(bitmap), start_pfn, end_pfn);
-
- /* Free the lowmem regions from memblock into bootmem. */
- for_each_memblock(memory, reg) {
- unsigned long start = memblock_region_memory_base_pfn(reg);
- unsigned long end = memblock_region_memory_end_pfn(reg);
-
- if (end >= end_pfn)
- end = end_pfn;
- if (start >= end)
- break;
-
- free_bootmem(__pfn_to_phys(start), (end - start) << PAGE_SHIFT);
- }
-
- /* Reserve the lowmem memblock reserved regions in bootmem. */
- for_each_memblock(reserved, reg) {
- unsigned long start = memblock_region_reserved_base_pfn(reg);
- unsigned long end = memblock_region_reserved_end_pfn(reg);
-
- if (end >= end_pfn)
- end = end_pfn;
- if (start >= end)
- break;
-
- reserve_bootmem(__pfn_to_phys(start),
- (end - start) << PAGE_SHIFT, BOOTMEM_DEFAULT);
- }
-}
-
static void __init uc32_bootmem_free(unsigned long min, unsigned long max_low,
unsigned long max_high)
{
@@ -232,7 +180,7 @@ void __init bootmem_init(void)
find_limits(&min, &max_low, &max_high);
- uc32_bootmem_init(min, max_low);
+ node_set_online(0);
#ifdef CONFIG_SWIOTLB
swiotlb_init(1);
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 5b8b556dbb12..d1daa53215a4 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -39,7 +39,7 @@ void __init init_vdso_image(const struct vdso_image *image)
struct linux_binprm;
-static int vdso_fault(const struct vm_special_mapping *sm,
+static vm_fault_t vdso_fault(const struct vm_special_mapping *sm,
struct vm_area_struct *vma, struct vm_fault *vmf)
{
const struct vdso_image *image = vma->vm_mm->context.vdso_image;
@@ -84,12 +84,11 @@ static int vdso_mremap(const struct vm_special_mapping *sm,
return 0;
}
-static int vvar_fault(const struct vm_special_mapping *sm,
+static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
struct vm_area_struct *vma, struct vm_fault *vmf)
{
const struct vdso_image *image = vma->vm_mm->context.vdso_image;
long sym_offset;
- int ret = -EFAULT;
if (!image)
return VM_FAULT_SIGBUS;
@@ -108,29 +107,24 @@ static int vvar_fault(const struct vm_special_mapping *sm,
return VM_FAULT_SIGBUS;
if (sym_offset == image->sym_vvar_page) {
- ret = vm_insert_pfn(vma, vmf->address,
- __pa_symbol(&__vvar_page) >> PAGE_SHIFT);
+ return vmf_insert_pfn(vma, vmf->address,
+ __pa_symbol(&__vvar_page) >> PAGE_SHIFT);
} else if (sym_offset == image->sym_pvclock_page) {
struct pvclock_vsyscall_time_info *pvti =
pvclock_get_pvti_cpu0_va();
if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) {
- ret = vm_insert_pfn_prot(
- vma,
- vmf->address,
- __pa(pvti) >> PAGE_SHIFT,
- pgprot_decrypted(vma->vm_page_prot));
+ return vmf_insert_pfn_prot(vma, vmf->address,
+ __pa(pvti) >> PAGE_SHIFT,
+ pgprot_decrypted(vma->vm_page_prot));
}
} else if (sym_offset == image->sym_hvclock_page) {
struct ms_hyperv_tsc_page *tsc_pg = hv_get_tsc_page();
if (tsc_pg && vclock_was_used(VCLOCK_HVCLOCK))
- ret = vm_insert_pfn(vma, vmf->address,
- vmalloc_to_pfn(tsc_pg));
+ return vmf_insert_pfn(vma, vmf->address,
+ vmalloc_to_pfn(tsc_pg));
}
- if (ret == 0 || ret == -EBUSY)
- return VM_FAULT_NOPAGE;
-
return VM_FAULT_SIGBUS;
}
diff --git a/block/genhd.c b/block/genhd.c
index 8cc719a37b32..434066b206a8 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1003,7 +1003,7 @@ static int show_partition(struct seq_file *seqf, void *v)
char buf[BDEVNAME_SIZE];
/* Don't show non-partitionable removeable devices or empty devices */
- if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
+ if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) &&
(sgp->flags & GENHD_FL_REMOVABLE)))
return 0;
if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
index e1c7996c018e..475b769e120c 100644
--- a/drivers/infiniband/hw/hfi1/mmu_rb.c
+++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
@@ -77,7 +77,6 @@ static void do_remove(struct mmu_rb_handler *handler,
static void handle_remove(struct work_struct *work);
static const struct mmu_notifier_ops mn_opts = {
- .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
.invalidate_range_start = mmu_notifier_range_start,
};
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
index 58da65df03f5..fd552235bd13 100644
--- a/drivers/iommu/amd_iommu_v2.c
+++ b/drivers/iommu/amd_iommu_v2.c
@@ -427,7 +427,6 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
}
static const struct mmu_notifier_ops iommu_mn = {
- .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
.release = mn_release,
.clear_flush_young = mn_clear_flush_young,
.invalidate_range = mn_invalidate_range,
diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index 4a03e5090952..db301efe126d 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -273,7 +273,6 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
}
static const struct mmu_notifier_ops intel_mmuops = {
- .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
.release = intel_mm_release,
.change_pte = intel_change_pte,
.invalidate_range = intel_invalidate_range,
diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
index be28f05bfafa..03b49d52092e 100644
--- a/drivers/misc/sgi-gru/grutlbpurge.c
+++ b/drivers/misc/sgi-gru/grutlbpurge.c
@@ -261,7 +261,6 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm)
static const struct mmu_notifier_ops gru_mmuops = {
- .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
.invalidate_range_start = gru_invalidate_range_start,
.invalidate_range_end = gru_invalidate_range_end,
.release = gru_release,
diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
index 800ad252cf9c..76c83c1ffeda 100644
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -1127,12 +1127,13 @@ void __init __weak early_init_dt_add_memory_arch(u64 base, u64 size)
{
const u64 phys_offset = MIN_MEMBLOCK_ADDR;
+ if (size < PAGE_SIZE - (base & ~PAGE_MASK)) {
+ pr_warn("Ignoring memory block 0x%llx - 0x%llx\n",
+ base, base + size);
+ return;
+ }
+
if (!PAGE_ALIGNED(base)) {
- if (size < PAGE_SIZE - (base & ~PAGE_MASK)) {
- pr_warn("Ignoring memory block 0x%llx - 0x%llx\n",
- base, base + size);
- return;
- }
size -= PAGE_SIZE - (base & ~PAGE_MASK);
base = PAGE_ALIGN(base);
}
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index 57390c7666e5..b0b02a501167 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -492,12 +492,19 @@ static bool in_range(struct gntdev_grant_map *map,
return true;
}
-static void unmap_if_in_range(struct gntdev_grant_map *map,
- unsigned long start, unsigned long end)
+static int unmap_if_in_range(struct gntdev_grant_map *map,
+ unsigned long start, unsigned long end,
+ bool blockable)
{
unsigned long mstart, mend;
int err;
+ if (!in_range(map, start, end))
+ return 0;
+
+ if (!blockable)
+ return -EAGAIN;
+
mstart = max(start, map->vma->vm_start);
mend = min(end, map->vma->vm_end);
pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n",
@@ -508,6 +515,8 @@ static void unmap_if_in_range(struct gntdev_grant_map *map,
(mstart - map->vma->vm_start) >> PAGE_SHIFT,
(mend - mstart) >> PAGE_SHIFT);
WARN_ON(err);
+
+ return 0;
}
static int mn_invl_range_start(struct mmu_notifier *mn,
@@ -519,25 +528,20 @@ static int mn_invl_range_start(struct mmu_notifier *mn,
struct gntdev_grant_map *map;
int ret = 0;
- /* TODO do we really need a mutex here? */
if (blockable)
mutex_lock(&priv->lock);
else if (!mutex_trylock(&priv->lock))
return -EAGAIN;
list_for_each_entry(map, &priv->maps, next) {
- if (in_range(map, start, end)) {
- ret = -EAGAIN;
+ ret = unmap_if_in_range(map, start, end, blockable);
+ if (ret)
goto out_unlock;
- }
- unmap_if_in_range(map, start, end);
}
list_for_each_entry(map, &priv->freeable_maps, next) {
- if (in_range(map, start, end)) {
- ret = -EAGAIN;
+ ret = unmap_if_in_range(map, start, end, blockable);
+ if (ret)
goto out_unlock;
- }
- unmap_if_in_range(map, start, end);
}
out_unlock:
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 6e76e4e762e8..f7ef2913bd9d 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -351,7 +351,8 @@ static int bfs_fill_super(struct super_block *s, void *data, size_t data_size,
s->s_magic = BFS_MAGIC;
- if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) {
+ if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end) ||
+ le32_to_cpu(bfs_sb->s_start) < BFS_BSIZE) {
printf("Superblock is corrupted\n");
goto out1;
}
@@ -360,9 +361,11 @@ static int bfs_fill_super(struct super_block *s, void *data, size_t data_size,
sizeof(struct bfs_inode)
+ BFS_ROOT_INO - 1;
imap_len = (info->si_lasti / 8) + 1;
- info->si_imap = kzalloc(imap_len, GFP_KERNEL);
- if (!info->si_imap)
+ info->si_imap = kzalloc(imap_len, GFP_KERNEL | __GFP_NOWARN);
+ if (!info->si_imap) {
+ printf("Cannot allocate %u bytes\n", imap_len);
goto out1;
+ }
for (i = 0; i < BFS_ROOT_INO; i++)
set_bit(i, info->si_imap);
diff --git a/fs/buffer.c b/fs/buffer.c
index 6f1ae3ac9789..08c99fc69025 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -954,10 +954,20 @@ grow_dev_page(struct block_device *bdev, sector_t block,
end_block = init_page_buffers(page, bdev,
(sector_t)index << sizebits,
size);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x01;
+#endif
goto done;
}
- if (!try_to_free_buffers(page))
+ if (!try_to_free_buffers(page)) {
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x02;
+#endif
goto failed;
+ }
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x04;
+#endif
}
/*
@@ -977,6 +987,9 @@ grow_dev_page(struct block_device *bdev, sector_t block,
spin_unlock(&inode->i_mapping->private_lock);
done:
ret = (block < end_block) ? 1 : -ENXIO;
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x08;
+#endif
failed:
unlock_page(page);
put_page(page);
@@ -1032,6 +1045,12 @@ __getblk_slow(struct block_device *bdev, sector_t block,
return NULL;
}
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_stamp = jiffies;
+ current->getblk_executed = 0;
+ current->getblk_bh_count = 0;
+ current->getblk_bh_state = 0;
+#endif
for (;;) {
struct buffer_head *bh;
int ret;
@@ -1043,6 +1062,18 @@ __getblk_slow(struct block_device *bdev, sector_t block,
ret = grow_buffers(bdev, block, size, gfp);
if (ret < 0)
return NULL;
+
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ if (!time_after(jiffies, current->getblk_stamp + 3 * HZ))
+ continue;
+ printk(KERN_ERR "%s(%u): getblk(): executed=%x bh_count=%d bh_state=%lx\n",
+ current->comm, current->pid, current->getblk_executed,
+ current->getblk_bh_count, current->getblk_bh_state);
+ current->getblk_executed = 0;
+ current->getblk_bh_count = 0;
+ current->getblk_bh_state = 0;
+ current->getblk_stamp = jiffies;
+#endif
}
}
@@ -3215,6 +3246,11 @@ EXPORT_SYMBOL(sync_dirty_buffer);
*/
static inline int buffer_busy(struct buffer_head *bh)
{
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x80;
+ current->getblk_bh_count = atomic_read(&bh->b_count);
+ current->getblk_bh_state = bh->b_state;
+#endif
return atomic_read(&bh->b_count) |
(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
}
@@ -3253,11 +3289,18 @@ int try_to_free_buffers(struct page *page)
int ret = 0;
BUG_ON(!PageLocked(page));
- if (PageWriteback(page))
+ if (PageWriteback(page)) {
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x10;
+#endif
return 0;
+ }
if (mapping == NULL) { /* can this still happen? */
ret = drop_buffers(page, &buffers_to_free);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x20;
+#endif
goto out;
}
@@ -3281,6 +3324,9 @@ int try_to_free_buffers(struct page *page)
if (ret)
cancel_dirty_page(page);
spin_unlock(&mapping->private_lock);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x40;
+#endif
out:
if (buffers_to_free) {
struct buffer_head *bh = buffers_to_free;
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 77d5cb62e76a..ae563d292942 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -418,9 +418,12 @@ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma)
int i;
vma->vm_flags |= VM_MIXEDMAP;
for (i = 0; i < pages && !ret; i++) {
+ vm_fault_t vmf;
unsigned long off = i * PAGE_SIZE;
pfn_t pfn = phys_to_pfn_t(address + off, PFN_DEV);
- ret = vm_insert_mixed(vma, vma->vm_start + off, pfn);
+ vmf = vmf_insert_mixed(vma, vma->vm_start + off, pfn);
+ if (vmf & VM_FAULT_ERROR)
+ ret = vm_fault_to_errno(vmf, 0);
}
}
diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c
index 9a8772465a90..896396554bcc 100644
--- a/fs/hfs/brec.c
+++ b/fs/hfs/brec.c
@@ -425,6 +425,10 @@ skip:
if (new_node) {
__be32 cnid;
+ if (!new_node->parent) {
+ hfs_btree_inc_height(tree);
+ new_node->parent = tree->root;
+ }
fd->bnode = hfs_bnode_find(tree, new_node->parent);
/* create index key and entry */
hfs_bnode_read_key(new_node, fd->search_key, 14);
@@ -441,6 +445,7 @@ skip:
/* restore search_key */
hfs_bnode_read_key(node, fd->search_key, 14);
}
+ new_node = NULL;
}
if (!rec && node->parent)
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 374b5688e29e..66c9e7f7170c 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -220,25 +220,17 @@ static struct hfs_bnode *hfs_bmap_new_bmap(struct hfs_bnode *prev, u32 idx)
return node;
}
-struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
+/* Make sure @tree has enough space for the @rsvd_nodes */
+int hfs_bmap_reserve(struct hfs_btree *tree, int rsvd_nodes)
{
- struct hfs_bnode *node, *next_node;
- struct page **pagep;
- u32 nidx, idx;
- unsigned off;
- u16 off16;
- u16 len;
- u8 *data, byte, m;
- int i;
-
- while (!tree->free_nodes) {
- struct inode *inode = tree->inode;
- u32 count;
- int res;
+ struct inode *inode = tree->inode;
+ u32 count;
+ int res;
+ while (tree->free_nodes < rsvd_nodes) {
res = hfs_extend_file(inode);
if (res)
- return ERR_PTR(res);
+ return res;
HFS_I(inode)->phys_size = inode->i_size =
(loff_t)HFS_I(inode)->alloc_blocks *
HFS_SB(tree->sb)->alloc_blksz;
@@ -249,6 +241,23 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
tree->free_nodes = count - tree->node_count;
tree->node_count = count;
}
+ return 0;
+}
+
+struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
+{
+ struct hfs_bnode *node, *next_node;
+ struct page **pagep;
+ u32 nidx, idx;
+ unsigned off;
+ u16 off16;
+ u16 len;
+ u8 *data, byte, m;
+ int i, res;
+
+ res = hfs_bmap_reserve(tree, 1);
+ if (res)
+ return ERR_PTR(res);
nidx = 0;
node = hfs_bnode_find(tree, nidx);
diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h
index c8b252dbb26c..dcc2aab1b2c4 100644
--- a/fs/hfs/btree.h
+++ b/fs/hfs/btree.h
@@ -82,6 +82,7 @@ struct hfs_find_data {
extern struct hfs_btree *hfs_btree_open(struct super_block *, u32, btree_keycmp);
extern void hfs_btree_close(struct hfs_btree *);
extern void hfs_btree_write(struct hfs_btree *);
+extern int hfs_bmap_reserve(struct hfs_btree *, int);
extern struct hfs_bnode * hfs_bmap_alloc(struct hfs_btree *);
extern void hfs_bmap_free(struct hfs_bnode *node);
diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c
index 8a66405b0f8b..d365bf0b8c77 100644
--- a/fs/hfs/catalog.c
+++ b/fs/hfs/catalog.c
@@ -97,6 +97,14 @@ int hfs_cat_create(u32 cnid, struct inode *dir, const struct qstr *str, struct i
if (err)
return err;
+ /*
+ * Fail early and avoid ENOSPC during the btree operations. We may
+ * have to split the root node at most once.
+ */
+ err = hfs_bmap_reserve(fd.tree, 2 * fd.tree->depth);
+ if (err)
+ goto err2;
+
hfs_cat_build_key(sb, fd.search_key, cnid, NULL);
entry_size = hfs_cat_build_thread(sb, &entry, S_ISDIR(inode->i_mode) ?
HFS_CDR_THD : HFS_CDR_FTH,
@@ -295,6 +303,14 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name,
return err;
dst_fd = src_fd;
+ /*
+ * Fail early and avoid ENOSPC during the btree operations. We may
+ * have to split the root node at most once.
+ */
+ err = hfs_bmap_reserve(src_fd.tree, 2 * src_fd.tree->depth);
+ if (err)
+ goto out;
+
/* find the old dir entry and read the data */
hfs_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name);
err = hfs_brec_find(&src_fd);
diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c
index 5d0182654580..0c638c612152 100644
--- a/fs/hfs/extent.c
+++ b/fs/hfs/extent.c
@@ -117,6 +117,10 @@ static int __hfs_ext_write_extent(struct inode *inode, struct hfs_find_data *fd)
if (HFS_I(inode)->flags & HFS_FLG_EXT_NEW) {
if (res != -ENOENT)
return res;
+ /* Fail early and avoid ENOSPC during the btree operation */
+ res = hfs_bmap_reserve(fd->tree, fd->tree->depth + 1);
+ if (res)
+ return res;
hfs_brec_insert(fd, HFS_I(inode)->cached_extents, sizeof(hfs_extent_rec));
HFS_I(inode)->flags &= ~(HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW);
} else {
diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c
index 2bab6b3cdba4..e6d554476db4 100644
--- a/fs/hfsplus/attributes.c
+++ b/fs/hfsplus/attributes.c
@@ -217,6 +217,11 @@ int hfsplus_create_attr(struct inode *inode,
if (err)
goto failed_init_create_attr;
+ /* Fail early and avoid ENOSPC during the btree operation */
+ err = hfs_bmap_reserve(fd.tree, fd.tree->depth + 1);
+ if (err)
+ goto failed_create_attr;
+
if (name) {
err = hfsplus_attr_build_key(sb, fd.search_key,
inode->i_ino, name);
@@ -313,6 +318,11 @@ int hfsplus_delete_attr(struct inode *inode, const char *name)
if (err)
return err;
+ /* Fail early and avoid ENOSPC during the btree operation */
+ err = hfs_bmap_reserve(fd.tree, fd.tree->depth);
+ if (err)
+ goto out;
+
if (name) {
err = hfsplus_attr_build_key(sb, fd.search_key,
inode->i_ino, name);
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index ed8eacb34452..1918544a7871 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -429,6 +429,10 @@ skip:
if (new_node) {
__be32 cnid;
+ if (!new_node->parent) {
+ hfs_btree_inc_height(tree);
+ new_node->parent = tree->root;
+ }
fd->bnode = hfs_bnode_find(tree, new_node->parent);
/* create index key and entry */
hfs_bnode_read_key(new_node, fd->search_key, 14);
@@ -445,6 +449,7 @@ skip:
/* restore search_key */
hfs_bnode_read_key(node, fd->search_key, 14);
}
+ new_node = NULL;
}
if (!rec && node->parent)
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index de14b2b6881b..d20902f3e015 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -342,26 +342,21 @@ static struct hfs_bnode *hfs_bmap_new_bmap(struct hfs_bnode *prev, u32 idx)
return node;
}
-struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
+/* Make sure @tree has enough space for the @rsvd_nodes */
+int hfs_bmap_reserve(struct hfs_btree *tree, int rsvd_nodes)
{
- struct hfs_bnode *node, *next_node;
- struct page **pagep;
- u32 nidx, idx;
- unsigned off;
- u16 off16;
- u16 len;
- u8 *data, byte, m;
- int i;
+ struct inode *inode = tree->inode;
+ struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
+ u32 count;
+ int res;
- while (!tree->free_nodes) {
- struct inode *inode = tree->inode;
- struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
- u32 count;
- int res;
+ if (rsvd_nodes <= 0)
+ return 0;
+ while (tree->free_nodes < rsvd_nodes) {
res = hfsplus_file_extend(inode, hfs_bnode_need_zeroout(tree));
if (res)
- return ERR_PTR(res);
+ return res;
hip->phys_size = inode->i_size =
(loff_t)hip->alloc_blocks <<
HFSPLUS_SB(tree->sb)->alloc_blksz_shift;
@@ -372,6 +367,23 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
tree->free_nodes = count - tree->node_count;
tree->node_count = count;
}
+ return 0;
+}
+
+struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
+{
+ struct hfs_bnode *node, *next_node;
+ struct page **pagep;
+ u32 nidx, idx;
+ unsigned off;
+ u16 off16;
+ u16 len;
+ u8 *data, byte, m;
+ int i, res;
+
+ res = hfs_bmap_reserve(tree, 1);
+ if (res)
+ return ERR_PTR(res);
nidx = 0;
node = hfs_bnode_find(tree, nidx);
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index a196369ba779..35472cba750e 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -265,6 +265,14 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
if (err)
return err;
+ /*
+ * Fail early and avoid ENOSPC during the btree operations. We may
+ * have to split the root node at most once.
+ */
+ err = hfs_bmap_reserve(fd.tree, 2 * fd.tree->depth);
+ if (err)
+ goto err2;
+
hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid);
entry_size = hfsplus_fill_cat_thread(sb, &entry,
S_ISDIR(inode->i_mode) ?
@@ -333,6 +341,14 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, const struct qstr *str)
if (err)
return err;
+ /*
+ * Fail early and avoid ENOSPC during the btree operations. We may
+ * have to split the root node at most once.
+ */
+ err = hfs_bmap_reserve(fd.tree, 2 * (int)fd.tree->depth - 2);
+ if (err)
+ goto out;
+
if (!str) {
int len;
@@ -433,6 +449,14 @@ int hfsplus_rename_cat(u32 cnid,
return err;
dst_fd = src_fd;
+ /*
+ * Fail early and avoid ENOSPC during the btree operations. We may
+ * have to split the root node at most twice.
+ */
+ err = hfs_bmap_reserve(src_fd.tree, 4 * (int)src_fd.tree->depth - 1);
+ if (err)
+ goto out;
+
/* find the old dir entry and read the data */
err = hfsplus_cat_build_key(sb, src_fd.search_key,
src_dir->i_ino, src_name);
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 8e0f59767694..8a8893d522ef 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -100,6 +100,10 @@ static int __hfsplus_ext_write_extent(struct inode *inode,
if (hip->extent_state & HFSPLUS_EXT_NEW) {
if (res != -ENOENT)
return res;
+ /* Fail early and avoid ENOSPC during the btree operation */
+ res = hfs_bmap_reserve(fd->tree, fd->tree->depth + 1);
+ if (res)
+ return res;
hfs_brec_insert(fd, hip->cached_extents,
sizeof(hfsplus_extent_rec));
hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 8e039435958a..dd7ad9f13e3a 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -311,6 +311,7 @@ static inline unsigned short hfsplus_min_io_size(struct super_block *sb)
#define hfs_btree_open hfsplus_btree_open
#define hfs_btree_close hfsplus_btree_close
#define hfs_btree_write hfsplus_btree_write
+#define hfs_bmap_reserve hfsplus_bmap_reserve
#define hfs_bmap_alloc hfsplus_bmap_alloc
#define hfs_bmap_free hfsplus_bmap_free
#define hfs_bnode_read hfsplus_bnode_read
@@ -395,6 +396,7 @@ u32 hfsplus_calc_btree_clump_size(u32 block_size, u32 node_size, u64 sectors,
struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id);
void hfs_btree_close(struct hfs_btree *tree);
int hfs_btree_write(struct hfs_btree *tree);
+int hfs_bmap_reserve(struct hfs_btree *tree, int rsvd_nodes);
struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree);
void hfs_bmap_free(struct hfs_bnode *node);
diff --git a/fs/iomap.c b/fs/iomap.c
index 74762b1ec233..9fb78e5090a1 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1056,7 +1056,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
return length;
}
-int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
+vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
{
struct page *page = vmf->page;
struct inode *inode = file_inode(vmf->vma->vm_file);
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index d9ebe11c8990..9f0304f89fc3 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -99,25 +99,34 @@ out:
return ret;
}
+/* Caller must provide a bhs[] with all NULL or non-NULL entries, so it
+ * will be easier to handle read failure.
+ */
int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
unsigned int nr, struct buffer_head *bhs[])
{
int status = 0;
unsigned int i;
struct buffer_head *bh;
+ int new_bh = 0;
trace_ocfs2_read_blocks_sync((unsigned long long)block, nr);
if (!nr)
goto bail;
+ /* Don't put buffer head and re-assign it to NULL if it is allocated
+ * outside since the caller can't be aware of this alternation!
+ */
+ new_bh = (bhs[0] == NULL);
+
for (i = 0 ; i < nr ; i++) {
if (bhs[i] == NULL) {
bhs[i] = sb_getblk(osb->sb, block++);
if (bhs[i] == NULL) {
status = -ENOMEM;
mlog_errno(status);
- goto bail;
+ break;
}
}
bh = bhs[i];
@@ -158,9 +167,26 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
submit_bh(REQ_OP_READ, 0, bh);
}
+read_failure:
for (i = nr; i > 0; i--) {
bh = bhs[i - 1];
+ if (unlikely(status)) {
+ if (new_bh && bh) {
+ /* If middle bh fails, let previous bh
+ * finish its read and then put it to
+ * aovoid bh leak
+ */
+ if (!buffer_jbd(bh))
+ wait_on_buffer(bh);
+ put_bh(bh);
+ bhs[i - 1] = NULL;
+ } else if (bh && buffer_uptodate(bh)) {
+ clear_buffer_uptodate(bh);
+ }
+ continue;
+ }
+
/* No need to wait on the buffer if it's managed by JBD. */
if (!buffer_jbd(bh))
wait_on_buffer(bh);
@@ -170,8 +196,7 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
* so we can safely record this and loop back
* to cleanup the other buffers. */
status = -EIO;
- put_bh(bh);
- bhs[i - 1] = NULL;
+ goto read_failure;
}
}
@@ -179,6 +204,9 @@ bail:
return status;
}
+/* Caller must provide a bhs[] with all NULL or non-NULL entries, so it
+ * will be easier to handle read failure.
+ */
int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
struct buffer_head *bhs[], int flags,
int (*validate)(struct super_block *sb,
@@ -188,6 +216,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
int i, ignore_cache = 0;
struct buffer_head *bh;
struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+ int new_bh = 0;
trace_ocfs2_read_blocks_begin(ci, (unsigned long long)block, nr, flags);
@@ -213,6 +242,11 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
goto bail;
}
+ /* Don't put buffer head and re-assign it to NULL if it is allocated
+ * outside since the caller can't be aware of this alternation!
+ */
+ new_bh = (bhs[0] == NULL);
+
ocfs2_metadata_cache_io_lock(ci);
for (i = 0 ; i < nr ; i++) {
if (bhs[i] == NULL) {
@@ -221,7 +255,8 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
ocfs2_metadata_cache_io_unlock(ci);
status = -ENOMEM;
mlog_errno(status);
- goto bail;
+ /* Don't forget to put previous bh! */
+ break;
}
}
bh = bhs[i];
@@ -316,16 +351,27 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
}
}
- status = 0;
-
+read_failure:
for (i = (nr - 1); i >= 0; i--) {
bh = bhs[i];
if (!(flags & OCFS2_BH_READAHEAD)) {
- if (status) {
- /* Clear the rest of the buffers on error */
- put_bh(bh);
- bhs[i] = NULL;
+ if (unlikely(status)) {
+ /* Clear the buffers on error including those
+ * ever succeeded in reading
+ */
+ if (new_bh && bh) {
+ /* If middle bh fails, let previous bh
+ * finish its read and then put it to
+ * aovoid bh leak
+ */
+ if (!buffer_jbd(bh))
+ wait_on_buffer(bh);
+ put_bh(bh);
+ bhs[i] = NULL;
+ } else if (bh && buffer_uptodate(bh)) {
+ clear_buffer_uptodate(bh);
+ }
continue;
}
/* We know this can't have changed as we hold the
@@ -342,9 +388,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
* for this bh as it's not marked locally
* uptodate. */
status = -EIO;
- put_bh(bh);
- bhs[i] = NULL;
- continue;
+ goto read_failure;
}
if (buffer_needs_validate(bh)) {
@@ -354,11 +398,8 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
BUG_ON(buffer_jbd(bh));
clear_buffer_needs_validate(bh);
status = validate(sb, bh);
- if (status) {
- put_bh(bh);
- bhs[i] = NULL;
- continue;
- }
+ if (status)
+ goto read_failure;
}
}
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index b048d4fa3959..c121abbdfc7d 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1897,8 +1897,7 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
/* On error, skip the f_pos to the
next block. */
ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1;
- brelse(bh);
- continue;
+ break;
}
if (le64_to_cpu(de->inode)) {
unsigned char d_type = DT_UNKNOWN;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 8e712b614e6e..040ddb6e7dab 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2121,10 +2121,10 @@ static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
/* LVB only has room for 64 bits of time here so we pack it for
* now. */
-static u64 ocfs2_pack_timespec(struct timespec *spec)
+static u64 ocfs2_pack_timespec(struct timespec64 *spec)
{
u64 res;
- u64 sec = spec->tv_sec;
+ u64 sec = clamp_t(time64_t, spec->tv_sec, 0, 0x3ffffffffull);
u32 nsec = spec->tv_nsec;
res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK);
@@ -2140,7 +2140,6 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
struct ocfs2_meta_lvb *lvb;
- struct timespec ts;
lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
@@ -2161,15 +2160,12 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
lvb->lvb_igid = cpu_to_be32(i_gid_read(inode));
lvb->lvb_imode = cpu_to_be16(inode->i_mode);
lvb->lvb_inlink = cpu_to_be16(inode->i_nlink);
- ts = timespec64_to_timespec(inode->i_atime);
lvb->lvb_iatime_packed =
- cpu_to_be64(ocfs2_pack_timespec(&ts));
- ts = timespec64_to_timespec(inode->i_ctime);
+ cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime));
lvb->lvb_ictime_packed =
- cpu_to_be64(ocfs2_pack_timespec(&ts));
- ts = timespec64_to_timespec(inode->i_mtime);
+ cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime));
lvb->lvb_imtime_packed =
- cpu_to_be64(ocfs2_pack_timespec(&ts));
+ cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));
lvb->lvb_iattr = cpu_to_be32(oi->ip_attr);
lvb->lvb_idynfeatures = cpu_to_be16(oi->ip_dyn_features);
lvb->lvb_igeneration = cpu_to_be32(inode->i_generation);
@@ -2178,7 +2174,7 @@ out:
mlog_meta_lvb(0, lockres);
}
-static void ocfs2_unpack_timespec(struct timespec *spec,
+static void ocfs2_unpack_timespec(struct timespec64 *spec,
u64 packed_time)
{
spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT;
@@ -2187,7 +2183,6 @@ static void ocfs2_unpack_timespec(struct timespec *spec,
static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
{
- struct timespec ts;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
struct ocfs2_meta_lvb *lvb;
@@ -2215,15 +2210,12 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
i_gid_write(inode, be32_to_cpu(lvb->lvb_igid));
inode->i_mode = be16_to_cpu(lvb->lvb_imode);
set_nlink(inode, be16_to_cpu(lvb->lvb_inlink));
- ocfs2_unpack_timespec(&ts,
+ ocfs2_unpack_timespec(&inode->i_atime,
be64_to_cpu(lvb->lvb_iatime_packed));
- inode->i_atime = timespec_to_timespec64(ts);
- ocfs2_unpack_timespec(&ts,
+ ocfs2_unpack_timespec(&inode->i_mtime,
be64_to_cpu(lvb->lvb_imtime_packed));
- inode->i_mtime = timespec_to_timespec64(ts);
- ocfs2_unpack_timespec(&ts,
+ ocfs2_unpack_timespec(&inode->i_ctime,
be64_to_cpu(lvb->lvb_ictime_packed));
- inode->i_ctime = timespec_to_timespec64(ts);
spin_unlock(&oi->ip_lock);
}
@@ -3601,7 +3593,7 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
* we can recover correctly from node failure. Otherwise, we may get
* invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set.
*/
- if (!ocfs2_is_o2cb_active() &&
+ if (ocfs2_userspace_stack(osb) &&
lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
lvb = 1;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 9fa35cb6f6e0..1ac9547a2cd7 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2343,7 +2343,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
written = __generic_file_write_iter(iocb, from);
/* buffered aio wouldn't have proper lock coverage today */
- BUG_ON(written == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT));
+ BUG_ON(written == -EIOCBQUEUED && !direct_io);
/*
* deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
@@ -2463,7 +2463,7 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
trace_generic_file_read_iter_ret(ret);
/* buffered aio wouldn't have proper lock coverage today */
- BUG_ON(ret == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT));
+ BUG_ON(ret == -EIOCBQUEUED && !direct_io);
/* see ocfs2_file_write_iter */
if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index bd3475694e83..b63c97f4318e 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1378,15 +1378,23 @@ static int __ocfs2_recovery_thread(void *arg)
int rm_quota_used = 0, i;
struct ocfs2_quota_recovery *qrec;
+ /* Whether the quota supported. */
+ int quota_enabled = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb,
+ OCFS2_FEATURE_RO_COMPAT_USRQUOTA)
+ || OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA);
+
status = ocfs2_wait_on_mount(osb);
if (status < 0) {
goto bail;
}
- rm_quota = kcalloc(osb->max_slots, sizeof(int), GFP_NOFS);
- if (!rm_quota) {
- status = -ENOMEM;
- goto bail;
+ if (quota_enabled) {
+ rm_quota = kcalloc(osb->max_slots, sizeof(int), GFP_NOFS);
+ if (!rm_quota) {
+ status = -ENOMEM;
+ goto bail;
+ }
}
restart:
status = ocfs2_super_lock(osb, 1);
@@ -1422,9 +1430,14 @@ restart:
* then quota usage would be out of sync until some node takes
* the slot. So we remember which nodes need quota recovery
* and when everything else is done, we recover quotas. */
- for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++);
- if (i == rm_quota_used)
- rm_quota[rm_quota_used++] = slot_num;
+ if (quota_enabled) {
+ for (i = 0; i < rm_quota_used
+ && rm_quota[i] != slot_num; i++)
+ ;
+
+ if (i == rm_quota_used)
+ rm_quota[rm_quota_used++] = slot_num;
+ }
status = ocfs2_recover_node(osb, node_num, slot_num);
skip_recovery:
@@ -1452,16 +1465,19 @@ skip_recovery:
/* Now it is right time to recover quotas... We have to do this under
* superblock lock so that no one can start using the slot (and crash)
* before we recover it */
- for (i = 0; i < rm_quota_used; i++) {
- qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]);
- if (IS_ERR(qrec)) {
- status = PTR_ERR(qrec);
- mlog_errno(status);
- continue;
+ if (quota_enabled) {
+ for (i = 0; i < rm_quota_used; i++) {
+ qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]);
+ if (IS_ERR(qrec)) {
+ status = PTR_ERR(qrec);
+ mlog_errno(status);
+ continue;
+ }
+ ocfs2_queue_recovery_completion(osb->journal,
+ rm_quota[i],
+ NULL, NULL, qrec,
+ ORPHAN_NEED_TRUNCATE);
}
- ocfs2_queue_recovery_completion(osb->journal, rm_quota[i],
- NULL, NULL, qrec,
- ORPHAN_NEED_TRUNCATE);
}
ocfs2_super_unlock(osb, 1);
@@ -1483,7 +1499,8 @@ bail:
mutex_unlock(&osb->recovery_lock);
- kfree(rm_quota);
+ if (quota_enabled)
+ kfree(rm_quota);
/* no one is callint kthread_stop() for us so the kthread() api
* requires that we call do_exit(). And it isn't exported, but
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 7eb3b0a6347e..099016c9644e 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -25,6 +25,7 @@
#include "ocfs2_ioctl.h"
#include "alloc.h"
+#include "localalloc.h"
#include "aops.h"
#include "dlmglue.h"
#include "extent_map.h"
@@ -162,7 +163,7 @@ out:
* in some cases, we don't need to reserve clusters, just let data_ac
* be NULL.
*/
-static int ocfs2_lock_allocators_move_extents(struct inode *inode,
+static int ocfs2_lock_meta_allocator_move_extents(struct inode *inode,
struct ocfs2_extent_tree *et,
u32 clusters_to_move,
u32 extents_to_split,
@@ -192,13 +193,6 @@ static int ocfs2_lock_allocators_move_extents(struct inode *inode,
goto out;
}
- if (data_ac) {
- ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
- }
*credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el);
@@ -233,6 +227,7 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
struct ocfs2_refcount_tree *ref_tree = NULL;
u32 new_phys_cpos, new_len;
u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+ int need_free = 0;
if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) {
BUG_ON(!ocfs2_is_refcount_inode(inode));
@@ -257,10 +252,11 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
}
}
- ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1,
- &context->meta_ac,
- &context->data_ac,
- extra_blocks, &credits);
+ ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et,
+ *len, 1,
+ &context->meta_ac,
+ &context->data_ac,
+ extra_blocks, &credits);
if (ret) {
mlog_errno(ret);
goto out;
@@ -283,6 +279,21 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
}
}
+ /*
+ * Make sure ocfs2_reserve_cluster is called after
+ * __ocfs2_flush_truncate_log, otherwise, dead lock may happen.
+ *
+ * If ocfs2_reserve_cluster is called
+ * before __ocfs2_flush_truncate_log, dead lock on global bitmap
+ * may happen.
+ *
+ */
+ ret = ocfs2_reserve_clusters(osb, *len, &context->data_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock_mutex;
+ }
+
handle = ocfs2_start_trans(osb, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
@@ -308,6 +319,7 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
if (!partial) {
context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE;
ret = -ENOSPC;
+ need_free = 1;
goto out_commit;
}
}
@@ -332,6 +344,20 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
mlog_errno(ret);
out_commit:
+ if (need_free && context->data_ac) {
+ struct ocfs2_alloc_context *data_ac = context->data_ac;
+
+ if (context->data_ac->ac_which == OCFS2_AC_USE_LOCAL)
+ ocfs2_free_local_alloc_bits(osb, handle, data_ac,
+ new_phys_cpos, new_len);
+ else
+ ocfs2_free_clusters(handle,
+ data_ac->ac_inode,
+ data_ac->ac_bh,
+ ocfs2_clusters_to_blocks(osb->sb, new_phys_cpos),
+ new_len);
+ }
+
ocfs2_commit_trans(osb, handle);
out_unlock_mutex:
@@ -600,9 +626,10 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
}
}
- ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1,
- &context->meta_ac,
- NULL, extra_blocks, &credits);
+ ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et,
+ len, 1,
+ &context->meta_ac,
+ NULL, extra_blocks, &credits);
if (ret) {
mlog_errno(ret);
goto out;
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 7869622af22a..7a5ee145c733 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2946,6 +2946,7 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
if (map_end & (PAGE_SIZE - 1))
to = map_end & (PAGE_SIZE - 1);
+retry:
page = find_or_create_page(mapping, page_index, GFP_NOFS);
if (!page) {
ret = -ENOMEM;
@@ -2954,11 +2955,18 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
}
/*
- * In case PAGE_SIZE <= CLUSTER_SIZE, This page
- * can't be dirtied before we CoW it out.
+ * In case PAGE_SIZE <= CLUSTER_SIZE, we do not expect a dirty
+ * page, so write it back.
*/
- if (PAGE_SIZE <= OCFS2_SB(sb)->s_clustersize)
- BUG_ON(PageDirty(page));
+ if (PAGE_SIZE <= OCFS2_SB(sb)->s_clustersize) {
+ if (PageDirty(page)) {
+ /*
+ * write_on_page will unlock the page on return
+ */
+ ret = write_one_page(page);
+ goto retry;
+ }
+ }
if (!PageUptodate(page)) {
ret = block_read_full_page(page, ocfs2_get_block);
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index d6c350ba25b9..c4b029c43464 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -48,12 +48,6 @@ static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl";
*/
static struct ocfs2_stack_plugin *active_stack;
-inline int ocfs2_is_o2cb_active(void)
-{
- return !strcmp(active_stack->sp_name, OCFS2_STACK_PLUGIN_O2CB);
-}
-EXPORT_SYMBOL_GPL(ocfs2_is_o2cb_active);
-
static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name)
{
struct ocfs2_stack_plugin *p;
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index e3036e1790e8..f2dce10fae54 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -298,9 +298,6 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p
int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
-/* In ocfs2_downconvert_lock(), we need to know which stack we are using */
-int ocfs2_is_o2cb_active(void);
-
extern struct kset *ocfs2_kset;
#endif /* STACKGLUE_H */
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index ad72261ee3fe..50036f6e1f52 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -451,7 +451,8 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
* If this is the first iteration or the address is not within
* the previous entry, search for a matching entry.
*/
- if (!m || start < m->addr || start >= m->addr + m->size) {
+ if (!m || &m->list == &kclist_head || start < m->addr ||
+ start >= m->addr + m->size) {
list_for_each_entry(m, &kclist_head, list) {
if (start >= m->addr &&
start < m->addr + m->size)
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 48cdfc81fe10..32d8986c26fb 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -185,6 +185,7 @@ struct reiserfs_dentry_buf {
struct dir_context ctx;
struct dentry *xadir;
int count;
+ int err;
struct dentry *dentries[8];
};
@@ -207,6 +208,7 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen,
dentry = lookup_one_len(name, dbuf->xadir, namelen);
if (IS_ERR(dentry)) {
+ dbuf->err = PTR_ERR(dentry);
return PTR_ERR(dentry);
} else if (d_really_is_negative(dentry)) {
/* A directory entry exists, but no file? */
@@ -215,6 +217,7 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen,
"not found for file %pd.\n",
dentry, dbuf->xadir);
dput(dentry);
+ dbuf->err = -EIO;
return -EIO;
}
@@ -262,6 +265,10 @@ static int reiserfs_for_each_xattr(struct inode *inode,
err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx);
if (err)
break;
+ if (buf.err) {
+ err = buf.err;
+ break;
+ }
if (!buf.count)
break;
for (i = 0; !err && i < buf.count && buf.dentries[i]; i++) {
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 88ebc6102c7c..5657a20e0c59 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -757,7 +757,7 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
/*
* Interfaces that can be used by architecture code to keep track of
* memory type of pfn mappings specified by the remap_pfn_range,
- * vm_insert_pfn.
+ * vmf_insert_pfn.
*/
/*
@@ -773,7 +773,7 @@ static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
/*
* track_pfn_insert is called when a _new_ single pfn is established
- * by vm_insert_pfn().
+ * by vmf_insert_pfn().
*/
static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
pfn_t pfn)
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index acf5e8df3504..f58e97446abc 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -28,8 +28,8 @@
* The available bitmap operations and their rough meaning in the
* case that the bitmap is a single unsigned long are thus:
*
- * Note that nbits should be always a compile time evaluable constant.
- * Otherwise many inlines will generate horrible code.
+ * The generated code is more efficient when nbits is known at
+ * compile-time and at most BITS_PER_LONG.
*
* ::
*
@@ -204,38 +204,31 @@ extern int bitmap_print_to_pagebuf(bool list, char *buf,
#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
+/*
+ * The static inlines below do not handle constant nbits==0 correctly,
+ * so make such users (should any ever turn up) call the out-of-line
+ * versions.
+ */
#define small_const_nbits(nbits) \
- (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG)
+ (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG && (nbits) > 0)
static inline void bitmap_zero(unsigned long *dst, unsigned int nbits)
{
- if (small_const_nbits(nbits))
- *dst = 0UL;
- else {
- unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
- memset(dst, 0, len);
- }
+ unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
+ memset(dst, 0, len);
}
static inline void bitmap_fill(unsigned long *dst, unsigned int nbits)
{
- if (small_const_nbits(nbits))
- *dst = ~0UL;
- else {
- unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
- memset(dst, 0xff, len);
- }
+ unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
+ memset(dst, 0xff, len);
}
static inline void bitmap_copy(unsigned long *dst, const unsigned long *src,
unsigned int nbits)
{
- if (small_const_nbits(nbits))
- *dst = *src;
- else {
- unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
- memcpy(dst, src, len);
- }
+ unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
+ memcpy(dst, src, len);
}
/*
@@ -398,7 +391,7 @@ static __always_inline void bitmap_clear(unsigned long *map, unsigned int start,
}
static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src,
- unsigned int shift, int nbits)
+ unsigned int shift, unsigned int nbits)
{
if (small_const_nbits(nbits))
*dst = (*src & BITMAP_LAST_WORD_MASK(nbits)) >> shift;
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 4c92e3ba3e16..dde947083d4e 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -107,7 +107,7 @@ enum hmm_pfn_flag_e {
* HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory
* HMM_PFN_NONE: corresponding CPU page table entry is pte_none()
* HMM_PFN_SPECIAL: corresponding CPU page table entry is special; i.e., the
- * result of vm_insert_pfn() or vm_insert_page(). Therefore, it should not
+ * result of vmf_insert_pfn() or vm_insert_page(). Therefore, it should not
* be mirrored by a device, because the entry will never have HMM_PFN_VALID
* set and the pfn value is undefined.
*
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 6b68e345f0ca..087fd5f48c91 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -140,6 +140,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
pte_t *huge_pte_offset(struct mm_struct *mm,
unsigned long addr, unsigned long sz);
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep);
+void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
+ unsigned long *start, unsigned long *end);
struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
int write);
struct page *follow_huge_pd(struct vm_area_struct *vma,
@@ -170,6 +172,18 @@ static inline unsigned long hugetlb_total_pages(void)
return 0;
}
+static inline int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr,
+ pte_t *ptep)
+{
+ return 0;
+}
+
+static inline void adjust_range_if_pmd_sharing_possible(
+ struct vm_area_struct *vma,
+ unsigned long *start, unsigned long *end)
+{
+}
+
#define follow_hugetlb_page(m,v,p,vs,a,b,i,w,n) ({ BUG(); 0; })
#define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL)
#define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; })
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 3555d54bf79a..9a4258154b25 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -6,6 +6,7 @@
#include <linux/bitmap.h>
#include <linux/mm.h>
#include <linux/types.h>
+#include <linux/mm_types.h>
struct address_space;
struct fiemap_extent_info;
@@ -141,7 +142,8 @@ int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
bool *did_zero, const struct iomap_ops *ops);
int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
const struct iomap_ops *ops);
-int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops);
+vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf,
+ const struct iomap_ops *ops);
int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
loff_t start, loff_t len, const struct iomap_ops *ops);
loff_t iomap_seek_hole(struct inode *inode, loff_t offset,
diff --git a/include/linux/math64.h b/include/linux/math64.h
index 837f2f2d1d34..94af3d9c73e7 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -281,4 +281,6 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor)
}
#endif /* mul_u64_u32_div */
+#define DIV64_U64_ROUND_UP(ll, d) div64_u64((ll) + (d) - 1, (d))
+
#endif /* _LINUX_MATH64_H */
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 516920549378..58697ad64cd3 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -232,6 +232,8 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid))
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+int memblock_search_pfn_regions(unsigned long pfn);
+
/**
* for_each_free_mem_range - iterate through free memblock areas
* @i: u64 used as loop variable
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 652f602167df..4399cc3f00e4 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1268,10 +1268,11 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep);
void memcg_kmem_put_cache(struct kmem_cache *cachep);
int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
struct mem_cgroup *memcg);
+
+#ifdef CONFIG_MEMCG_KMEM
int memcg_kmem_charge(struct page *page, gfp_t gfp, int order);
void memcg_kmem_uncharge(struct page *page, int order);
-#ifdef CONFIG_MEMCG_KMEM
extern struct static_key_false memcg_kmem_enabled_key;
extern struct workqueue_struct *memcg_kmem_cache_wq;
@@ -1307,6 +1308,16 @@ extern int memcg_expand_shrinker_maps(int new_id);
extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
int nid, int shrinker_id);
#else
+
+static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
+{
+ return 0;
+}
+
+static inline void memcg_kmem_uncharge(struct page *page, int order)
+{
+}
+
#define for_each_memcg_cache_index(_idx) \
for (; NULL; )
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a61ebe8ad4ca..bd5e2469b637 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2478,11 +2478,11 @@ struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t);
int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
-int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
+vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn);
-int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
+vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, pgprot_t pgprot);
-int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn);
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
unsigned long addr, pfn_t pfn);
@@ -2501,32 +2501,6 @@ static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma,
return VM_FAULT_NOPAGE;
}
-static inline vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma,
- unsigned long addr, pfn_t pfn)
-{
- int err = vm_insert_mixed(vma, addr, pfn);
-
- if (err == -ENOMEM)
- return VM_FAULT_OOM;
- if (err < 0 && err != -EBUSY)
- return VM_FAULT_SIGBUS;
-
- return VM_FAULT_NOPAGE;
-}
-
-static inline vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma,
- unsigned long addr, unsigned long pfn)
-{
- int err = vm_insert_pfn(vma, addr, pfn);
-
- if (err == -ENOMEM)
- return VM_FAULT_OOM;
- if (err < 0 && err != -EBUSY)
- return VM_FAULT_SIGBUS;
-
- return VM_FAULT_NOPAGE;
-}
-
static inline vm_fault_t vmf_error(int err)
{
if (err == -ENOMEM)
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 133ba78820ee..9893a6432adf 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -2,7 +2,6 @@
#ifndef _LINUX_MMU_NOTIFIER_H
#define _LINUX_MMU_NOTIFIER_H
-#include <linux/types.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/mm_types.h>
@@ -11,9 +10,6 @@
struct mmu_notifier;
struct mmu_notifier_ops;
-/* mmu_notifier_ops flags */
-#define MMU_INVALIDATE_DOES_NOT_BLOCK (0x01)
-
#ifdef CONFIG_MMU_NOTIFIER
/*
@@ -31,15 +27,6 @@ struct mmu_notifier_mm {
struct mmu_notifier_ops {
/*
- * Flags to specify behavior of callbacks for this MMU notifier.
- * Used to determine which context an operation may be called.
- *
- * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_range_* callbacks do not
- * block
- */
- int flags;
-
- /*
* Called either by mmu_notifier_unregister or when the mm is
* being destroyed by exit_mmap, always before all pages are
* freed. This can run concurrently with other mmu notifier
@@ -153,7 +140,9 @@ struct mmu_notifier_ops {
*
* If blockable argument is set to false then the callback cannot
* sleep and has to return with -EAGAIN. 0 should be returned
- * otherwise.
+ * otherwise. Please note that if invalidate_range_start approves
+ * a non-blocking behavior then the same applies to
+ * invalidate_range_end.
*
*/
int (*invalidate_range_start)(struct mmu_notifier *mn,
@@ -181,10 +170,6 @@ struct mmu_notifier_ops {
* Note that this function might be called with just a sub-range
* of what was passed to invalidate_range_start()/end(), if
* called between those functions.
- *
- * If this callback cannot block, and invalidate_range_{start,end}
- * cannot block, mmu_notifier_ops.flags should have
- * MMU_INVALIDATE_DOES_NOT_BLOCK set.
*/
void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm,
unsigned long start, unsigned long end);
@@ -239,7 +224,6 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
bool only_end);
extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
unsigned long start, unsigned long end);
-extern bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
static inline void mmu_notifier_release(struct mm_struct *mm)
{
@@ -493,11 +477,6 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
{
}
-static inline bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
-{
- return false;
-}
-
static inline void mmu_notifier_mm_init(struct mm_struct *mm)
{
}
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 1e22d96734e0..f09d27cb3403 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1248,6 +1248,8 @@ static inline int pfn_valid(unsigned long pfn)
return 0;
return valid_section(__nr_to_section(pfn_to_section_nr(pfn)));
}
+
+#define next_valid_pfn(pfn) (pfn + 1)
#endif
static inline int pfn_present(unsigned long pfn)
@@ -1272,7 +1274,16 @@ static inline int pfn_present(unsigned long pfn)
#define pfn_to_nid(pfn) (0)
#endif
+#ifdef CONFIG_HAVE_MEMBLOCK_PFN_VALID
+extern unsigned long memblock_next_valid_pfn(unsigned long pfn);
+#define next_valid_pfn(pfn) memblock_next_valid_pfn(pfn)
+
+extern int pfn_valid_region(ulong pfn);
+#define early_pfn_valid(pfn) pfn_valid_region(pfn)
+#else
#define early_pfn_valid(pfn) pfn_valid(pfn)
+#endif /*CONFIG_HAVE_ARCH_PFN_VALID*/
+
void sparse_init(void);
#else
#define sparse_init() do {} while (0)
@@ -1294,6 +1305,11 @@ struct mminit_pfnnid_cache {
#define early_pfn_valid(pfn) (1)
#endif
+/* fallback to default definitions*/
+#ifndef next_valid_pfn
+#define next_valid_pfn(pfn) (pfn + 1)
+#endif
+
void memory_present(int nid, unsigned long start, unsigned long end);
/*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a817d82b267e..4de634adc601 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1194,6 +1194,13 @@ struct task_struct {
void *security;
#endif
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ unsigned long getblk_stamp;
+ unsigned int getblk_executed;
+ unsigned int getblk_bh_count;
+ unsigned long getblk_bh_state;
+#endif
+
#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
unsigned long lowest_stack;
unsigned long prev_lowest_stack;
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 8e2c11e692ba..ca7c6307bda7 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -172,8 +172,9 @@ enum {
SWP_PAGE_DISCARD = (1 << 9), /* freed swap page-cluster discards */
SWP_STABLE_WRITES = (1 << 10), /* no overwrite PG_writeback pages */
SWP_SYNCHRONOUS_IO = (1 << 11), /* synchronous IO is efficient */
+ SWP_VALID = (1 << 12), /* swap is valid to be operated on? */
/* add others here before... */
- SWP_SCANNING = (1 << 12), /* refcount in scan_swap_map */
+ SWP_SCANNING = (1 << 13), /* refcount in scan_swap_map */
};
#define SWAP_CLUSTER_MAX 32UL
@@ -460,7 +461,7 @@ extern unsigned int count_swap_pages(int, int);
extern sector_t map_swap_page(struct page *, struct block_device **);
extern sector_t swapdev_block(int, pgoff_t);
extern int page_swapcount(struct page *);
-extern int __swap_count(struct swap_info_struct *si, swp_entry_t entry);
+extern int __swap_count(swp_entry_t entry);
extern int __swp_swapcount(swp_entry_t entry);
extern int swp_swapcount(swp_entry_t entry);
extern struct swap_info_struct *page_swap_info(struct page *);
@@ -470,6 +471,12 @@ extern int try_to_free_swap(struct page *);
struct backing_dev_info;
extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
extern void exit_swap_address_space(unsigned int type);
+extern struct swap_info_struct *get_swap_device(swp_entry_t entry);
+
+static inline void put_swap_device(struct swap_info_struct *si)
+{
+ preempt_enable();
+}
#else /* CONFIG_SWAP */
@@ -575,7 +582,7 @@ static inline int page_swapcount(struct page *page)
return 0;
}
-static inline int __swap_count(struct swap_info_struct *si, swp_entry_t entry)
+static inline int __swap_count(swp_entry_t entry)
{
return 0;
}
diff --git a/init/do_mounts.c b/init/do_mounts.c
index d95435fd37b5..ea9f18047087 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -168,6 +168,24 @@ done:
}
return res;
}
+
+/**
+ * match_dev_by_label - callback for finding a partition using its label
+ * @dev: device passed in by the caller
+ * @data: opaque pointer to the label to match
+ *
+ * Returns 1 if the device matches, and 0 otherwise.
+ */
+static int match_dev_by_label(struct device *dev, const void *data)
+{
+ const char *label = data;
+ struct hd_struct *part = dev_to_part(dev);
+
+ if (part->info && !strcmp(label, part->info->volname))
+ return 1;
+
+ return 0;
+}
#endif
/*
@@ -191,6 +209,8 @@ done:
* a partition with a known unique id.
* 8) <major>:<minor> major and minor number of the device separated by
* a colon.
+ * 9) PARTLABEL=<name> with name being the GPT partition label.
+ * MSDOS partitions do not support labels!
*
* If name doesn't have fall into the categories above, we return (0,0).
* block_class is used to check if something is a disk name. If the disk
@@ -212,6 +232,17 @@ dev_t name_to_dev_t(const char *name)
if (!res)
goto fail;
goto done;
+ } else if (strncmp(name, "PARTLABEL=", 10) == 0) {
+ struct device *dev;
+
+ dev = class_find_device(&block_class, NULL, name + 10,
+ &match_dev_by_label);
+ if (!dev)
+ goto fail;
+
+ res = dev->devt;
+ put_device(dev);
+ goto done;
}
#endif
diff --git a/kernel/fork.c b/kernel/fork.c
index 2264904fd896..e398f24f7c9a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -224,9 +224,14 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
return s->addr;
}
+ /*
+ * Allocated stacks are cached and later reused by new threads,
+ * so memcg accounting is performed manually on assigning/releasing
+ * stacks to tasks. Drop __GFP_ACCOUNT.
+ */
stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
VMALLOC_START, VMALLOC_END,
- THREADINFO_GFP,
+ THREADINFO_GFP & ~__GFP_ACCOUNT,
PAGE_KERNEL,
0, node, __builtin_return_address(0));
@@ -249,9 +254,19 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
static inline void free_thread_stack(struct task_struct *tsk)
{
#ifdef CONFIG_VMAP_STACK
- if (task_stack_vm_area(tsk)) {
+ struct vm_struct *vm = task_stack_vm_area(tsk);
+
+ if (vm) {
int i;
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+ mod_memcg_page_state(vm->pages[i],
+ MEMCG_KERNEL_STACK_KB,
+ -(int)(PAGE_SIZE / 1024));
+
+ memcg_kmem_uncharge(vm->pages[i], 0);
+ }
+
for (i = 0; i < NR_CACHED_STACKS; i++) {
if (this_cpu_cmpxchg(cached_stacks[i],
NULL, tsk->stack_vm_area) != NULL)
@@ -352,10 +367,6 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
NR_KERNEL_STACK_KB,
PAGE_SIZE / 1024 * account);
}
-
- /* All stack pages belong to the same memcg. */
- mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
- account * (THREAD_SIZE / 1024));
} else {
/*
* All stack pages are in the same zone and belong to the
@@ -371,6 +382,35 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
}
}
+static int memcg_charge_kernel_stack(struct task_struct *tsk)
+{
+#ifdef CONFIG_VMAP_STACK
+ struct vm_struct *vm = task_stack_vm_area(tsk);
+ int ret;
+
+ if (vm) {
+ int i;
+
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+ /*
+ * If memcg_kmem_charge() fails, page->mem_cgroup
+ * pointer is NULL, and both memcg_kmem_uncharge()
+ * and mod_memcg_page_state() in free_thread_stack()
+ * will ignore this page. So it's safe.
+ */
+ ret = memcg_kmem_charge(vm->pages[i], GFP_KERNEL, 0);
+ if (ret)
+ return ret;
+
+ mod_memcg_page_state(vm->pages[i],
+ MEMCG_KERNEL_STACK_KB,
+ PAGE_SIZE / 1024);
+ }
+ }
+#endif
+ return 0;
+}
+
static void release_task_stack(struct task_struct *tsk)
{
if (WARN_ON(tsk->state != TASK_DEAD))
@@ -808,6 +848,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
if (!stack)
goto free_tsk;
+ if (memcg_charge_kernel_stack(tsk))
+ goto free_stack;
+
stack_vm_area = task_stack_vm_area(tsk);
err = arch_dup_task_struct(tsk, orig);
diff --git a/kernel/pid.c b/kernel/pid.c
index de1cfc4f75a2..cdf63e53a014 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -195,7 +195,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
idr_preload_end();
if (nr < 0) {
- retval = nr;
+ retval = (nr == -ENOSPC) ? -EAGAIN : nr;
goto out_free;
}
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 4966c4fbe7f7..3e50cc93572c 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2034,6 +2034,12 @@ config IO_STRICT_DEVMEM
If in doubt, say Y.
+config DEBUG_AID_FOR_SYZBOT
+ bool "Additional debug code for syzbot"
+ default n
+ help
+ This option is intended for testing by syzbot.
+
source "arch/$(SRCARCH)/Kconfig.debug"
endmenu # Kernel hacking
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 2fd07f6df0b8..eead55aa7170 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -13,6 +13,7 @@
#include <linux/bitops.h>
#include <linux/bug.h>
#include <linux/kernel.h>
+#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/uaccess.h>
@@ -36,11 +37,6 @@
* carefully filter out these unused bits from impacting their
* results.
*
- * These operations actually hold to a slightly stronger rule:
- * if you don't input any bitmaps to these ops that have some
- * unused bits set, then they won't output any set unused bits
- * in output bitmaps.
- *
* The byte ordering of bitmaps is more natural on little
* endian architectures. See the big-endian headers
* include/asm-ppc64/bitops.h and include/asm-s390/bitops.h
@@ -466,20 +462,18 @@ EXPORT_SYMBOL(bitmap_parse_user);
* ranges if list is specified or hex digits grouped into comma-separated
* sets of 8 digits/set. Returns the number of characters written to buf.
*
- * It is assumed that @buf is a pointer into a PAGE_SIZE area and that
- * sufficient storage remains at @buf to accommodate the
- * bitmap_print_to_pagebuf() output.
+ * It is assumed that @buf is a pointer into a PAGE_SIZE, page-aligned
+ * area and that sufficient storage remains at @buf to accommodate the
+ * bitmap_print_to_pagebuf() output. Returns the number of characters
+ * actually printed to @buf, excluding terminating '\0'.
*/
int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp,
int nmaskbits)
{
- ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf;
- int n = 0;
+ ptrdiff_t len = PAGE_SIZE - offset_in_page(buf);
- if (len > 1)
- n = list ? scnprintf(buf, len, "%*pbl\n", nmaskbits, maskp) :
- scnprintf(buf, len, "%*pb\n", nmaskbits, maskp);
- return n;
+ return list ? scnprintf(buf, len, "%*pbl\n", nmaskbits, maskp) :
+ scnprintf(buf, len, "%*pb\n", nmaskbits, maskp);
}
EXPORT_SYMBOL(bitmap_print_to_pagebuf);
diff --git a/lib/parser.c b/lib/parser.c
index 3278958b472a..618c36ec8efe 100644
--- a/lib/parser.c
+++ b/lib/parser.c
@@ -166,13 +166,10 @@ static int match_u64int(substring_t *s, u64 *result, int base)
char *buf;
int ret;
u64 val;
- size_t len = s->to - s->from;
- buf = kmalloc(len + 1, GFP_KERNEL);
+ buf = match_strdup(s);
if (!buf)
return -ENOMEM;
- memcpy(buf, s->from, len);
- buf[len] = '\0';
ret = kstrtoull(buf, base, &val);
if (!ret)
@@ -327,10 +324,6 @@ EXPORT_SYMBOL(match_strlcpy);
*/
char *match_strdup(const substring_t *s)
{
- size_t sz = s->to - s->from + 1;
- char *p = kmalloc(sz, GFP_KERNEL);
- if (p)
- match_strlcpy(p, s, sz);
- return p;
+ return kmemdup_nul(s->from, s->to - s->from, GFP_KERNEL);
}
EXPORT_SYMBOL(match_strdup);
diff --git a/mm/Kconfig b/mm/Kconfig
index a550635ea5c3..7bf074bf79e5 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -136,6 +136,9 @@ config HAVE_MEMBLOCK_NODE_MAP
config HAVE_MEMBLOCK_PHYS_MAP
bool
+config HAVE_MEMBLOCK_PFN_VALID
+ bool
+
config HAVE_GENERIC_GUP
bool
@@ -637,6 +640,7 @@ config DEFERRED_STRUCT_PAGE_INIT
depends on NO_BOOTMEM
depends on SPARSEMEM
depends on !NEED_PER_CPU_KM
+ depends on 64BIT
help
Ordinarily all struct pages are initialised during early boot in a
single thread. On very large machines this can take a considerable
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 8a8bb8796c6c..a22a94b40de1 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -221,11 +221,46 @@ static ssize_t stable_pages_required_show(struct device *dev,
}
static DEVICE_ATTR_RO(stable_pages_required);
+static ssize_t strictlimit_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+ unsigned int val;
+ ssize_t ret;
+
+ ret = kstrtouint(buf, 10, &val);
+ if (ret < 0)
+ return ret;
+
+ switch (val) {
+ case 0:
+ bdi->capabilities &= ~BDI_CAP_STRICTLIMIT;
+ break;
+ case 1:
+ bdi->capabilities |= BDI_CAP_STRICTLIMIT;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return count;
+}
+static ssize_t strictlimit_show(struct device *dev,
+ struct device_attribute *attr, char *page)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+
+ return snprintf(page, PAGE_SIZE-1, "%d\n",
+ !!(bdi->capabilities & BDI_CAP_STRICTLIMIT));
+}
+static DEVICE_ATTR_RW(strictlimit);
+
static struct attribute *bdi_dev_attrs[] = {
&dev_attr_read_ahead_kb.attr,
&dev_attr_min_ratio.attr,
&dev_attr_max_ratio.attr,
&dev_attr_stable_pages_required.attr,
+ &dev_attr_strictlimit.attr,
NULL,
};
ATTRIBUTE_GROUPS(bdi_dev);
diff --git a/mm/filemap.c b/mm/filemap.c
index 52517f28e6f4..de6fed2a0815 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2748,9 +2748,9 @@ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
return generic_file_mmap(file, vma);
}
#else
-int filemap_page_mkwrite(struct vm_fault *vmf)
+vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
- return -ENOSYS;
+ return VM_FAULT_SIGBUS;
}
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3c21775f196b..9530d7c6f9b3 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3326,8 +3326,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
struct page *page;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
- const unsigned long mmun_start = start; /* For mmu_notifiers */
- const unsigned long mmun_end = end; /* For mmu_notifiers */
+ unsigned long mmun_start = start; /* For mmu_notifiers */
+ unsigned long mmun_end = end; /* For mmu_notifiers */
WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(start & ~huge_page_mask(h));
@@ -3339,6 +3339,11 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
*/
tlb_remove_check_page_size_change(tlb, sz);
tlb_start_vma(tlb, vma);
+
+ /*
+ * If sharing possible, alert mmu notifiers of worst case.
+ */
+ adjust_range_if_pmd_sharing_possible(vma, &mmun_start, &mmun_end);
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
address = start;
for (; address < end; address += sz) {
@@ -3349,6 +3354,10 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
ptl = huge_pte_lock(h, mm, ptep);
if (huge_pmd_unshare(mm, &address, ptep)) {
spin_unlock(ptl);
+ /*
+ * We just unmapped a page of PMDs by clearing a PUD.
+ * The caller's TLB flush range should cover this area.
+ */
continue;
}
@@ -3431,12 +3440,23 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
{
struct mm_struct *mm;
struct mmu_gather tlb;
+ unsigned long tlb_start = start;
+ unsigned long tlb_end = end;
+
+ /*
+ * If shared PMDs were possibly used within this vma range, adjust
+ * start/end for worst case tlb flushing.
+ * Note that we can not be sure if PMDs are shared until we try to
+ * unmap pages. However, we want to make sure TLB flushing covers
+ * the largest possible range.
+ */
+ adjust_range_if_pmd_sharing_possible(vma, &tlb_start, &tlb_end);
mm = vma->vm_mm;
- tlb_gather_mmu(&tlb, mm, start, end);
+ tlb_gather_mmu(&tlb, mm, tlb_start, tlb_end);
__unmap_hugepage_range(&tlb, vma, start, end, ref_page);
- tlb_finish_mmu(&tlb, start, end);
+ tlb_finish_mmu(&tlb, tlb_start, tlb_end);
}
/*
@@ -4298,11 +4318,21 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
pte_t pte;
struct hstate *h = hstate_vma(vma);
unsigned long pages = 0;
+ unsigned long f_start = start;
+ unsigned long f_end = end;
+ bool shared_pmd = false;
+
+ /*
+ * In the case of shared PMDs, the area to flush could be beyond
+ * start/end. Set f_start/f_end to cover the maximum possible
+ * range if PMD sharing is possible.
+ */
+ adjust_range_if_pmd_sharing_possible(vma, &f_start, &f_end);
BUG_ON(address >= end);
- flush_cache_range(vma, address, end);
+ flush_cache_range(vma, f_start, f_end);
- mmu_notifier_invalidate_range_start(mm, start, end);
+ mmu_notifier_invalidate_range_start(mm, f_start, f_end);
i_mmap_lock_write(vma->vm_file->f_mapping);
for (; address < end; address += huge_page_size(h)) {
spinlock_t *ptl;
@@ -4313,6 +4343,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
if (huge_pmd_unshare(mm, &address, ptep)) {
pages++;
spin_unlock(ptl);
+ shared_pmd = true;
continue;
}
pte = huge_ptep_get(ptep);
@@ -4348,9 +4379,13 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
* Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
* may have cleared our pud entry and done put_page on the page table:
* once we release i_mmap_rwsem, another task can do the final put_page
- * and that page table be reused and filled with junk.
+ * and that page table be reused and filled with junk. If we actually
+ * did unshare a page of pmds, flush the range corresponding to the pud.
*/
- flush_hugetlb_tlb_range(vma, start, end);
+ if (shared_pmd)
+ flush_hugetlb_tlb_range(vma, f_start, f_end);
+ else
+ flush_hugetlb_tlb_range(vma, start, end);
/*
* No need to call mmu_notifier_invalidate_range() we are downgrading
* page table protection not changing it to point to a new page.
@@ -4358,7 +4393,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
* See Documentation/vm/mmu_notifier.rst
*/
i_mmap_unlock_write(vma->vm_file->f_mapping);
- mmu_notifier_invalidate_range_end(mm, start, end);
+ mmu_notifier_invalidate_range_end(mm, f_start, f_end);
return pages << h->order;
}
@@ -4537,6 +4572,9 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
return saddr;
}
+#define _range_in_vma(vma, start, end) \
+ ((vma)->vm_start <= (start) && (end) <= (vma)->vm_end)
+
static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
{
unsigned long base = addr & PUD_MASK;
@@ -4545,13 +4583,41 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
/*
* check on proper vm_flags and page table alignment
*/
- if (vma->vm_flags & VM_MAYSHARE &&
- vma->vm_start <= base && end <= vma->vm_end)
+ if (vma->vm_flags & VM_MAYSHARE && _range_in_vma(vma, base, end))
return true;
return false;
}
/*
+ * Determine if start,end range within vma could be mapped by shared pmd.
+ * If yes, adjust start and end to cover range associated with possible
+ * shared pmd mappings.
+ */
+void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
+ unsigned long *start, unsigned long *end)
+{
+ unsigned long check_addr = *start;
+
+ if (!(vma->vm_flags & VM_MAYSHARE))
+ return;
+
+ for (check_addr = *start; check_addr < *end; check_addr += PUD_SIZE) {
+ unsigned long a_start = check_addr & PUD_MASK;
+ unsigned long a_end = a_start + PUD_SIZE;
+
+ /*
+ * If sharing is possible, adjust start/end if necessary.
+ */
+ if (_range_in_vma(vma, a_start, a_end)) {
+ if (a_start < *start)
+ *start = a_start;
+ if (a_end > *end)
+ *end = a_end;
+ }
+ }
+}
+
+/*
* Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
* and returns the corresponding pte. While this is not necessary for the
* !shared pmd case because we can allocate the pmd later as well, it makes the
@@ -4648,6 +4714,11 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
return 0;
}
+
+void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
+ unsigned long *start, unsigned long *end)
+{
+}
#define want_pmd_share() (0)
#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 17dd883198ae..4f7e4b5a2f08 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -86,6 +86,7 @@
#include <linux/seq_file.h>
#include <linux/cpumask.h>
#include <linux/spinlock.h>
+#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/stacktrace.h>
@@ -181,6 +182,7 @@ struct kmemleak_object {
/* flag set to not scan the object */
#define OBJECT_NO_SCAN (1 << 2)
+#define HEX_PREFIX " "
/* number of bytes to print per line; must be 16 or 32 */
#define HEX_ROW_SIZE 16
/* number of bytes to print at a time (1, 2, 4, 8) */
@@ -235,6 +237,9 @@ static int kmemleak_skip_disable;
/* If there are leaks that can be reported */
static bool kmemleak_found_leaks;
+static bool kmemleak_verbose;
+module_param_named(verbose, kmemleak_verbose, bool, 0600);
+
/*
* Early object allocation/freeing logging. Kmemleak is initialized after the
* kernel allocator. However, both the kernel allocator and kmemleak may
@@ -299,6 +304,25 @@ static void kmemleak_disable(void);
kmemleak_disable(); \
} while (0)
+#define warn_or_seq_printf(seq, fmt, ...) do { \
+ if (seq) \
+ seq_printf(seq, fmt, ##__VA_ARGS__); \
+ else \
+ pr_warn(fmt, ##__VA_ARGS__); \
+} while (0)
+
+static void warn_or_seq_hex_dump(struct seq_file *seq, int prefix_type,
+ int rowsize, int groupsize, const void *buf,
+ size_t len, bool ascii)
+{
+ if (seq)
+ seq_hex_dump(seq, HEX_PREFIX, prefix_type, rowsize, groupsize,
+ buf, len, ascii);
+ else
+ print_hex_dump(KERN_WARNING, pr_fmt(HEX_PREFIX), prefix_type,
+ rowsize, groupsize, buf, len, ascii);
+}
+
/*
* Printing of the objects hex dump to the seq file. The number of lines to be
* printed is limited to HEX_MAX_LINES to prevent seq file spamming. The
@@ -314,10 +338,10 @@ static void hex_dump_object(struct seq_file *seq,
/* limit the number of lines to HEX_MAX_LINES */
len = min_t(size_t, object->size, HEX_MAX_LINES * HEX_ROW_SIZE);
- seq_printf(seq, " hex dump (first %zu bytes):\n", len);
+ warn_or_seq_printf(seq, " hex dump (first %zu bytes):\n", len);
kasan_disable_current();
- seq_hex_dump(seq, " ", DUMP_PREFIX_NONE, HEX_ROW_SIZE,
- HEX_GROUP_SIZE, ptr, len, HEX_ASCII);
+ warn_or_seq_hex_dump(seq, DUMP_PREFIX_NONE, HEX_ROW_SIZE,
+ HEX_GROUP_SIZE, ptr, len, HEX_ASCII);
kasan_enable_current();
}
@@ -365,17 +389,17 @@ static void print_unreferenced(struct seq_file *seq,
int i;
unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies);
- seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n",
+ warn_or_seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n",
object->pointer, object->size);
- seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n",
+ warn_or_seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n",
object->comm, object->pid, object->jiffies,
msecs_age / 1000, msecs_age % 1000);
hex_dump_object(seq, object);
- seq_printf(seq, " backtrace:\n");
+ warn_or_seq_printf(seq, " backtrace:\n");
for (i = 0; i < object->trace_len; i++) {
void *ptr = (void *)object->trace[i];
- seq_printf(seq, " [<%p>] %pS\n", ptr, ptr);
+ warn_or_seq_printf(seq, " [<%p>] %pS\n", ptr, ptr);
}
}
@@ -1598,6 +1622,10 @@ static void kmemleak_scan(void)
if (unreferenced_object(object) &&
!(object->flags & OBJECT_REPORTED)) {
object->flags |= OBJECT_REPORTED;
+
+ if (kmemleak_verbose)
+ print_unreferenced(NULL, object);
+
new_leaks++;
}
spin_unlock_irqrestore(&object->lock, flags);
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 5b30625fd365..89349a0276de 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/list_lru.h>
+#include <linux/prefetch.h>
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/memcontrol.h>
@@ -154,6 +155,12 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
struct list_lru_node *nlru = &lru->node[nid];
struct list_lru_one *l;
+ /*
+ * Prefetch the neighboring list entries to reduce lock hold time.
+ */
+ prefetchw(item->prev);
+ prefetchw(item->next);
+
spin_lock(&nlru->lock);
if (!list_empty(item)) {
l = list_lru_from_kmem(nlru, item, NULL);
diff --git a/mm/memblock.c b/mm/memblock.c
index 237944479d25..b9f593da7317 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1231,6 +1231,81 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
}
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+#ifdef CONFIG_HAVE_MEMBLOCK_PFN_VALID
+static int early_region_idx __initdata_memblock = -1;
+unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn)
+{
+ struct memblock_type *type = &memblock.memory;
+ struct memblock_region *regions = type->regions;
+ uint right = type->cnt;
+ uint mid, left = 0;
+ unsigned long start_pfn, end_pfn, next_start_pfn;
+ phys_addr_t addr = PFN_PHYS(++pfn);
+
+ /* fast path, return pfn+1 if next pfn is in the same region */
+ if (early_region_idx != -1) {
+ start_pfn = PFN_DOWN(regions[early_region_idx].base);
+ end_pfn = PFN_DOWN(regions[early_region_idx].base +
+ regions[early_region_idx].size);
+
+ if (pfn >= start_pfn && pfn < end_pfn)
+ return pfn;
+
+ early_region_idx++;
+ next_start_pfn = PFN_DOWN(regions[early_region_idx].base);
+
+ if (pfn >= end_pfn && pfn <= next_start_pfn)
+ return next_start_pfn;
+ }
+
+ /* slow path, do the binary searching */
+ do {
+ mid = (right + left) / 2;
+
+ if (addr < regions[mid].base)
+ right = mid;
+ else if (addr >= (regions[mid].base + regions[mid].size))
+ left = mid + 1;
+ else {
+ early_region_idx = mid;
+ return pfn;
+ }
+ } while (left < right);
+
+ if (right == type->cnt)
+ return -1UL;
+
+ early_region_idx = right;
+
+ return PHYS_PFN(regions[early_region_idx].base);
+}
+EXPORT_SYMBOL(memblock_next_valid_pfn);
+
+int pfn_valid_region(ulong pfn)
+{
+ ulong start_pfn, end_pfn;
+ struct memblock_type *type = &memblock.memory;
+ struct memblock_region *regions = type->regions;
+
+ if (early_region_idx != -1) {
+ start_pfn = PFN_DOWN(regions[early_region_idx].base);
+ end_pfn = PFN_DOWN(regions[early_region_idx].base +
+ regions[early_region_idx].size);
+
+ if (pfn >= start_pfn && pfn < end_pfn)
+ return !memblock_is_nomap(
+ &regions[early_region_idx]);
+ }
+
+ early_region_idx = memblock_search_pfn_regions(pfn);
+ if (early_region_idx == -1)
+ return false;
+
+ return !memblock_is_nomap(&regions[early_region_idx]);
+}
+EXPORT_SYMBOL(pfn_valid_region);
+#endif /*CONFIG_HAVE_MEMBLOCK_PFN_VALID*/
+
static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
phys_addr_t align, phys_addr_t start,
phys_addr_t end, int nid,
@@ -1718,6 +1793,12 @@ static int __init_memblock memblock_search(struct memblock_type *type, phys_addr
return -1;
}
+/* search memblock with the input pfn, return the region idx */
+int __init_memblock memblock_search_pfn_regions(unsigned long pfn)
+{
+ return memblock_search(&memblock.memory, PFN_PHYS(pfn));
+}
+
bool __init memblock_is_reserved(phys_addr_t addr)
{
return memblock_search(&memblock.reserved, addr) != -1;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e79cb59552d9..fcec9b39e2a3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4573,6 +4573,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
memcg_offline_kmem(memcg);
wb_memcg_offline(memcg);
+ drain_all_stock(memcg);
+
mem_cgroup_id_put(memcg);
}
diff --git a/mm/memory.c b/mm/memory.c
index 21a5e6e4758b..96caf5fe7531 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1520,19 +1520,16 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
}
EXPORT_SYMBOL(vm_insert_page);
-static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
+static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn, pgprot_t prot, bool mkwrite)
{
struct mm_struct *mm = vma->vm_mm;
- int retval;
pte_t *pte, entry;
spinlock_t *ptl;
- retval = -ENOMEM;
pte = get_locked_pte(mm, addr, &ptl);
if (!pte)
- goto out;
- retval = -EBUSY;
+ return VM_FAULT_OOM;
if (!pte_none(*pte)) {
if (mkwrite) {
/*
@@ -1565,56 +1562,32 @@ out_mkwrite:
set_pte_at(mm, addr, pte, entry);
update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
- retval = 0;
out_unlock:
pte_unmap_unlock(pte, ptl);
-out:
- return retval;
-}
-
-/**
- * vm_insert_pfn - insert single pfn into user vma
- * @vma: user vma to map to
- * @addr: target user address of this page
- * @pfn: source kernel pfn
- *
- * Similar to vm_insert_page, this allows drivers to insert individual pages
- * they've allocated into a user vma. Same comments apply.
- *
- * This function should only be called from a vm_ops->fault handler, and
- * in that case the handler should return NULL.
- *
- * vma cannot be a COW mapping.
- *
- * As this is called only for pages that do not currently exist, we
- * do not need to flush old virtual caches or the TLB.
- */
-int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
- unsigned long pfn)
-{
- return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
+ return VM_FAULT_NOPAGE;
}
-EXPORT_SYMBOL(vm_insert_pfn);
/**
- * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot
+ * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
* @vma: user vma to map to
* @addr: target user address of this page
* @pfn: source kernel pfn
* @pgprot: pgprot flags for the inserted page
*
- * This is exactly like vm_insert_pfn, except that it allows drivers to
+ * This is exactly like vmf_insert_pfn(), except that it allows drivers to
* to override pgprot on a per-page basis.
*
* This only makes sense for IO mappings, and it makes no sense for
- * cow mappings. In general, using multiple vmas is preferable;
- * vm_insert_pfn_prot should only be used if using multiple VMAs is
+ * COW mappings. In general, using multiple vmas is preferable;
+ * vmf_insert_pfn_prot should only be used if using multiple VMAs is
* impractical.
+ *
+ * Context: Process context. May allocate using %GFP_KERNEL.
+ * Return: vm_fault_t value.
*/
-int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
+vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, pgprot_t pgprot)
{
- int ret;
/*
* Technically, architectures with pte_special can avoid all these
* restrictions (same for remap_pfn_range). However we would like
@@ -1628,19 +1601,44 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
if (addr < vma->vm_start || addr >= vma->vm_end)
- return -EFAULT;
+ return VM_FAULT_SIGBUS;
if (!pfn_modify_allowed(pfn, pgprot))
- return -EACCES;
+ return VM_FAULT_SIGBUS;
track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
- ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
+ return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
false);
+}
+EXPORT_SYMBOL(vmf_insert_pfn_prot);
- return ret;
+/**
+ * vmf_insert_pfn - insert single pfn into user vma
+ * @vma: user vma to map to
+ * @addr: target user address of this page
+ * @pfn: source kernel pfn
+ *
+ * Similar to vm_insert_page, this allows drivers to insert individual pages
+ * they've allocated into a user vma. Same comments apply.
+ *
+ * This function should only be called from a vm_ops->fault handler, and
+ * in that case the handler should return the result of this function.
+ *
+ * vma cannot be a COW mapping.
+ *
+ * As this is called only for pages that do not currently exist, we
+ * do not need to flush old virtual caches or the TLB.
+ *
+ * Context: Process context. May allocate using %GFP_KERNEL.
+ * Return: vm_fault_t value.
+ */
+vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn)
+{
+ return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
}
-EXPORT_SYMBOL(vm_insert_pfn_prot);
+EXPORT_SYMBOL(vmf_insert_pfn);
static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
{
@@ -1656,20 +1654,21 @@ static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
return false;
}
-static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn, bool mkwrite)
+static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
+ unsigned long addr, pfn_t pfn, bool mkwrite)
{
pgprot_t pgprot = vma->vm_page_prot;
+ int err;
BUG_ON(!vm_mixed_ok(vma, pfn));
if (addr < vma->vm_start || addr >= vma->vm_end)
- return -EFAULT;
+ return VM_FAULT_SIGBUS;
track_pfn_insert(vma, &pgprot, pfn);
if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
- return -EACCES;
+ return VM_FAULT_SIGBUS;
/*
* If we don't have pte special, then we have to use the pfn_valid()
@@ -1688,36 +1687,35 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
* result in pfn_t_has_page() == false.
*/
page = pfn_to_page(pfn_t_to_pfn(pfn));
- return insert_page(vma, addr, page, pgprot);
+ err = insert_page(vma, addr, page, pgprot);
+ } else {
+ return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
}
- return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
+
+ if (err == -ENOMEM)
+ return VM_FAULT_OOM;
+ if (err < 0 && err != -EBUSY)
+ return VM_FAULT_SIGBUS;
+
+ return VM_FAULT_NOPAGE;
}
-int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn)
+vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+ pfn_t pfn)
{
return __vm_insert_mixed(vma, addr, pfn, false);
-
}
-EXPORT_SYMBOL(vm_insert_mixed);
+EXPORT_SYMBOL(vmf_insert_mixed);
/*
* If the insertion of PTE failed because someone else already added a
* different entry in the mean time, we treat that as success as we assume
* the same entry was actually inserted.
*/
-
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
unsigned long addr, pfn_t pfn)
{
- int err;
-
- err = __vm_insert_mixed(vma, addr, pfn, true);
- if (err == -ENOMEM)
- return VM_FAULT_OOM;
- if (err < 0 && err != -EBUSY)
- return VM_FAULT_SIGBUS;
- return VM_FAULT_NOPAGE;
+ return __vm_insert_mixed(vma, addr, pfn, true);
}
EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
@@ -2695,7 +2693,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
struct swap_info_struct *si = swp_swap_info(entry);
if (si->flags & SWP_SYNCHRONOUS_IO &&
- __swap_count(si, entry) == 1) {
+ __swap_count(entry) == 1) {
/* skip swapcache */
page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
vmf->address);
@@ -2805,7 +2803,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
flush_icache_page(vma, page);
if (pte_swp_soft_dirty(vmf->orig_pte))
pte = pte_mksoft_dirty(pte);
- set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
vmf->orig_pte = pte;
@@ -2819,6 +2816,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
mem_cgroup_commit_charge(page, memcg, true, false);
activate_page(page);
}
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
swap_free(entry);
if (mem_cgroup_swap_full(page) ||
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index da858f794eb6..2e76a8f65e94 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -797,16 +797,19 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
}
}
-static int lookup_node(unsigned long addr)
+static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
struct page *p;
int err;
- err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL);
+ int locked = 1;
+ err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
if (err >= 0) {
err = page_to_nid(p);
put_page(p);
}
+ if (locked)
+ up_read(&mm->mmap_sem);
return err;
}
@@ -817,7 +820,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
int err;
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
- struct mempolicy *pol = current->mempolicy;
+ struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
if (flags &
~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
@@ -857,7 +860,16 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
if (flags & MPOL_F_NODE) {
if (flags & MPOL_F_ADDR) {
- err = lookup_node(addr);
+ /*
+ * Take a refcount on the mpol, lookup_node()
+ * wil drop the mmap_sem, so after calling
+ * lookup_node() only "pol" remains valid, "vma"
+ * is stale.
+ */
+ pol_refcount = pol;
+ vma = NULL;
+ mpol_get(pol);
+ err = lookup_node(mm, addr);
if (err < 0)
goto out;
*policy = err;
@@ -892,7 +904,9 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
out:
mpol_cond_put(pol);
if (vma)
- up_read(&current->mm->mmap_sem);
+ up_read(&mm->mmap_sem);
+ if (pol_refcount)
+ mpol_put(pol_refcount);
return err;
}
diff --git a/mm/mincore.c b/mm/mincore.c
index fc37afe226e6..a66f2052c7b1 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -68,8 +68,16 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
*/
if (radix_tree_exceptional_entry(page)) {
swp_entry_t swp = radix_to_swp_entry(page);
- page = find_get_page(swap_address_space(swp),
- swp_offset(swp));
+ struct swap_info_struct *si;
+
+ /* Prevent swap device to being swapoff under us */
+ si = get_swap_device(swp);
+ if (si) {
+ page = find_get_page(swap_address_space(swp),
+ swp_offset(swp));
+ put_swap_device(si);
+ } else
+ page = NULL;
}
} else
page = find_get_page(mapping, pgoff);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 82bb1a939c0e..5119ff846769 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -247,37 +247,6 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
}
EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
-/*
- * Must be called while holding mm->mmap_sem for either read or write.
- * The result is guaranteed to be valid until mm->mmap_sem is dropped.
- */
-bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
-{
- struct mmu_notifier *mn;
- int id;
- bool ret = false;
-
- WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
-
- if (!mm_has_notifiers(mm))
- return ret;
-
- id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
- if (!mn->ops->invalidate_range &&
- !mn->ops->invalidate_range_start &&
- !mn->ops->invalidate_range_end)
- continue;
-
- if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
- ret = true;
- break;
- }
- }
- srcu_read_unlock(&srcu, id);
- return ret;
-}
-
static int do_mmu_notifier_register(struct mmu_notifier *mn,
struct mm_struct *mm,
int take_mmap_sem)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 89d2a2ab3fe6..61e664d8947e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -306,24 +306,33 @@ static inline bool __meminit early_page_uninitialised(unsigned long pfn)
}
/*
- * Returns false when the remaining initialisation should be deferred until
+ * Returns true when the remaining initialisation should be deferred until
* later in the boot cycle when it can be parallelised.
*/
-static inline bool update_defer_init(pg_data_t *pgdat,
- unsigned long pfn, unsigned long zone_end,
- unsigned long *nr_initialised)
+static bool __meminit
+defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
+ static unsigned long prev_end_pfn, nr_initialised;
+
+ /*
+ * prev_end_pfn static that contains the end of previous zone
+ * No need to protect because called very early in boot before smp_init.
+ */
+ if (prev_end_pfn != end_pfn) {
+ prev_end_pfn = end_pfn;
+ nr_initialised = 0;
+ }
+
/* Always populate low zones for address-constrained allocations */
- if (zone_end < pgdat_end_pfn(pgdat))
- return true;
- (*nr_initialised)++;
- if ((*nr_initialised > pgdat->static_init_pgcnt) &&
- (pfn & (PAGES_PER_SECTION - 1)) == 0) {
- pgdat->first_deferred_pfn = pfn;
+ if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
return false;
+ nr_initialised++;
+ if ((nr_initialised > NODE_DATA(nid)->static_init_pgcnt) &&
+ (pfn & (PAGES_PER_SECTION - 1)) == 0) {
+ NODE_DATA(nid)->first_deferred_pfn = pfn;
+ return true;
}
-
- return true;
+ return false;
}
#else
static inline bool early_page_uninitialised(unsigned long pfn)
@@ -331,11 +340,9 @@ static inline bool early_page_uninitialised(unsigned long pfn)
return false;
}
-static inline bool update_defer_init(pg_data_t *pgdat,
- unsigned long pfn, unsigned long zone_end,
- unsigned long *nr_initialised)
+static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
- return true;
+ return false;
}
#endif
@@ -3922,6 +3929,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
{
struct zone *zone;
struct zoneref *z;
+ bool ret = false;
/*
* Costly allocations might have made a progress but this doesn't mean
@@ -3985,25 +3993,24 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
}
}
- /*
- * Memory allocation/reclaim might be called from a WQ
- * context and the current implementation of the WQ
- * concurrency control doesn't recognize that
- * a particular WQ is congested if the worker thread is
- * looping without ever sleeping. Therefore we have to
- * do a short sleep here rather than calling
- * cond_resched().
- */
- if (current->flags & PF_WQ_WORKER)
- schedule_timeout_uninterruptible(1);
- else
- cond_resched();
-
- return true;
+ ret = true;
+ goto out;
}
}
- return false;
+out:
+ /*
+ * Memory allocation/reclaim might be called from a WQ context and the
+ * current implementation of the WQ concurrency control doesn't
+ * recognize that a particular WQ is congested if the worker thread is
+ * looping without ever sleeping. Therefore we have to do a short sleep
+ * here rather than calling cond_resched().
+ */
+ if (current->flags & PF_WQ_WORKER)
+ schedule_timeout_uninterruptible(1);
+ else
+ cond_resched();
+ return ret;
}
static inline bool
@@ -5449,6 +5456,30 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
#endif
}
+/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
+static bool __meminit
+overlap_memmap_init(unsigned long zone, unsigned long *pfn)
+{
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+ static struct memblock_region *r;
+
+ if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
+ if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
+ for_each_memblock(memory, r) {
+ if (*pfn < memblock_region_memory_end_pfn(r))
+ break;
+ }
+ }
+ if (*pfn >= memblock_region_memory_base_pfn(r) &&
+ memblock_is_mirror(r)) {
+ *pfn = memblock_region_memory_end_pfn(r);
+ return true;
+ }
+ }
+#endif
+ return false;
+}
+
/*
* Initially all pages are reserved - free ones are freed
* up by free_all_bootmem() once the early boot process is
@@ -5458,14 +5489,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn, enum memmap_context context,
struct vmem_altmap *altmap)
{
- unsigned long end_pfn = start_pfn + size;
- pg_data_t *pgdat = NODE_DATA(nid);
- unsigned long pfn;
- unsigned long nr_initialised = 0;
+ unsigned long pfn, end_pfn = start_pfn + size;
struct page *page;
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
- struct memblock_region *r = NULL, *tmp;
-#endif
if (highest_memmap_pfn < end_pfn - 1)
highest_memmap_pfn = end_pfn - 1;
@@ -5482,39 +5507,19 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
* There can be holes in boot-time mem_map[]s handed to this
* function. They do not exist on hotplugged memory.
*/
- if (context != MEMMAP_EARLY)
- goto not_early;
-
- if (!early_pfn_valid(pfn))
- continue;
- if (!early_pfn_in_nid(pfn, nid))
- continue;
- if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
- break;
-
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
- /*
- * Check given memblock attribute by firmware which can affect
- * kernel memory layout. If zone==ZONE_MOVABLE but memory is
- * mirrored, it's an overlapped memmap init. skip it.
- */
- if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
- if (!r || pfn >= memblock_region_memory_end_pfn(r)) {
- for_each_memblock(memory, tmp)
- if (pfn < memblock_region_memory_end_pfn(tmp))
- break;
- r = tmp;
- }
- if (pfn >= memblock_region_memory_base_pfn(r) &&
- memblock_is_mirror(r)) {
- /* already initialized as NORMAL */
- pfn = memblock_region_memory_end_pfn(r);
+ if (context == MEMMAP_EARLY) {
+ if (!early_pfn_valid(pfn)) {
+ pfn = next_valid_pfn(pfn) - 1;
continue;
}
+ if (!early_pfn_in_nid(pfn, nid))
+ continue;
+ if (overlap_memmap_init(zone, &pfn))
+ continue;
+ if (defer_init(nid, pfn, end_pfn))
+ break;
}
-#endif
-not_early:
page = pfn_to_page(pfn);
__init_single_page(page, pfn, zone, nid);
if (context == MEMMAP_HOTPLUG)
@@ -5531,9 +5536,6 @@ not_early:
* can be created for invalid pages (for alignment)
* check here not to call set_pageblock_migratetype() against
* pfn out of zone.
- *
- * Please note that MEMMAP_HOTPLUG path doesn't clear memmap
- * because this is done early in sparse_add_one_section
*/
if (!(pfn & (pageblock_nr_pages - 1))) {
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
@@ -5551,10 +5553,11 @@ static void __meminit zone_init_free_lists(struct zone *zone)
}
}
-#ifndef __HAVE_ARCH_MEMMAP_INIT
-#define memmap_init(size, nid, zone, start_pfn) \
- memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY, NULL)
-#endif
+void __meminit __weak memmap_init(unsigned long size, int nid,
+ unsigned long zone, unsigned long start_pfn)
+{
+ memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY, NULL);
+}
static int zone_batchsize(struct zone *zone)
{
@@ -6815,15 +6818,12 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
{
enum zone_type zone_type;
- if (N_MEMORY == N_NORMAL_MEMORY)
- return;
-
for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
struct zone *zone = &pgdat->node_zones[zone_type];
if (populated_zone(zone)) {
- node_set_state(nid, N_HIGH_MEMORY);
- if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
- zone_type <= ZONE_NORMAL)
+ if (IS_ENABLED(CONFIG_HIGHMEM))
+ node_set_state(nid, N_HIGH_MEMORY);
+ if (zone_type <= ZONE_NORMAL)
node_set_state(nid, N_NORMAL_MEMORY);
break;
}
diff --git a/mm/page_owner.c b/mm/page_owner.c
index d80adfe702d3..c2494f034d02 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -274,7 +274,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
*/
for (; pfn < end_pfn; ) {
if (!pfn_valid(pfn)) {
- pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
+ pfn = ALIGN(pfn + 1, pageblock_nr_pages);
continue;
}
@@ -541,7 +541,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
unsigned long block_end_pfn;
if (!pfn_valid(pfn)) {
- pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
+ pfn = ALIGN(pfn + 1, pageblock_nr_pages);
continue;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index eb477809a5c0..1e79fac3186b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1362,11 +1362,21 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
}
/*
- * We have to assume the worse case ie pmd for invalidation. Note that
- * the page can not be free in this function as call of try_to_unmap()
- * must hold a reference on the page.
+ * For THP, we have to assume the worse case ie pmd for invalidation.
+ * For hugetlb, it could be much worse if we need to do pud
+ * invalidation in the case of pmd sharing.
+ *
+ * Note that the page can not be free in this function as call of
+ * try_to_unmap() must hold a reference on the page.
*/
end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page)));
+ if (PageHuge(page)) {
+ /*
+ * If sharing is possible, start and end will be adjusted
+ * accordingly.
+ */
+ adjust_range_if_pmd_sharing_possible(vma, &start, &end);
+ }
mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
while (page_vma_mapped_walk(&pvmw)) {
@@ -1409,6 +1419,32 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
address = pvmw.address;
+ if (PageHuge(page)) {
+ if (huge_pmd_unshare(mm, &address, pvmw.pte)) {
+ /*
+ * huge_pmd_unshare unmapped an entire PMD
+ * page. There is no way of knowing exactly
+ * which PMDs may be cached for this mm, so
+ * we must flush them all. start/end were
+ * already adjusted above to cover this range.
+ */
+ flush_cache_range(vma, start, end);
+ flush_tlb_range(vma, start, end);
+ mmu_notifier_invalidate_range(mm, start, end);
+
+ /*
+ * The ref count of the PMD page was dropped
+ * which is part of the way map counting
+ * is done for shared PMDs. Return 'true'
+ * here. When there is no other sharing,
+ * huge_pmd_unshare returns false and we will
+ * unmap the actual page and drop map count
+ * to zero.
+ */
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+ }
if (IS_ENABLED(CONFIG_MIGRATION) &&
(flags & TTU_MIGRATION) &&
diff --git a/mm/slub.c b/mm/slub.c
index 8da34a8af53d..37e82a0538aa 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3621,9 +3621,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
#ifdef CONFIG_SLUB_DEBUG
void *addr = page_address(page);
void *p;
- unsigned long *map = kcalloc(BITS_TO_LONGS(page->objects),
- sizeof(long),
- GFP_ATOMIC);
+ unsigned long *map = bitmap_zalloc(page->objects, GFP_ATOMIC);
if (!map)
return;
slab_err(s, page, text, s->name);
@@ -3638,7 +3636,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
}
}
slab_unlock(page);
- kfree(map);
+ bitmap_free(map);
#endif
}
@@ -4411,10 +4409,8 @@ static long validate_slab_cache(struct kmem_cache *s)
{
int node;
unsigned long count = 0;
- unsigned long *map = kmalloc_array(BITS_TO_LONGS(oo_objects(s->max)),
- sizeof(unsigned long),
- GFP_KERNEL);
struct kmem_cache_node *n;
+ unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
if (!map)
return -ENOMEM;
@@ -4422,7 +4418,7 @@ static long validate_slab_cache(struct kmem_cache *s)
flush_all(s);
for_each_kmem_cache_node(s, node, n)
count += validate_slab_node(s, n, map);
- kfree(map);
+ bitmap_free(map);
return count;
}
/*
@@ -4573,14 +4569,12 @@ static int list_locations(struct kmem_cache *s, char *buf,
unsigned long i;
struct loc_track t = { 0, 0, NULL };
int node;
- unsigned long *map = kmalloc_array(BITS_TO_LONGS(oo_objects(s->max)),
- sizeof(unsigned long),
- GFP_KERNEL);
struct kmem_cache_node *n;
+ unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
GFP_KERNEL)) {
- kfree(map);
+ bitmap_free(map);
return sprintf(buf, "Out of memory\n");
}
/* Push back cpu slabs */
@@ -4646,7 +4640,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
}
free_loc_track(&t);
- kfree(map);
+ bitmap_free(map);
if (!t.count)
len += sprintf(buf, "No data\n");
return len;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index ecee9c6c4cc1..dc312559f7df 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -334,8 +334,13 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
unsigned long addr)
{
struct page *page;
+ struct swap_info_struct *si;
+ si = get_swap_device(entry);
+ if (!si)
+ return NULL;
page = find_get_page(swap_address_space(entry), swp_offset(entry));
+ put_swap_device(si);
INC_CACHE_INFO(find_total);
if (page) {
@@ -378,8 +383,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr,
bool *new_page_allocated)
{
- struct page *found_page, *new_page = NULL;
- struct address_space *swapper_space = swap_address_space(entry);
+ struct page *found_page = NULL, *new_page = NULL;
+ struct swap_info_struct *si;
int err;
*new_page_allocated = false;
@@ -389,7 +394,12 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* called after lookup_swap_cache() failed, re-calling
* that would confuse statistics.
*/
- found_page = find_get_page(swapper_space, swp_offset(entry));
+ si = get_swap_device(entry);
+ if (!si)
+ break;
+ found_page = find_get_page(swap_address_space(entry),
+ swp_offset(entry));
+ put_swap_device(si);
if (found_page)
break;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d954b71c4f9c..97a1bd1a7c9a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -38,6 +38,7 @@
#include <linux/export.h>
#include <linux/swap_slots.h>
#include <linux/sort.h>
+#include <linux/stop_machine.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
@@ -103,26 +104,39 @@ static inline unsigned char swap_count(unsigned char ent)
return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */
}
+/* Reclaim the swap entry anyway if possible */
+#define TTRS_ANYWAY 0x1
+/*
+ * Reclaim the swap entry if there are no more mappings of the
+ * corresponding page
+ */
+#define TTRS_UNMAPPED 0x2
+/* Reclaim the swap entry if swap is getting full*/
+#define TTRS_FULL 0x4
+
/* returns 1 if swap entry is freed */
-static int
-__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
+static int __try_to_reclaim_swap(struct swap_info_struct *si,
+ unsigned long offset, unsigned long flags)
{
swp_entry_t entry = swp_entry(si->type, offset);
struct page *page;
int ret = 0;
- page = find_get_page(swap_address_space(entry), swp_offset(entry));
+ page = find_get_page(swap_address_space(entry), offset);
if (!page)
return 0;
/*
- * This function is called from scan_swap_map() and it's called
- * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here.
- * We have to use trylock for avoiding deadlock. This is a special
+ * When this function is called from scan_swap_map_slots() and it's
+ * called by vmscan.c at reclaiming pages. So, we hold a lock on a page,
+ * here. We have to use trylock for avoiding deadlock. This is a special
* case and you should use try_to_free_swap() with explicit lock_page()
* in usual operations.
*/
if (trylock_page(page)) {
- ret = try_to_free_swap(page);
+ if ((flags & TTRS_ANYWAY) ||
+ ((flags & TTRS_UNMAPPED) && !page_mapped(page)) ||
+ ((flags & TTRS_FULL) && mem_cgroup_swap_full(page)))
+ ret = try_to_free_swap(page);
unlock_page(page);
}
put_page(page);
@@ -780,7 +794,7 @@ checks:
int swap_was_freed;
unlock_cluster(ci);
spin_unlock(&si->lock);
- swap_was_freed = __try_to_reclaim_swap(si, offset);
+ swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
spin_lock(&si->lock);
/* entry was freed successfully, try to use this again */
if (swap_was_freed)
@@ -919,6 +933,7 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
struct swap_cluster_info *ci;
ci = lock_cluster(si, offset);
+ memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
cluster_set_count_flag(ci, 0, 0);
free_cluster(si, idx);
unlock_cluster(ci);
@@ -1160,6 +1175,64 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
return usage;
}
+/*
+ * Check whether swap entry is valid in the swap device. If so,
+ * return pointer to swap_info_struct, and keep the swap entry valid
+ * via preventing the swap device from being swapoff, until
+ * put_swap_device() is called. Otherwise return NULL.
+ *
+ * Notice that swapoff or swapoff+swapon can still happen before the
+ * preempt_disable() in get_swap_device() or after the
+ * preempt_enable() in put_swap_device() if there isn't any other way
+ * to prevent swapoff, such as page lock, page table lock, etc. The
+ * caller must be prepared for that. For example, the following
+ * situation is possible.
+ *
+ * CPU1 CPU2
+ * do_swap_page()
+ * ... swapoff+swapon
+ * __read_swap_cache_async()
+ * swapcache_prepare()
+ * __swap_duplicate()
+ * // check swap_map
+ * // verify PTE not changed
+ *
+ * In __swap_duplicate(), the swap_map need to be checked before
+ * changing partly because the specified swap entry may be for another
+ * swap device which has been swapoff. And in do_swap_page(), after
+ * the page is read from the swap device, the PTE is verified not
+ * changed with the page table locked to check whether the swap device
+ * has been swapoff or swapoff+swapon.
+ */
+struct swap_info_struct *get_swap_device(swp_entry_t entry)
+{
+ struct swap_info_struct *si;
+ unsigned long type, offset;
+
+ if (!entry.val)
+ goto out;
+ type = swp_type(entry);
+ if (type >= nr_swapfiles)
+ goto bad_nofile;
+ si = swap_info[type];
+
+ preempt_disable();
+ if (!(si->flags & SWP_VALID))
+ goto unlock_out;
+ offset = swp_offset(entry);
+ if (offset >= si->max)
+ goto unlock_out;
+
+ return si;
+bad_nofile:
+ pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
+out:
+ return NULL;
+unlock_out:
+ preempt_enable();
+ return NULL;
+}
+
static unsigned char __swap_entry_free(struct swap_info_struct *p,
swp_entry_t entry, unsigned char usage)
{
@@ -1169,6 +1242,8 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p,
ci = lock_cluster_or_swap_info(p, offset);
usage = __swap_entry_free_locked(p, offset, usage);
unlock_cluster_or_swap_info(p, ci);
+ if (!usage)
+ free_swap_slot(entry);
return usage;
}
@@ -1199,10 +1274,8 @@ void swap_free(swp_entry_t entry)
struct swap_info_struct *p;
p = _swap_info_get(entry);
- if (p) {
- if (!__swap_entry_free(p, entry, 1))
- free_swap_slot(entry);
- }
+ if (p)
+ __swap_entry_free(p, entry, 1);
}
/*
@@ -1237,9 +1310,6 @@ void put_swap_page(struct page *page, swp_entry_t entry)
if (free_entries == SWAPFILE_CLUSTER) {
unlock_cluster_or_swap_info(si, ci);
spin_lock(&si->lock);
- ci = lock_cluster(si, offset);
- memset(map, 0, SWAPFILE_CLUSTER);
- unlock_cluster(ci);
mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
swap_free_cluster(si, idx);
spin_unlock(&si->lock);
@@ -1334,11 +1404,18 @@ int page_swapcount(struct page *page)
return count;
}
-int __swap_count(struct swap_info_struct *si, swp_entry_t entry)
+int __swap_count(swp_entry_t entry)
{
+ struct swap_info_struct *si;
pgoff_t offset = swp_offset(entry);
+ int count = 0;
- return swap_count(si->swap_map[offset]);
+ si = get_swap_device(entry);
+ if (si) {
+ count = swap_count(si->swap_map[offset]);
+ put_swap_device(si);
+ }
+ return count;
}
static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
@@ -1363,9 +1440,11 @@ int __swp_swapcount(swp_entry_t entry)
int count = 0;
struct swap_info_struct *si;
- si = __swap_info_get(entry);
- if (si)
+ si = get_swap_device(entry);
+ if (si) {
count = swap_swapcount(si, entry);
+ put_swap_device(si);
+ }
return count;
}
@@ -1612,7 +1691,6 @@ int try_to_free_swap(struct page *page)
int free_swap_and_cache(swp_entry_t entry)
{
struct swap_info_struct *p;
- struct page *page = NULL;
unsigned char count;
if (non_swap_entry(entry))
@@ -1622,30 +1700,9 @@ int free_swap_and_cache(swp_entry_t entry)
if (p) {
count = __swap_entry_free(p, entry, 1);
if (count == SWAP_HAS_CACHE &&
- !swap_page_trans_huge_swapped(p, entry)) {
- page = find_get_page(swap_address_space(entry),
- swp_offset(entry));
- if (page && !trylock_page(page)) {
- put_page(page);
- page = NULL;
- }
- } else if (!count)
- free_swap_slot(entry);
- }
- if (page) {
- /*
- * Not mapped elsewhere, or swap space full? Free it!
- * Also recheck PageSwapCache now page is locked (above).
- */
- if (PageSwapCache(page) && !PageWriteback(page) &&
- (!page_mapped(page) || mem_cgroup_swap_full(page)) &&
- !swap_page_trans_huge_swapped(p, entry)) {
- page = compound_head(page);
- delete_from_swap_cache(page);
- SetPageDirty(page);
- }
- unlock_page(page);
- put_page(page);
+ !swap_page_trans_huge_swapped(p, entry))
+ __try_to_reclaim_swap(p, swp_offset(entry),
+ TTRS_UNMAPPED | TTRS_FULL);
}
return p != NULL;
}
@@ -1783,8 +1840,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
get_page(page);
- set_pte_at(vma->vm_mm, addr, pte,
- pte_mkold(mk_pte(page, vma->vm_page_prot)));
if (page == swapcache) {
page_add_anon_rmap(page, vma, addr, false);
mem_cgroup_commit_charge(page, memcg, true, false);
@@ -1793,6 +1848,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
}
+ set_pte_at(vma->vm_mm, addr, pte,
+ pte_mkold(mk_pte(page, vma->vm_page_prot)));
swap_free(entry);
/*
* Move the page to the active list so it is not
@@ -2434,9 +2491,9 @@ static int swap_node(struct swap_info_struct *p)
return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
}
-static void _enable_swap_info(struct swap_info_struct *p, int prio,
- unsigned char *swap_map,
- struct swap_cluster_info *cluster_info)
+static void setup_swap_info(struct swap_info_struct *p, int prio,
+ unsigned char *swap_map,
+ struct swap_cluster_info *cluster_info)
{
int i;
@@ -2461,7 +2518,11 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
}
p->swap_map = swap_map;
p->cluster_info = cluster_info;
- p->flags |= SWP_WRITEOK;
+}
+
+static void _enable_swap_info(struct swap_info_struct *p)
+{
+ p->flags |= SWP_WRITEOK | SWP_VALID;
atomic_long_add(p->pages, &nr_swap_pages);
total_swap_pages += p->pages;
@@ -2480,6 +2541,11 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
add_to_avail_list(p);
}
+static int swap_onoff_stop(void *arg)
+{
+ return 0;
+}
+
static void enable_swap_info(struct swap_info_struct *p, int prio,
unsigned char *swap_map,
struct swap_cluster_info *cluster_info,
@@ -2488,7 +2554,17 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
frontswap_init(p->type, frontswap_map);
spin_lock(&swap_lock);
spin_lock(&p->lock);
- _enable_swap_info(p, prio, swap_map, cluster_info);
+ setup_swap_info(p, prio, swap_map, cluster_info);
+ spin_unlock(&p->lock);
+ spin_unlock(&swap_lock);
+ /*
+ * Guarantee swap_map, cluster_info, etc. fields are used
+ * between get/put_swap_device() only if SWP_VALID bit is set
+ */
+ stop_machine(swap_onoff_stop, NULL, cpu_online_mask);
+ spin_lock(&swap_lock);
+ spin_lock(&p->lock);
+ _enable_swap_info(p);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
}
@@ -2497,7 +2573,8 @@ static void reinsert_swap_info(struct swap_info_struct *p)
{
spin_lock(&swap_lock);
spin_lock(&p->lock);
- _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
+ setup_swap_info(p, p->prio, p->swap_map, p->cluster_info);
+ _enable_swap_info(p);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
}
@@ -2600,6 +2677,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
reenable_swap_slots_cache_unlock();
+ spin_lock(&swap_lock);
+ spin_lock(&p->lock);
+ p->flags &= ~SWP_VALID; /* mark swap device as invalid */
+ spin_unlock(&p->lock);
+ spin_unlock(&swap_lock);
+ /*
+ * wait for swap operations protected by get/put_swap_device()
+ * to complete
+ */
+ stop_machine(swap_onoff_stop, NULL, cpu_online_mask);
+
flush_work(&p->discard_work);
destroy_swap_extents(p);
@@ -3363,22 +3451,16 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
{
struct swap_info_struct *p;
struct swap_cluster_info *ci;
- unsigned long offset, type;
+ unsigned long offset;
unsigned char count;
unsigned char has_cache;
int err = -EINVAL;
- if (non_swap_entry(entry))
+ p = get_swap_device(entry);
+ if (!p)
goto out;
- type = swp_type(entry);
- if (type >= nr_swapfiles)
- goto bad_file;
- p = swap_info[type];
offset = swp_offset(entry);
- if (unlikely(offset >= p->max))
- goto out;
-
ci = lock_cluster_or_swap_info(p, offset);
count = p->swap_map[offset];
@@ -3424,11 +3506,9 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
unlock_out:
unlock_cluster_or_swap_info(p, ci);
out:
+ if (p)
+ put_swap_device(p);
return err;
-
-bad_file:
- pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
- goto out;
}
/*
@@ -3520,6 +3600,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
struct page *list_page;
pgoff_t offset;
unsigned char count;
+ int ret = 0;
/*
* When debugging, it's easier to use __GFP_ZERO here; but it's better
@@ -3527,15 +3608,15 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
*/
page = alloc_page(gfp_mask | __GFP_HIGHMEM);
- si = swap_info_get(entry);
+ si = get_swap_device(entry);
if (!si) {
/*
* An acceptable race has occurred since the failing
- * __swap_duplicate(): the swap entry has been freed,
- * perhaps even the whole swap_map cleared for swapoff.
+ * __swap_duplicate(): the swap device may be swapoff
*/
goto outer;
}
+ spin_lock(&si->lock);
offset = swp_offset(entry);
@@ -3553,9 +3634,8 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
}
if (!page) {
- unlock_cluster(ci);
- spin_unlock(&si->lock);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
/*
@@ -3607,10 +3687,11 @@ out_unlock_cont:
out:
unlock_cluster(ci);
spin_unlock(&si->lock);
+ put_swap_device(si);
outer:
if (page)
__free_page(page);
- return 0;
+ return ret;
}
/*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7e7d25504651..b1ebd5cf4953 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2446,9 +2446,11 @@ out:
/*
* Scan types proportional to swappiness and
* their relative recent reclaim efficiency.
+ * Make sure we don't miss the last page
+ * because of a round-off error.
*/
- scan = div64_u64(scan * fraction[file],
- denominator);
+ scan = DIV64_U64_ROUND_UP(scan * fraction[file],
+ denominator);
break;
case SCAN_FILE:
case SCAN_ANON:
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 4b366d181f35..201a8ac6342e 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -746,6 +746,9 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
}
if (bud == HEADLESS) {
+ if (test_bit(UNDER_RECLAIM, &page->private))
+ return;
+
spin_lock(&pool->lock);
list_del(&page->lru);
spin_unlock(&pool->lock);
@@ -836,20 +839,20 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
}
list_for_each_prev(pos, &pool->lru) {
page = list_entry(pos, struct page, lru);
+ zhdr = page_address(page);
if (test_bit(PAGE_HEADLESS, &page->private))
/* candidate found */
break;
- zhdr = page_address(page);
if (!z3fold_page_trylock(zhdr))
continue; /* can't evict at this point */
kref_get(&zhdr->refcount);
list_del_init(&zhdr->buddy);
zhdr->cpu = -1;
- set_bit(UNDER_RECLAIM, &page->private);
break;
}
+ set_bit(UNDER_RECLAIM, &page->private);
list_del_init(&page->lru);
spin_unlock(&pool->lock);
@@ -898,6 +901,7 @@ next:
if (test_bit(PAGE_HEADLESS, &page->private)) {
if (ret == 0) {
free_z3fold_page(page);
+ atomic64_dec(&pool->pages_nr);
return 0;
}
spin_lock(&pool->lock);
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 161b0224d6ae..c883ec55654f 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -4934,17 +4934,6 @@ sub process {
while ($line =~ m{($Constant|$Lval)}g) {
my $var = $1;
-#gcc binary extension
- if ($var =~ /^$Binary$/) {
- if (WARN("GCC_BINARY_CONSTANT",
- "Avoid gcc v4.3+ binary constant extension: <$var>\n" . $herecurr) &&
- $fix) {
- my $hexval = sprintf("0x%x", oct($var));
- $fixed[$fixlinenr] =~
- s/\b$var\b/$hexval/;
- }
- }
-
#CamelCase
if ($var !~ /^$Constant$/ &&
$var =~ /[A-Z][a-z]|[a-z][A-Z]/ &&
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f986e31fa68c..636462edc7ca 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -499,7 +499,6 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
}
static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
- .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
.invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
.invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
.clear_flush_young = kvm_mmu_notifier_clear_flush_young,