aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2015-07-24 14:41:49 +1000
committerStephen Rothwell <sfr@canb.auug.org.au>2015-07-24 14:41:49 +1000
commit0ddc1ce53473a25e2163c6cb04a9291582444dce (patch)
tree76a89fe0c83d8bb9962dbc9b2660217e0fda7714
parent3132e645860a96bad81b63412440cd51345d488a (diff)
parent7949ba1dc592e4acf1623a874c639bc4f9f80828 (diff)
Merge branch 'akpm-current/current'
-rw-r--r--Documentation/blockdev/zram.txt3
-rw-r--r--Documentation/devicetree/bindings/w1/omap-hdq.txt7
-rw-r--r--Documentation/features/vm/TLB/arch-support.txt40
-rw-r--r--Documentation/filesystems/dax.txt7
-rw-r--r--Documentation/filesystems/ocfs2-online-filecheck.txt95
-rw-r--r--Documentation/filesystems/proc.txt18
-rw-r--r--Documentation/filesystems/vfat.txt10
-rw-r--r--Documentation/ioctl/ioctl-number.txt1
-rw-r--r--Documentation/printk-formats.txt8
-rw-r--r--Documentation/sysrq.txt3
-rw-r--r--Documentation/vm/pagemap.txt37
-rw-r--r--Documentation/vm/userfaultfd.txt144
-rw-r--r--Documentation/w1/masters/omap-hdq6
-rw-r--r--arch/Kconfig3
-rw-r--r--arch/alpha/include/asm/unistd.h2
-rw-r--r--arch/alpha/include/uapi/asm/mman.h6
-rw-r--r--arch/alpha/include/uapi/asm/unistd.h3
-rw-r--r--arch/alpha/kernel/systbls.S3
-rw-r--r--arch/arm/Kconfig1
-rw-r--r--arch/arm/include/asm/pgtable-3level.h1
-rw-r--r--arch/arm/include/asm/unistd.h2
-rw-r--r--arch/arm/include/uapi/asm/unistd.h3
-rw-r--r--arch/arm/kernel/calls.S3
-rw-r--r--arch/arm/mach-at91/pm.c2
-rw-r--r--arch/arm/mach-imx/pm-imx5.c2
-rw-r--r--arch/arm/mach-imx/pm-imx6.c2
-rw-r--r--arch/arm/mach-socfpga/pm.c2
-rw-r--r--arch/arm64/include/asm/pgtable.h2
-rw-r--r--arch/arm64/include/asm/unistd.h2
-rw-r--r--arch/arm64/include/asm/unistd32.h6
-rw-r--r--arch/arm64/kernel/signal32.c5
-rw-r--r--arch/avr32/include/uapi/asm/unistd.h3
-rw-r--r--arch/avr32/kernel/syscall_table.S3
-rw-r--r--arch/blackfin/include/uapi/asm/unistd.h3
-rw-r--r--arch/blackfin/mach-common/entry.S3
-rw-r--r--arch/cris/arch-v10/kernel/entry.S3
-rw-r--r--arch/cris/arch-v32/kernel/entry.S3
-rw-r--r--arch/frv/kernel/entry.S3
-rw-r--r--arch/ia64/Kconfig1
-rw-r--r--arch/ia64/include/asm/unistd.h2
-rw-r--r--arch/ia64/include/uapi/asm/unistd.h3
-rw-r--r--arch/ia64/kernel/entry.S3
-rw-r--r--arch/m32r/kernel/entry.S3
-rw-r--r--arch/m32r/kernel/syscall_table.S3
-rw-r--r--arch/m68k/Kconfig1
-rw-r--r--arch/m68k/include/asm/unistd.h2
-rw-r--r--arch/m68k/include/uapi/asm/unistd.h3
-rw-r--r--arch/m68k/kernel/syscalltable.S3
-rw-r--r--arch/microblaze/include/uapi/asm/unistd.h3
-rw-r--r--arch/microblaze/kernel/syscall_table.S3
-rw-r--r--arch/mips/Kconfig1
-rw-r--r--arch/mips/include/uapi/asm/mman.h9
-rw-r--r--arch/mips/include/uapi/asm/unistd.h21
-rw-r--r--arch/mips/kernel/scall32-o32.S3
-rw-r--r--arch/mips/kernel/scall64-64.S3
-rw-r--r--arch/mips/kernel/scall64-n32.S3
-rw-r--r--arch/mips/kernel/scall64-o32.S3
-rw-r--r--arch/mips/kernel/signal32.c2
-rw-r--r--arch/mn10300/kernel/entry.S3
-rw-r--r--arch/parisc/include/uapi/asm/mman.h6
-rw-r--r--arch/parisc/include/uapi/asm/unistd.h5
-rw-r--r--arch/powerpc/Kconfig1
-rw-r--r--arch/powerpc/include/asm/pgtable-ppc64.h2
-rw-r--r--arch/powerpc/include/asm/systbl.h1
-rw-r--r--arch/powerpc/include/asm/unistd.h2
-rw-r--r--arch/powerpc/include/uapi/asm/mman.h5
-rw-r--r--arch/powerpc/include/uapi/asm/unistd.h4
-rw-r--r--arch/powerpc/kernel/signal_32.c2
-rw-r--r--arch/s390/Kconfig1
-rw-r--r--arch/s390/include/uapi/asm/unistd.h5
-rw-r--r--arch/s390/kernel/compat_wrapper.c3
-rw-r--r--arch/sh/Kconfig1
-rw-r--r--arch/sh/kernel/syscalls_32.S3
-rw-r--r--arch/sparc/include/asm/pgtable_64.h9
-rw-r--r--arch/sparc/include/uapi/asm/mman.h5
-rw-r--r--arch/sparc/include/uapi/asm/unistd.h5
-rw-r--r--arch/sparc/kernel/systbls_32.S2
-rw-r--r--arch/sparc/kernel/systbls_64.S4
-rw-r--r--arch/tile/Kconfig1
-rw-r--r--arch/tile/include/uapi/asm/mman.h9
-rw-r--r--arch/tile/kernel/compat_signal.c2
-rw-r--r--arch/x86/Kconfig4
-rw-r--r--arch/x86/boot/header.S2
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl4
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl5
-rw-r--r--arch/x86/include/asm/kdebug.h2
-rw-r--r--arch/x86/include/asm/pgtable.h5
-rw-r--r--arch/x86/include/asm/tlbflush.h6
-rw-r--r--arch/x86/kernel/Makefile4
-rw-r--r--arch/x86/kernel/kvmclock.c4
-rw-r--r--arch/x86/kernel/machine_kexec_64.c1
-rw-r--r--arch/x86/kernel/reboot.c4
-rw-r--r--arch/x86/kernel/setup.c2
-rw-r--r--arch/x86/kernel/vmlinux.lds.S2
-rw-r--r--arch/x86/kvm/vmx.c8
-rw-r--r--arch/x86/mm/numa.c6
-rw-r--r--arch/x86/mm/tlb.c1
-rw-r--r--arch/x86/platform/efi/efi.c4
-rw-r--r--arch/x86/platform/uv/uv_nmi.c6
-rw-r--r--arch/xtensa/include/uapi/asm/mman.h9
-rw-r--r--arch/xtensa/include/uapi/asm/unistd.h10
-rw-r--r--block/genhd.c2
-rw-r--r--drivers/block/zram/zram_drv.c15
-rw-r--r--drivers/block/zram/zram_drv.h1
-rw-r--r--drivers/crypto/qat/qat_common/adf_transport_debug.c16
-rw-r--r--drivers/firmware/efi/Kconfig2
-rw-r--r--drivers/gpu/drm/drm_vm.c8
-rw-r--r--drivers/media/platform/coda/coda-common.c2
-rw-r--r--drivers/misc/sram.c8
-rw-r--r--drivers/net/wireless/ath/wil6210/debugfs.c35
-rw-r--r--drivers/parisc/ccio-dma.c13
-rw-r--r--drivers/parisc/sba_iommu.c9
-rw-r--r--drivers/pci/pci-driver.c2
-rw-r--r--drivers/s390/crypto/zcrypt_api.c10
-rw-r--r--drivers/tty/sysrq.c11
-rw-r--r--drivers/w1/masters/omap_hdq.c224
-rw-r--r--fs/9p/v9fs.c8
-rw-r--r--fs/Makefile1
-rw-r--r--fs/affs/super.c8
-rw-r--r--fs/aio.c27
-rw-r--r--fs/binfmt_misc.c12
-rw-r--r--fs/block_dev.c1
-rw-r--r--fs/cachefiles/daemon.c84
-rw-r--r--fs/cifs/file.c8
-rw-r--r--fs/dax.c152
-rw-r--r--fs/dcache.c15
-rw-r--r--fs/exofs/super.c8
-rw-r--r--fs/ext2/file.c10
-rw-r--r--fs/ext2/inode.c1
-rw-r--r--fs/ext2/super.c6
-rw-r--r--fs/ext4/file.c11
-rw-r--r--fs/ext4/fsync.c5
-rw-r--r--fs/ext4/indirect.c1
-rw-r--r--fs/ext4/inode.c1
-rw-r--r--fs/ext4/super.c15
-rw-r--r--fs/fat/cache.c79
-rw-r--r--fs/fat/dir.c2
-rw-r--r--fs/fat/fat.h6
-rw-r--r--fs/fat/file.c61
-rw-r--r--fs/fat/inode.c75
-rw-r--r--fs/file_table.c24
-rw-r--r--fs/hfs/bnode.c9
-rw-r--r--fs/hfs/brec.c20
-rw-r--r--fs/hfsplus/bnode.c3
-rw-r--r--fs/hugetlbfs/inode.c283
-rw-r--r--fs/inode.c2
-rw-r--r--fs/libfs.c26
-rw-r--r--fs/mpage.c23
-rw-r--r--fs/namespace.c4
-rw-r--r--fs/notify/fdinfo.c3
-rw-r--r--fs/notify/fsnotify.c10
-rw-r--r--fs/notify/mark.c30
-rw-r--r--fs/ntfs/super.c21
-rw-r--r--fs/ocfs2/Makefile3
-rw-r--r--fs/ocfs2/acl.c26
-rw-r--r--fs/ocfs2/alloc.c245
-rw-r--r--fs/ocfs2/aops.c54
-rw-r--r--fs/ocfs2/buffer_head_io.c6
-rw-r--r--fs/ocfs2/cluster/heartbeat.c123
-rw-r--r--fs/ocfs2/cluster/nodemanager.c50
-rw-r--r--fs/ocfs2/dir.c70
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c22
-rw-r--r--fs/ocfs2/dlm/dlmthread.c10
-rw-r--r--fs/ocfs2/dlmglue.c10
-rw-r--r--fs/ocfs2/extent_map.c22
-rw-r--r--fs/ocfs2/file.c25
-rw-r--r--fs/ocfs2/filecheck.c569
-rw-r--r--fs/ocfs2/filecheck.h48
-rw-r--r--fs/ocfs2/inode.c245
-rw-r--r--fs/ocfs2/inode.h5
-rw-r--r--fs/ocfs2/journal.c32
-rw-r--r--fs/ocfs2/localalloc.c3
-rw-r--r--fs/ocfs2/move_extents.c8
-rw-r--r--fs/ocfs2/namei.c66
-rw-r--r--fs/ocfs2/ocfs2.h2
-rw-r--r--fs/ocfs2/ocfs2_trace.h2
-rw-r--r--fs/ocfs2/quota_local.c3
-rw-r--r--fs/ocfs2/refcounttree.c81
-rw-r--r--fs/ocfs2/stack_user.c50
-rw-r--r--fs/ocfs2/stackglue.c3
-rw-r--r--fs/ocfs2/stackglue.h2
-rw-r--r--fs/ocfs2/suballoc.c90
-rw-r--r--fs/ocfs2/super.c74
-rw-r--r--fs/ocfs2/super.h8
-rw-r--r--fs/ocfs2/xattr.c51
-rw-r--r--fs/proc/array.c5
-rw-r--r--fs/proc/base.c43
-rw-r--r--fs/proc/page.c286
-rw-r--r--fs/proc/task_mmu.c294
-rw-r--r--fs/seq_file.c42
-rw-r--r--fs/signalfd.c5
-rw-r--r--fs/super.c8
-rw-r--r--fs/userfaultfd.c1330
-rw-r--r--fs/xfs/xfs_buf.h1
-rw-r--r--fs/xfs/xfs_file.c30
-rw-r--r--fs/xfs/xfs_trace.h1
-rw-r--r--include/linux/crc64_ecma.h56
-rw-r--r--include/linux/cred.h8
-rw-r--r--include/linux/dax.h39
-rw-r--r--include/linux/fs.h20
-rw-r--r--include/linux/genalloc.h6
-rw-r--r--include/linux/gfp.h5
-rw-r--r--include/linux/huge_mm.h27
-rw-r--r--include/linux/hugetlb.h17
-rw-r--r--include/linux/kernel.h129
-rw-r--r--include/linux/kexec.h8
-rw-r--r--include/linux/kthread.h2
-rw-r--r--include/linux/memblock.h4
-rw-r--r--include/linux/memcontrol.h392
-rw-r--r--include/linux/mm.h176
-rw-r--r--include/linux/mm_types.h12
-rw-r--r--include/linux/mman.h3
-rw-r--r--include/linux/mmu_notifier.h44
-rw-r--r--include/linux/mmzone.h8
-rw-r--r--include/linux/oom.h38
-rw-r--r--include/linux/page-flags.h258
-rw-r--r--include/linux/page_ext.h4
-rw-r--r--include/linux/pagemap.h25
-rw-r--r--include/linux/parse-integer.h188
-rw-r--r--include/linux/poison.h4
-rw-r--r--include/linux/printk.h14
-rw-r--r--include/linux/rmap.h12
-rw-r--r--include/linux/sched.h23
-rw-r--r--include/linux/seq_file.h4
-rw-r--r--include/linux/slab.h10
-rw-r--r--include/linux/smpboot.h11
-rw-r--r--include/linux/string.h1
-rw-r--r--include/linux/swap.h22
-rw-r--r--include/linux/syscalls.h5
-rw-r--r--include/linux/userfaultfd_k.h85
-rw-r--r--include/linux/vm_event_item.h1
-rw-r--r--include/linux/wait.h5
-rw-r--r--include/linux/zbud.h2
-rw-r--r--include/linux/zpool.h4
-rw-r--r--include/linux/zsmalloc.h6
-rw-r--r--include/net/sock.h28
-rw-r--r--include/trace/events/tlb.h3
-rw-r--r--include/uapi/asm-generic/mman-common.h1
-rw-r--r--include/uapi/asm-generic/mman.h5
-rw-r--r--include/uapi/asm-generic/unistd.h8
-rw-r--r--include/uapi/linux/Kbuild1
-rw-r--r--include/uapi/linux/kernel-page-flags.h1
-rw-r--r--include/uapi/linux/prctl.h7
-rw-r--r--include/uapi/linux/securebits.h11
-rw-r--r--include/uapi/linux/userfaultfd.h169
-rw-r--r--init/Kconfig21
-rw-r--r--init/initramfs.c4
-rw-r--r--init/main.c2
-rw-r--r--ipc/mqueue.c5
-rw-r--r--ipc/msg.c5
-rw-r--r--ipc/msgutil.c2
-rw-r--r--ipc/shm.c6
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/events/core.c4
-rw-r--r--kernel/events/uprobes.c2
-rw-r--r--kernel/extable.c1
-rw-r--r--kernel/fork.c4
-rw-r--r--kernel/kexec.c2533
-rw-r--r--kernel/kexec_core.c1511
-rw-r--r--kernel/kexec_file.c1044
-rw-r--r--kernel/kexec_internal.h22
-rw-r--r--kernel/ksysfs.c6
-rw-r--r--kernel/kthread.c7
-rw-r--r--kernel/printk/printk.c2
-rw-r--r--kernel/reboot.c2
-rw-r--r--kernel/sched/wait.c7
-rw-r--r--kernel/signal.c13
-rw-r--r--kernel/smpboot.c27
-rw-r--r--kernel/sys_ni.c4
-rw-r--r--kernel/sysctl.c12
-rw-r--r--kernel/user_namespace.c1
-rw-r--r--kernel/watchdog.c15
-rw-r--r--lib/Kconfig7
-rw-r--r--lib/Kconfig.debug3
-rw-r--r--lib/Makefile3
-rw-r--r--lib/bitmap.c43
-rw-r--r--lib/cmdline.c44
-rw-r--r--lib/crc64_ecma.c341
-rw-r--r--lib/genalloc.c110
-rw-r--r--lib/iommu-common.c2
-rw-r--r--lib/kstrtox.c254
-rw-r--r--lib/kstrtox.h1
-rw-r--r--lib/parse-integer.c222
-rw-r--r--lib/parser.c33
-rw-r--r--lib/show_mem.c4
-rw-r--r--lib/swiotlb.c2
-rw-r--r--lib/test-kstrtox.c6
-rw-r--r--lib/test-parse-integer.c563
-rw-r--r--lib/vsprintf.c101
-rw-r--r--mm/Kconfig12
-rw-r--r--mm/Makefile1
-rw-r--r--mm/compaction.c32
-rw-r--r--mm/debug.c5
-rw-r--r--mm/dmapool.c5
-rw-r--r--mm/filemap.c49
-rw-r--r--mm/gup.c236
-rw-r--r--mm/huge_memory.c231
-rw-r--r--mm/hugetlb.c450
-rw-r--r--mm/hwpoison-inject.c5
-rw-r--r--mm/internal.h20
-rw-r--r--mm/kmemleak.c21
-rw-r--r--mm/ksm.c4
-rw-r--r--mm/madvise.c189
-rw-r--r--mm/memblock.c14
-rw-r--r--mm/memcontrol.c497
-rw-r--r--mm/memory-failure.c50
-rw-r--r--mm/memory.c72
-rw-r--r--mm/memory_hotplug.c11
-rw-r--r--mm/mempolicy.c7
-rw-r--r--mm/mempool.c3
-rw-r--r--mm/memtest.c23
-rw-r--r--mm/migrate.c34
-rw-r--r--mm/mlock.c162
-rw-r--r--mm/mmap.c87
-rw-r--r--mm/mmu_notifier.c17
-rw-r--r--mm/mprotect.c3
-rw-r--r--mm/mremap.c56
-rw-r--r--mm/msync.c2
-rw-r--r--mm/oom_kill.c142
-rw-r--r--mm/page_alloc.c119
-rw-r--r--mm/page_ext.c3
-rw-r--r--mm/page_isolation.c37
-rw-r--r--mm/rmap.c183
-rw-r--r--mm/shmem.c24
-rw-r--r--mm/slab.c13
-rw-r--r--mm/slab.h9
-rw-r--r--mm/slab_common.c31
-rw-r--r--mm/slob.c13
-rw-r--r--mm/slub.c204
-rw-r--r--mm/swap.c49
-rw-r--r--mm/swap_state.c41
-rw-r--r--mm/swapfile.c42
-rw-r--r--mm/userfaultfd.c308
-rw-r--r--mm/util.c40
-rw-r--r--mm/vmscan.c211
-rw-r--r--mm/vmstat.c1
-rw-r--r--mm/zbud.c10
-rw-r--r--mm/zpool.c4
-rw-r--r--mm/zsmalloc.c232
-rw-r--r--mm/zswap.c75
-rw-r--r--net/sunrpc/sched.c2
-rwxr-xr-xscripts/Lindent3
-rwxr-xr-xscripts/checkpatch.pl115
-rwxr-xr-xscripts/decode_stacktrace.sh5
-rwxr-xr-xscripts/kernel-doc38
-rw-r--r--scripts/spelling.txt4
-rw-r--r--security/commoncap.c103
-rw-r--r--security/keys/process_keys.c1
-rw-r--r--sound/core/oss/mixer_oss.c2
-rw-r--r--sound/core/oss/pcm_oss.c4
-rw-r--r--sound/core/pcm.c2
-rw-r--r--sound/core/pcm_memory.c3
-rw-r--r--sound/pci/ac97/ac97_codec.c8
-rw-r--r--sound/soc/soc-core.c9
-rw-r--r--tools/testing/selftests/vm/Makefile6
-rw-r--r--tools/testing/selftests/vm/lock-on-fault.c344
-rw-r--r--tools/testing/selftests/vm/mlock2-tests.c617
-rw-r--r--tools/testing/selftests/vm/on-fault-limit.c47
-rwxr-xr-xtools/testing/selftests/vm/run_vmtests44
-rw-r--r--tools/testing/selftests/vm/userfaultfd.c636
-rw-r--r--tools/vm/page-types.c35
-rw-r--r--virt/kvm/kvm_main.c31
362 files changed, 15276 insertions, 5948 deletions
diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index c4de576093af..62435bb25266 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -144,7 +144,8 @@ mem_used_max RW the maximum amount memory zram have consumed to
store compressed data
mem_limit RW the maximum amount of memory ZRAM can use to store
the compressed data
-num_migrated RO the number of objects migrated migrated by compaction
+pages_compacted RO the number of pages freed during compaction
+ (available only via zram<id>/mm_stat node)
compact WO trigger memory compaction
WARNING
diff --git a/Documentation/devicetree/bindings/w1/omap-hdq.txt b/Documentation/devicetree/bindings/w1/omap-hdq.txt
index fef794741bd1..913c5f91a0f9 100644
--- a/Documentation/devicetree/bindings/w1/omap-hdq.txt
+++ b/Documentation/devicetree/bindings/w1/omap-hdq.txt
@@ -1,11 +1,15 @@
* OMAP HDQ One wire bus master controller
Required properties:
-- compatible : should be "ti,omap3-1w"
+- compatible : should be "ti,omap3-1w" or "ti,am4372-hdq"
- reg : Address and length of the register set for the device
- interrupts : interrupt line.
- ti,hwmods : "hdq1w"
+Optional properties:
+- ti,mode: should be "hdq": HDQ mode "1w": one-wire mode.
+ If not specified HDQ mode is implied.
+
Example:
- From omap3.dtsi
@@ -14,4 +18,5 @@ Example:
reg = <0x480b2000 0x1000>;
interrupts = <58>;
ti,hwmods = "hdq1w";
+ ti,mode = "hdq";
};
diff --git a/Documentation/features/vm/TLB/arch-support.txt b/Documentation/features/vm/TLB/arch-support.txt
new file mode 100644
index 000000000000..261b92e2fb1a
--- /dev/null
+++ b/Documentation/features/vm/TLB/arch-support.txt
@@ -0,0 +1,40 @@
+#
+# Feature name: batch-unmap-tlb-flush
+# Kconfig: ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+# description: arch supports deferral of TLB flush until multiple pages are unmapped
+#
+ -----------------------
+ | arch |status|
+ -----------------------
+ | alpha: | TODO |
+ | arc: | TODO |
+ | arm: | TODO |
+ | arm64: | TODO |
+ | avr32: | .. |
+ | blackfin: | TODO |
+ | c6x: | .. |
+ | cris: | .. |
+ | frv: | .. |
+ | h8300: | .. |
+ | hexagon: | TODO |
+ | ia64: | TODO |
+ | m32r: | TODO |
+ | m68k: | .. |
+ | metag: | TODO |
+ | microblaze: | .. |
+ | mips: | TODO |
+ | mn10300: | TODO |
+ | nios2: | .. |
+ | openrisc: | .. |
+ | parisc: | TODO |
+ | powerpc: | TODO |
+ | s390: | TODO |
+ | score: | .. |
+ | sh: | TODO |
+ | sparc: | TODO |
+ | tile: | TODO |
+ | um: | .. |
+ | unicore32: | .. |
+ | x86: | ok |
+ | xtensa: | TODO |
+ -----------------------
diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt
index 7af2851d667c..7bde64014a89 100644
--- a/Documentation/filesystems/dax.txt
+++ b/Documentation/filesystems/dax.txt
@@ -60,9 +60,10 @@ Filesystem support consists of
- implementing the direct_IO address space operation, and calling
dax_do_io() instead of blockdev_direct_IO() if S_DAX is set
- implementing an mmap file operation for DAX files which sets the
- VM_MIXEDMAP flag on the VMA, and setting the vm_ops to include handlers
- for fault and page_mkwrite (which should probably call dax_fault() and
- dax_mkwrite(), passing the appropriate get_block() callback)
+ VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to
+ include handlers for fault, pmd_fault and page_mkwrite (which should
+ probably call dax_fault(), dax_pmd_fault() and dax_mkwrite(), passing the
+ appropriate get_block() callback)
- calling dax_truncate_page() instead of block_truncate_page() for DAX files
- calling dax_zero_page_range() instead of zero_user() for DAX files
- ensuring that there is sufficient locking between reads, writes,
diff --git a/Documentation/filesystems/ocfs2-online-filecheck.txt b/Documentation/filesystems/ocfs2-online-filecheck.txt
new file mode 100644
index 000000000000..d3192372e117
--- /dev/null
+++ b/Documentation/filesystems/ocfs2-online-filecheck.txt
@@ -0,0 +1,95 @@
+ OCFS2 online file check
+ -----------------------
+
+This document will describe OCFS2 online file check feature.
+
+Introduction
+============
+OCFS2 is often used in high-availability systems. However, OCFS2 usually
+converts the filesystem to read-only on errors. This may not be necessary, since
+turning the filesystem read-only would affect other running processes as well,
+decreasing availability. Then, a mount option (errors=continue) was introduced,
+which would return the EIO to the calling process and terminate further
+processing so that the filesystem is not corrupted further. The filesystem is
+not converted to read-only, and the problematic file's inode number is reported
+in the kernel log. The user can try to check/fix this file via online filecheck
+feature.
+
+Scope
+=====
+This effort is to check/fix small issues which may hinder day-to-day operations
+of a cluster filesystem by turning the filesystem read-only. The scope of
+checking/fixing is at the file level, initially for regular files and eventually
+to all files (including system files) of the filesystem.
+
+In case the directory-to-file links are incorrect, the directory inode is
+reported as erroneous.
+
+This feature is not suited for extravagant checks which involve dependency of
+other components of the filesystem, such as but not limited to, checking if the
+bits for file blocks in the allocation have been set. In case of such an error,
+the offline fsck should/would be recommended.
+
+Finally, such an operation/feature should not be automated lest the filesystem
+may end up with more damage than before the repair attempt. So, this has to
+be performed using user interaction and consent.
+
+User interface
+==============
+When there are errors in the OCFS2 filesystem, they are usually accompanied
+by the inode number which caused the error. This inode number would be the
+input to check/fix the file.
+
+There is a sysfs file for each OCFS2 file system mounting:
+
+ /sys/fs/ocfs2/<devname>/filecheck
+
+Here, <devname> indicates the name of OCFS2 volume device which has been already
+mounted. The file above would accept inode numbers. This could be used to
+communicate with kernel space, tell which file(inode number) will be checked or
+fixed. Currently, three operations are supported, which include checking
+inode, fixing inode and setting the size of result record history.
+
+1. If you want to know what error exactly happened to <inode> before fixing, do
+
+ # echo "CHECK <inode>" > /sys/fs/ocfs2/<devname>/filecheck
+ # cat /sys/fs/ocfs2/<devname>/filecheck
+
+The output is like this:
+ INO TYPE DONE ERROR
+39502 0 1 GENERATION
+
+<INO> lists the inode numbers.
+<TYPE> is what kind of operation you've done, 0 for inode check, 1 for inode fix.
+<DONE> indicates whether the operation has been finished.
+<ERROR> says what kind of error was found. For the detailed error numbers,
+please refer to the file linux/fs/ocfs2/filecheck.h.
+
+2. If you determine to fix this inode, do
+
+ # echo "FIX <inode>" > /sys/fs/ocfs2/<devname>/filecheck
+ # cat /sys/fs/ocfs2/<devname>/filecheck
+
+The output is like this:
+ INO TYPE DONE ERROR
+39502 1 1 SUCCESS
+
+This time, the <ERROR> column indicates whether this fix is successful or not.
+
+3. The record cache is used to store the history of check/fix results. Its
+default size is 10, and it can be adjusted within the range of 10 ~ 100. You can
+adjust the size like this:
+
+ # echo "SET <size>" > /sys/fs/ocfs2/<devname>/filecheck
+
+Fixing stuff
+============
+On receiving the inode, the filesystem would read the inode and the
+file metadata. In case of errors, the filesystem would fix the errors
+and report the problems it fixed in the kernel log. As a precautionary measure,
+the inode must first be checked for errors before performing a final fix.
+
+The inode and the result history will be maintained temporarily in a
+small linked list buffer which would contain the last (N) inodes
+fixed/checked, the detailed errors which were fixed/checked are printed in the
+kernel log.
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 6f7fafde0884..d411ca63c8b6 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -424,6 +424,7 @@ Private_Dirty: 0 kB
Referenced: 892 kB
Anonymous: 0 kB
Swap: 0 kB
+SwapPss: 0 kB
KernelPageSize: 4 kB
MMUPageSize: 4 kB
Locked: 374 kB
@@ -433,16 +434,23 @@ the first of these lines shows the same information as is displayed for the
mapping in /proc/PID/maps. The remaining lines show the size of the mapping
(size), the amount of the mapping that is currently resident in RAM (RSS), the
process' proportional share of this mapping (PSS), the number of clean and
-dirty private pages in the mapping. Note that even a page which is part of a
-MAP_SHARED mapping, but has only a single pte mapped, i.e. is currently used
-by only one process, is accounted as private and not as shared. "Referenced"
-indicates the amount of memory currently marked as referenced or accessed.
+dirty private pages in the mapping.
+
+The "proportional set size" (PSS) of a process is the count of pages it has
+in memory, where each page is divided by the number of processes sharing it.
+So if a process has 1000 pages all to itself, and 1000 shared with one other
+process, its PSS will be 1500.
+Note that even a page which is part of a MAP_SHARED mapping, but has only
+a single pte mapped, i.e. is currently used by only one process, is accounted
+as private and not as shared.
+"Referenced" indicates the amount of memory currently marked as referenced or
+accessed.
"Anonymous" shows the amount of memory that does not belong to any file. Even
a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE
and a page is modified, the file page is replaced by a private anonymous copy.
"Swap" shows how much would-be-anonymous memory is also used, but out on
swap.
-
+"SwapPss" shows proportional swap share of this mapping.
"VmFlags" field deserves a separate description. This member represents the kernel
flags associated with the particular virtual memory area in two letter encoded
manner. The codes are the following:
diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt
index ce1126aceed8..223c32171dcc 100644
--- a/Documentation/filesystems/vfat.txt
+++ b/Documentation/filesystems/vfat.txt
@@ -180,6 +180,16 @@ dos1xfloppy -- If set, use a fallback default BIOS Parameter Block
<bool>: 0,1,yes,no,true,false
+LIMITATION
+---------------------------------------------------------------------
+* The fallocated region of file is discarded at umount/evict time
+ when using fallocate with FALLOC_FL_KEEP_SIZE.
+ So, the user should assume that fallocated region can be discarded at
+ last close if there is memory pressure resulting in eviction of
+ the inode from the memory. As a result, for any dependency on
+ the fallocated region, user should make sure to recheck fallocate
+ after reopening the file.
+
TODO
----------------------------------------------------------------------
* Need to get rid of the raw scanning stuff. Instead, always use
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index 1e166ad3e1d7..bb38ebf1fa16 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -302,6 +302,7 @@ Code Seq#(hex) Include File Comments
0xA3 80-8F Port ACL in development:
<mailto:tlewis@mindspring.com>
0xA3 90-9F linux/dtlk.h
+0xAA 00-3F linux/uapi/linux/userfaultfd.h
0xAB 00-1F linux/nbd.h
0xAC 00-1F linux/raw.h
0xAD 00 Netfilter device in development:
diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt
index 2216eb187c21..2ec6d84f391c 100644
--- a/Documentation/printk-formats.txt
+++ b/Documentation/printk-formats.txt
@@ -244,6 +244,14 @@ dentry names:
Passed by reference.
+task_struct comm name:
+
+ %pT
+
+ For printing task_struct->comm.
+
+ Passed by reference (NULL for "current").
+
struct va_format:
%pV
diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
index 267f39386f99..13f5619b2203 100644
--- a/Documentation/sysrq.txt
+++ b/Documentation/sysrq.txt
@@ -75,7 +75,8 @@ On all - write a character to /proc/sysrq-trigger. e.g.:
'e' - Send a SIGTERM to all processes, except for init.
-'f' - Will call oom_kill to kill a memory hog process.
+'f' - Will call the oom killer to kill a memory hog process, but do not
+ panic if nothing can be killed.
'g' - Used by kgdb (kernel debugger)
diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt
index 6bfbc172cdb9..ae6cf352dec9 100644
--- a/Documentation/vm/pagemap.txt
+++ b/Documentation/vm/pagemap.txt
@@ -5,7 +5,7 @@ pagemap is a new (as of 2.6.25) set of interfaces in the kernel that allow
userspace programs to examine the page tables and related information by
reading files in /proc.
-There are three components to pagemap:
+There are five components to pagemap:
* /proc/pid/pagemap. This file lets a userspace process find out which
physical frame each virtual page is mapped to. It contains one 64-bit
@@ -16,11 +16,17 @@ There are three components to pagemap:
* Bits 0-4 swap type if swapped
* Bits 5-54 swap offset if swapped
* Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt)
- * Bits 56-60 zero
- * Bit 61 page is file-page or shared-anon
+ * Bit 56 page exclusively mapped (since 4.2)
+ * Bits 57-60 zero
+ * Bit 61 page is file-page or shared-anon (since 3.5)
* Bit 62 page swapped
* Bit 63 page present
+ Since Linux 4.0 only users with the CAP_SYS_ADMIN capability can get PFNs.
+ In 4.0 and 4.1 opens by unprivileged fail with -EPERM. Starting from
+ 4.2 the PFN field is zeroed if the user does not have CAP_SYS_ADMIN.
+ Reason: information about PFNs helps in exploiting Rowhammer vulnerability.
+
If the page is not present but in swap, then the PFN contains an
encoding of the swap file number and the page's offset into the
swap. Unmapped pages return a null PFN. This allows determining
@@ -64,6 +70,21 @@ There are three components to pagemap:
22. THP
23. BALLOON
24. ZERO_PAGE
+ 25. IDLE
+
+ * /proc/kpagecgroup. This file contains a 64-bit inode number of the
+ memory cgroup each page is charged to, indexed by PFN. Only available when
+ CONFIG_MEMCG is set.
+
+ * /proc/kpageidle. This file implements a bitmap where each bit corresponds
+ to a page, indexed by PFN. When the bit is set, the corresponding page is
+ idle. A page is considered idle if it has not been accessed since it was
+ marked idle. To mark a page idle one should set the bit corresponding to the
+ page by writing to the file. A value written to the file is OR-ed with the
+ current bitmap value. Only user memory pages can be marked idle, for other
+ page types input is silently ignored. Writing to this file beyond max PFN
+ results in the ENXIO error. Only available when CONFIG_IDLE_PAGE_TRACKING is
+ set.
Short descriptions to the page flags:
@@ -110,6 +131,11 @@ Short descriptions to the page flags:
24. ZERO_PAGE
zero page for pfn_zero or huge_zero page
+25. IDLE
+ page has not been accessed since it was marked idle (see /proc/kpageidle)
+ Note that this flag may be stale in case the page was accessed via a PTE.
+ To make sure the flag is up-to-date one has to read /proc/kpageidle first.
+
[IO related page flags]
1. ERROR IO error occurred
3. UPTODATE page has up-to-date data
@@ -159,3 +185,8 @@ Other notes:
Reading from any of the files will return -EINVAL if you are not starting
the read on an 8-byte boundary (e.g., if you sought an odd number of bytes
into the file), or if the size of the read is not a multiple of 8 bytes.
+
+Before Linux 3.11 pagemap bits 55-60 were used for "page-shift" (which is
+always 12 on most architectures). Since Linux 3.11 their meaning changes
+after first clear of soft-dirty bits. Since Linux 4.2 they are used for
+flags unconditionally.
diff --git a/Documentation/vm/userfaultfd.txt b/Documentation/vm/userfaultfd.txt
new file mode 100644
index 000000000000..70a3c94d1941
--- /dev/null
+++ b/Documentation/vm/userfaultfd.txt
@@ -0,0 +1,144 @@
+= Userfaultfd =
+
+== Objective ==
+
+Userfaults allow the implementation of on-demand paging from userland
+and more generally they allow userland to take control of various
+memory page faults, something otherwise only the kernel code could do.
+
+For example, userfaults allow a proper and more optimal implementation
+of the PROT_NONE+SIGSEGV trick.
+
+== Design ==
+
+Userfaults are delivered and resolved through the userfaultfd syscall.
+
+The userfaultfd (aside from registering and unregistering virtual
+memory ranges) provides two primary functionalities:
+
+1) read/POLLIN protocol to notify a userland thread of the faults
+ happening
+
+2) various UFFDIO_* ioctls that can manage the virtual memory regions
+ registered in the userfaultfd that allows userland to efficiently
+ resolve the userfaults it receives via 1) or to manage the virtual
+ memory in the background
+
+The real advantage of userfaults if compared to regular virtual memory
+management of mremap/mprotect is that the userfaults in all their
+operations never involve heavyweight structures like vmas (in fact the
+userfaultfd runtime load never takes the mmap_sem for writing).
+
+Vmas are not suitable for page- (or hugepage) granular fault tracking
+when dealing with virtual address spaces that could span
+Terabytes. Too many vmas would be needed for that.
+
+The userfaultfd, once opened by invoking the syscall, can also be
+passed using unix domain sockets to a manager process, so the same
+manager process could handle the userfaults of a multitude of
+different processes without them being aware of what is going on
+(well of course unless they later try to use the userfaultfd
+themselves on the same region the manager is already tracking, which
+is a corner case that would currently return -EBUSY).
+
+== API ==
+
+When first opened the userfaultfd must be enabled by invoking the
+UFFDIO_API ioctl specifying a uffdio_api.api value set to UFFD_API (or
+a later API version) which will specify the read/POLLIN protocol
+userland intends to speak on the UFFD and the uffdio_api.features
+userland requires. The UFFDIO_API ioctl if successful (i.e. if the
+requested uffdio_api.api is spoken also by the running kernel and the
+requested features are going to be enabled) will return into
+uffdio_api.features and uffdio_api.ioctls two 64bit bitmasks of
+respectively all the available features of the read(2) protocol and
+the generic ioctls available.
+
+Once the userfaultfd has been enabled the UFFDIO_REGISTER ioctl should
+be invoked (if present in the returned uffdio_api.ioctls bitmask) to
+register a memory range in the userfaultfd by setting the
+uffdio_register structure accordingly. The uffdio_register.mode
+bitmask will specify to the kernel which kind of faults to track for
+the range (UFFDIO_REGISTER_MODE_MISSING would track missing
+pages). The UFFDIO_REGISTER ioctl will return the
+uffdio_register.ioctls bitmask of ioctls that are suitable to resolve
+userfaults on the range registered. Not all ioctls will necessarily be
+supported for all memory types depending on the underlying virtual
+memory backend (anonymous memory vs tmpfs vs real filebacked
+mappings).
+
+Userland can use the uffdio_register.ioctls to manage the virtual
+address space in the background (to add or potentially also remove
+memory from the userfaultfd registered range). This means a userfault
+could be triggering just before userland maps in the background the
+user-faulted page.
+
+The primary ioctl to resolve userfaults is UFFDIO_COPY. That
+atomically copies a page into the userfault registered range and wakes
+up the blocked userfaults (unless uffdio_copy.mode &
+UFFDIO_COPY_MODE_DONTWAKE is set). Other ioctls work similarly to
+UFFDIO_COPY. They're atomic in the sense that nothing can see a
+half-copied page, since it'll keep userfaulting until the copy has
+finished.
+
+== QEMU/KVM ==
+
+QEMU/KVM is using the userfaultfd syscall to implement postcopy live
+migration. Postcopy live migration is one form of memory
+externalization consisting of a virtual machine running with part or
+all of its memory residing on a different node in the cloud. The
+userfaultfd abstraction is generic enough that not a single line of
+KVM kernel code had to be modified in order to add postcopy live
+migration to QEMU.
+
+Guest async page faults, FOLL_NOWAIT and all other GUP features work
+just fine in combination with userfaults. Userfaults trigger async
+page faults in the guest scheduler so those guest processes that
+aren't waiting for userfaults (i.e. network bound) can keep running in
+the guest vcpus.
+
+It is generally beneficial to run one pass of precopy live migration
+just before starting postcopy live migration, in order to avoid
+generating userfaults for readonly guest regions.
+
+The implementation of postcopy live migration currently uses one
+single bidirectional socket but in the future two different sockets
+will be used (to reduce the latency of the userfaults to the minimum
+possible without having to decrease /proc/sys/net/ipv4/tcp_wmem).
+
+The QEMU in the source node writes all pages that it knows are missing
+in the destination node, into the socket, and the migration thread of
+the QEMU running in the destination node runs UFFDIO_COPY|ZEROPAGE
+ioctls on the userfaultfd in order to map the received pages into the
+guest (UFFDIO_ZEROPAGE is used if the source page was a zero page).
+
+A different postcopy thread in the destination node listens with
+poll() to the userfaultfd in parallel. When a POLLIN event is
+generated after a userfault triggers, the postcopy thread read()s from
+the userfaultfd and receives the fault address (or -EAGAIN in case the
+userfault was already resolved and woken by a UFFDIO_COPY|ZEROPAGE run
+by the parallel QEMU migration thread).
+
+After the QEMU postcopy thread (running in the destination node) gets
+the userfault address it writes the information about the missing page
+into the socket. The QEMU source node receives the information and
+roughly "seeks" to that page address and continues sending all
+remaining missing pages from that new page offset. Soon after that
+(just the time to flush the tcp_wmem queue through the network) the
+migration thread in the QEMU running in the destination node will
+receive the page that triggered the userfault and it'll map it as
+usual with the UFFDIO_COPY|ZEROPAGE (without actually knowing if it
+was spontaneously sent by the source or if it was an urgent page
+requested through a userfault).
+
+By the time the userfaults start, the QEMU in the destination node
+doesn't need to keep any per-page state bitmap relative to the live
+migration around and a single per-page bitmap has to be maintained in
+the QEMU running in the source node to know which pages are still
+missing in the destination node. The bitmap in the source node is
+checked to find which missing pages to send in round robin and we seek
+over it when receiving incoming userfaults. After sending each page of
+course the bitmap is updated accordingly. It's also useful to avoid
+sending the same page twice (in case the userfault is read by the
+postcopy thread just before UFFDIO_COPY|ZEROPAGE runs in the migration
+thread).
diff --git a/Documentation/w1/masters/omap-hdq b/Documentation/w1/masters/omap-hdq
index 884dc284b215..234522709a5f 100644
--- a/Documentation/w1/masters/omap-hdq
+++ b/Documentation/w1/masters/omap-hdq
@@ -44,3 +44,9 @@ e.g:
insmod omap_hdq.ko W1_ID=2
insmod w1_bq27000.ko F_ID=2
+The driver also supports 1-wire mode. In this mode, there is no need to
+pass the slave ID as a parameter. The driver will auto-detect slaves
+connected to the bus using the SEARCH_ROM procedure. 1-wire mode can be
+selected by setting the "ti,mode" property to "1w" in DT (see
+Documentation/devicetree/bindings/w1/omap-hdq.txt for more details).
+By default the driver is in HDQ mode.
diff --git a/arch/Kconfig b/arch/Kconfig
index 8a8ea7110de8..d8998c8d686c 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -2,6 +2,9 @@
# General architecture dependent options
#
+config KEXEC_CORE
+ bool
+
config OPROFILE
tristate "OProfile system profiling"
depends on PROFILING
diff --git a/arch/alpha/include/asm/unistd.h b/arch/alpha/include/asm/unistd.h
index a56e608db2f9..1d093927587f 100644
--- a/arch/alpha/include/asm/unistd.h
+++ b/arch/alpha/include/asm/unistd.h
@@ -3,7 +3,7 @@
#include <uapi/asm/unistd.h>
-#define NR_SYSCALLS 514
+#define NR_SYSCALLS 517
#define __ARCH_WANT_OLD_READDIR
#define __ARCH_WANT_STAT64
diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
index 0086b472bc2b..deec059c9617 100644
--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -30,6 +30,7 @@
#define MAP_NONBLOCK 0x40000 /* do not block on IO */
#define MAP_STACK 0x80000 /* give out an address that is best suited for process/thread stacks */
#define MAP_HUGETLB 0x100000 /* create a huge page mapping */
+#define MAP_LOCKONFAULT 0x200000 /* Lock pages after they are faulted in, do not prefault */
#define MS_ASYNC 1 /* sync memory asynchronously */
#define MS_SYNC 2 /* synchronous memory sync */
@@ -37,6 +38,10 @@
#define MCL_CURRENT 8192 /* lock all currently mapped pages */
#define MCL_FUTURE 16384 /* lock all additions to address space */
+#define MCL_ONFAULT 32768 /* lock all pages that are faulted in */
+
+#define MLOCK_LOCKED 0x01 /* Lock and populate the specified range */
+#define MLOCK_ONFAULT 0x02 /* Lock pages in range after they are faulted in, do not prefault */
#define MADV_NORMAL 0 /* no further special treatment */
#define MADV_RANDOM 1 /* expect random page references */
@@ -44,6 +49,7 @@
#define MADV_WILLNEED 3 /* will need these pages */
#define MADV_SPACEAVAIL 5 /* ensure resources are available */
#define MADV_DONTNEED 6 /* don't need these pages */
+#define MADV_FREE 7 /* free pages only if memory pressure */
/* common/generic parameters */
#define MADV_REMOVE 9 /* remove these pages & resources */
diff --git a/arch/alpha/include/uapi/asm/unistd.h b/arch/alpha/include/uapi/asm/unistd.h
index aa33bf5aacb6..29141d6a6567 100644
--- a/arch/alpha/include/uapi/asm/unistd.h
+++ b/arch/alpha/include/uapi/asm/unistd.h
@@ -475,5 +475,8 @@
#define __NR_getrandom 511
#define __NR_memfd_create 512
#define __NR_execveat 513
+#define __NR_mlock2 514
+#define __NR_munlock2 515
+#define __NR_munlockall2 516
#endif /* _UAPI_ALPHA_UNISTD_H */
diff --git a/arch/alpha/kernel/systbls.S b/arch/alpha/kernel/systbls.S
index 9b62e3fd4f03..04d1cceafdab 100644
--- a/arch/alpha/kernel/systbls.S
+++ b/arch/alpha/kernel/systbls.S
@@ -532,6 +532,9 @@ sys_call_table:
.quad sys_getrandom
.quad sys_memfd_create
.quad sys_execveat
+ .quad sys_mlock2
+ .quad sys_munlock2 /* 515 */
+ .quad sys_munlockall2
.size sys_call_table, . - sys_call_table
.type sys_call_table, @object
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index a3df02918643..7d343120f592 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1999,6 +1999,7 @@ config KEXEC
bool "Kexec system call (EXPERIMENTAL)"
depends on (!SMP || PM_SLEEP_SMP)
depends on !CPU_V7M
+ select KEXEC_CORE
help
kexec is a system call that implements the ability to shutdown your
current kernel, and to start another kernel. It is like a reboot
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index a745a2a53853..6d6012a320b2 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -249,6 +249,7 @@ PMD_BIT_FUNC(mkold, &= ~PMD_SECT_AF);
PMD_BIT_FUNC(mksplitting, |= L_PMD_SECT_SPLITTING);
PMD_BIT_FUNC(mkwrite, &= ~L_PMD_SECT_RDONLY);
PMD_BIT_FUNC(mkdirty, |= L_PMD_SECT_DIRTY);
+PMD_BIT_FUNC(mkclean, &= ~L_PMD_SECT_DIRTY);
PMD_BIT_FUNC(mkyoung, |= PMD_SECT_AF);
#define pmd_mkhuge(pmd) (__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT))
diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h
index 32640c431a08..7cba573c2cc9 100644
--- a/arch/arm/include/asm/unistd.h
+++ b/arch/arm/include/asm/unistd.h
@@ -19,7 +19,7 @@
* This may need to be greater than __NR_last_syscall+1 in order to
* account for the padding in the syscall table
*/
-#define __NR_syscalls (388)
+#define __NR_syscalls (392)
/*
* *NOTE*: This is a ghost syscall private to the kernel. Only the
diff --git a/arch/arm/include/uapi/asm/unistd.h b/arch/arm/include/uapi/asm/unistd.h
index 0c3f5a0dafd3..46eaf405e6b4 100644
--- a/arch/arm/include/uapi/asm/unistd.h
+++ b/arch/arm/include/uapi/asm/unistd.h
@@ -414,6 +414,9 @@
#define __NR_memfd_create (__NR_SYSCALL_BASE+385)
#define __NR_bpf (__NR_SYSCALL_BASE+386)
#define __NR_execveat (__NR_SYSCALL_BASE+387)
+#define __NR_mlock2 (__NR_SYSCALL_BASE+388)
+#define __NR_munlock2 (__NR_SYSCALL_BASE+389)
+#define __NR_munlockall2 (__NR_SYSCALL_BASE+390)
/*
* The following SWIs are ARM private.
diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S
index 05745eb838c5..88808221383b 100644
--- a/arch/arm/kernel/calls.S
+++ b/arch/arm/kernel/calls.S
@@ -397,6 +397,9 @@
/* 385 */ CALL(sys_memfd_create)
CALL(sys_bpf)
CALL(sys_execveat)
+ CALL(sys_mlock2)
+ CALL(sys_munlock2)
+/* 390 */ CALL(sys_munlockall2)
#ifndef syscalls_counted
.equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls
#define syscalls_counted
diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c
index e24df77abd79..e65e9dbeadff 100644
--- a/arch/arm/mach-at91/pm.c
+++ b/arch/arm/mach-at91/pm.c
@@ -369,7 +369,7 @@ static void __init at91_pm_sram_init(void)
return;
}
- sram_pool = gen_pool_get(&pdev->dev);
+ sram_pool = gen_pool_get(&pdev->dev, NULL);
if (!sram_pool) {
pr_warn("%s: sram pool unavailable!\n", __func__);
return;
diff --git a/arch/arm/mach-imx/pm-imx5.c b/arch/arm/mach-imx/pm-imx5.c
index 1885676c23c0..532d4b08276d 100644
--- a/arch/arm/mach-imx/pm-imx5.c
+++ b/arch/arm/mach-imx/pm-imx5.c
@@ -297,7 +297,7 @@ static int __init imx_suspend_alloc_ocram(
goto put_node;
}
- ocram_pool = gen_pool_get(&pdev->dev);
+ ocram_pool = gen_pool_get(&pdev->dev, NULL);
if (!ocram_pool) {
pr_warn("%s: ocram pool unavailable!\n", __func__);
ret = -ENODEV;
diff --git a/arch/arm/mach-imx/pm-imx6.c b/arch/arm/mach-imx/pm-imx6.c
index 93ecf559d06d..8ff8fc0b261c 100644
--- a/arch/arm/mach-imx/pm-imx6.c
+++ b/arch/arm/mach-imx/pm-imx6.c
@@ -451,7 +451,7 @@ static int __init imx6q_suspend_init(const struct imx6_pm_socdata *socdata)
goto put_node;
}
- ocram_pool = gen_pool_get(&pdev->dev);
+ ocram_pool = gen_pool_get(&pdev->dev, NULL);
if (!ocram_pool) {
pr_warn("%s: ocram pool unavailable!\n", __func__);
ret = -ENODEV;
diff --git a/arch/arm/mach-socfpga/pm.c b/arch/arm/mach-socfpga/pm.c
index 6a4199f2bffb..c378ab0c2431 100644
--- a/arch/arm/mach-socfpga/pm.c
+++ b/arch/arm/mach-socfpga/pm.c
@@ -56,7 +56,7 @@ static int socfpga_setup_ocram_self_refresh(void)
goto put_node;
}
- ocram_pool = gen_pool_get(&pdev->dev);
+ ocram_pool = gen_pool_get(&pdev->dev, NULL);
if (!ocram_pool) {
pr_warn("%s: ocram pool unavailable!\n", __func__);
ret = -ENODEV;
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 56283f8a675c..bd5db28324ba 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -285,10 +285,12 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
#define pmd_young(pmd) pte_young(pmd_pte(pmd))
+#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd)))
#define pmd_mksplitting(pmd) pte_pmd(pte_mkspecial(pmd_pte(pmd)))
#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd)))
#define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd)))
+#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd)))
#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd)))
#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd)))
#define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_TYPE_MASK))
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 3bc498c250dc..aa537a4dc9a8 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -44,7 +44,7 @@
#define __ARM_NR_compat_cacheflush (__ARM_NR_COMPAT_BASE+2)
#define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE+5)
-#define __NR_compat_syscalls 388
+#define __NR_compat_syscalls 391
#endif
#define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index cef934a90f17..318072aa065a 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -797,3 +797,9 @@ __SYSCALL(__NR_memfd_create, sys_memfd_create)
__SYSCALL(__NR_bpf, sys_bpf)
#define __NR_execveat 387
__SYSCALL(__NR_execveat, compat_sys_execveat)
+#define __NR_mlock2 388
+__SYSCALL(__NR_mlock2, sys_mlock2)
+#define __NR_munlock2 389
+__SYSCALL(__NR_munlock2, sys_munlock2)
+#define __NR_munlockall2 390
+__SYSCALL(__NR_munlockall2, sys_munlockall2)
diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c
index 1670f15ef69e..948f0ad2de23 100644
--- a/arch/arm64/kernel/signal32.c
+++ b/arch/arm64/kernel/signal32.c
@@ -168,7 +168,8 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
* Other callers might not initialize the si_lsb field,
* so check explicitly for the right codes here.
*/
- if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)
+ if (from->si_signo == SIGBUS &&
+ (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO))
err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
#endif
break;
@@ -201,8 +202,6 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
{
- memset(to, 0, sizeof *to);
-
if (copy_from_user(to, from, __ARCH_SI_PREAMBLE_SIZE) ||
copy_from_user(to->_sifields._pad,
from->_sifields._pad, SI_PAD_SIZE))
diff --git a/arch/avr32/include/uapi/asm/unistd.h b/arch/avr32/include/uapi/asm/unistd.h
index bbe2fba565cd..e6a1681cf0d6 100644
--- a/arch/avr32/include/uapi/asm/unistd.h
+++ b/arch/avr32/include/uapi/asm/unistd.h
@@ -333,5 +333,8 @@
#define __NR_memfd_create 318
#define __NR_bpf 319
#define __NR_execveat 320
+#define __NR_mlock2 321
+#define __NR_munlock2 322
+#define __NR_munlockall2 323
#endif /* _UAPI__ASM_AVR32_UNISTD_H */
diff --git a/arch/avr32/kernel/syscall_table.S b/arch/avr32/kernel/syscall_table.S
index c3b593bfc3b3..83928abd13a5 100644
--- a/arch/avr32/kernel/syscall_table.S
+++ b/arch/avr32/kernel/syscall_table.S
@@ -334,4 +334,7 @@ sys_call_table:
.long sys_memfd_create
.long sys_bpf
.long sys_execveat /* 320 */
+ .long sys_mlock2
+ .long sys_munlock2
+ .long sys_munlockall2
.long sys_ni_syscall /* r8 is saturated at nr_syscalls */
diff --git a/arch/blackfin/include/uapi/asm/unistd.h b/arch/blackfin/include/uapi/asm/unistd.h
index 0cb9078ef482..37c0362fa388 100644
--- a/arch/blackfin/include/uapi/asm/unistd.h
+++ b/arch/blackfin/include/uapi/asm/unistd.h
@@ -433,6 +433,9 @@
#define __IGNORE_munlock
#define __IGNORE_mlockall
#define __IGNORE_munlockall
+#define __IGNORE_mlock2
+#define __IGNORE_munlock2
+#define __IGNORE_munlockall2
#define __IGNORE_mincore
#define __IGNORE_madvise
#define __IGNORE_remap_file_pages
diff --git a/arch/blackfin/mach-common/entry.S b/arch/blackfin/mach-common/entry.S
index 8d9431e22e8c..5d83587d8be6 100644
--- a/arch/blackfin/mach-common/entry.S
+++ b/arch/blackfin/mach-common/entry.S
@@ -1704,6 +1704,9 @@ ENTRY(_sys_call_table)
.long _sys_memfd_create /* 390 */
.long _sys_bpf
.long _sys_execveat
+ .long _sys_mlock2
+ .long _sys_munlock2
+ .long _sys_munlockall2 /* 395 */
.rept NR_syscalls-(.-_sys_call_table)/4
.long _sys_ni_syscall
diff --git a/arch/cris/arch-v10/kernel/entry.S b/arch/cris/arch-v10/kernel/entry.S
index b5622521dad5..2d48fe0c1eb1 100644
--- a/arch/cris/arch-v10/kernel/entry.S
+++ b/arch/cris/arch-v10/kernel/entry.S
@@ -963,6 +963,9 @@ sys_call_table:
.long sys_memfd_create
.long sys_bpf
.long sys_execveat
+ .long sys_mlock2 /* 360 */
+ .long sys_munlock2
+ .long sys_munlockall2
/*
* NOTE!! This doesn't have to be exact - we just have
diff --git a/arch/cris/arch-v32/kernel/entry.S b/arch/cris/arch-v32/kernel/entry.S
index b17a20999f87..64f75340600e 100644
--- a/arch/cris/arch-v32/kernel/entry.S
+++ b/arch/cris/arch-v32/kernel/entry.S
@@ -894,6 +894,9 @@ sys_call_table:
.long sys_memfd_create
.long sys_bpf
.long sys_execveat
+ .long sys_mlock2 /* 360 */
+ .long sys_munlock2
+ .long sys_munlockall2
/*
* NOTE!! This doesn't have to be exact - we just have
diff --git a/arch/frv/kernel/entry.S b/arch/frv/kernel/entry.S
index dfcd263c0517..ee605a03a467 100644
--- a/arch/frv/kernel/entry.S
+++ b/arch/frv/kernel/entry.S
@@ -1515,5 +1515,8 @@ sys_call_table:
.long sys_rt_tgsigqueueinfo /* 335 */
.long sys_perf_event_open
.long sys_setns
+ .long sys_mlock2
+ .long sys_munlock2
+ .long sys_munlockall2 /* 340 */
syscall_table_size = (. - sys_call_table)
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 42a91a7aa2b0..eb0249e37981 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -518,6 +518,7 @@ source "drivers/sn/Kconfig"
config KEXEC
bool "kexec system call"
depends on !IA64_HP_SIM && (!SMP || HOTPLUG_CPU)
+ select KEXEC_CORE
help
kexec is a system call that implements the ability to shutdown your
current kernel, and to start another kernel. It is like a reboot
diff --git a/arch/ia64/include/asm/unistd.h b/arch/ia64/include/asm/unistd.h
index 95c39b95e97e..db73390568c8 100644
--- a/arch/ia64/include/asm/unistd.h
+++ b/arch/ia64/include/asm/unistd.h
@@ -11,7 +11,7 @@
-#define NR_syscalls 319 /* length of syscall table */
+#define NR_syscalls 322 /* length of syscall table */
/*
* The following defines stop scripts/checksyscalls.sh from complaining about
diff --git a/arch/ia64/include/uapi/asm/unistd.h b/arch/ia64/include/uapi/asm/unistd.h
index 461079560c78..5f485cc4db94 100644
--- a/arch/ia64/include/uapi/asm/unistd.h
+++ b/arch/ia64/include/uapi/asm/unistd.h
@@ -332,5 +332,8 @@
#define __NR_memfd_create 1340
#define __NR_bpf 1341
#define __NR_execveat 1342
+#define __NR_mlock2 1343
+#define __NR_munlock2 1344
+#define __NR_munlockall2 1345
#endif /* _UAPI_ASM_IA64_UNISTD_H */
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index ae0de7bf5525..3ef4457bd246 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -1768,5 +1768,8 @@ sys_call_table:
data8 sys_memfd_create // 1340
data8 sys_bpf
data8 sys_execveat
+ data8 sys_mlock2
+ data8 sys_munlock2
+ data8 sys_munlockall2 // 1345
.org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls
diff --git a/arch/m32r/kernel/entry.S b/arch/m32r/kernel/entry.S
index c639bfa32232..4f7f2e2280c0 100644
--- a/arch/m32r/kernel/entry.S
+++ b/arch/m32r/kernel/entry.S
@@ -76,6 +76,9 @@
#define sys_munlock sys_ni_syscall
#define sys_mlockall sys_ni_syscall
#define sys_munlockall sys_ni_syscall
+#define sys_mlock2 sys_ni_syscall
+#define sys_munlock2 sys_ni_syscall
+#define sys_munlockall2 sys_ni_syscall
#define sys_mremap sys_ni_syscall
#define sys_mincore sys_ni_syscall
#define sys_remap_file_pages sys_ni_syscall
diff --git a/arch/m32r/kernel/syscall_table.S b/arch/m32r/kernel/syscall_table.S
index f365c19795ef..9918c3e20248 100644
--- a/arch/m32r/kernel/syscall_table.S
+++ b/arch/m32r/kernel/syscall_table.S
@@ -325,3 +325,6 @@ ENTRY(sys_call_table)
.long sys_eventfd
.long sys_fallocate
.long sys_setns /* 325 */
+ .long sys_mlock2
+ .long sys_munlock2
+ .long sys_munlockall2
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 2dd8f63bfbbb..498b567f007b 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -95,6 +95,7 @@ config MMU_SUN3
config KEXEC
bool "kexec system call"
depends on M68KCLASSIC
+ select KEXEC_CORE
help
kexec is a system call that implements the ability to shutdown your
current kernel, and to start another kernel. It is like a reboot
diff --git a/arch/m68k/include/asm/unistd.h b/arch/m68k/include/asm/unistd.h
index 244e0dbe45db..b18f3da0b01e 100644
--- a/arch/m68k/include/asm/unistd.h
+++ b/arch/m68k/include/asm/unistd.h
@@ -4,7 +4,7 @@
#include <uapi/asm/unistd.h>
-#define NR_syscalls 356
+#define NR_syscalls 359
#define __ARCH_WANT_OLD_READDIR
#define __ARCH_WANT_OLD_STAT
diff --git a/arch/m68k/include/uapi/asm/unistd.h b/arch/m68k/include/uapi/asm/unistd.h
index 61fb6cb9d2ae..1405c3f5a934 100644
--- a/arch/m68k/include/uapi/asm/unistd.h
+++ b/arch/m68k/include/uapi/asm/unistd.h
@@ -361,5 +361,8 @@
#define __NR_memfd_create 353
#define __NR_bpf 354
#define __NR_execveat 355
+#define __NR_mlock2 356
+#define __NR_munlock2 357
+#define __NR_munlockall2 358
#endif /* _UAPI_ASM_M68K_UNISTD_H_ */
diff --git a/arch/m68k/kernel/syscalltable.S b/arch/m68k/kernel/syscalltable.S
index a0ec4303f2c8..7963c03cc3e4 100644
--- a/arch/m68k/kernel/syscalltable.S
+++ b/arch/m68k/kernel/syscalltable.S
@@ -376,4 +376,7 @@ ENTRY(sys_call_table)
.long sys_memfd_create
.long sys_bpf
.long sys_execveat /* 355 */
+ .long sys_mlock2
+ .long sys_munlock2
+ .long sys_munlockall2
diff --git a/arch/microblaze/include/uapi/asm/unistd.h b/arch/microblaze/include/uapi/asm/unistd.h
index 32850c73be09..59b06b08ea53 100644
--- a/arch/microblaze/include/uapi/asm/unistd.h
+++ b/arch/microblaze/include/uapi/asm/unistd.h
@@ -404,5 +404,8 @@
#define __NR_memfd_create 386
#define __NR_bpf 387
#define __NR_execveat 388
+#define __NR_mlock2 389 /* ok - nommu or mmu */
+#define __NR_munlock2 390 /* ok - nommu or mmu */
+#define __NR_munlockall2 391 /* ok - nommu or mmu */
#endif /* _UAPI_ASM_MICROBLAZE_UNISTD_H */
diff --git a/arch/microblaze/kernel/syscall_table.S b/arch/microblaze/kernel/syscall_table.S
index 29c8568ec55c..6e4b0fec7640 100644
--- a/arch/microblaze/kernel/syscall_table.S
+++ b/arch/microblaze/kernel/syscall_table.S
@@ -389,3 +389,6 @@ ENTRY(sys_call_table)
.long sys_memfd_create
.long sys_bpf
.long sys_execveat
+ .long sys_mlock2
+ .long sys_munlock2 /* 390 */
+ .long sys_munlockall2
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 1cb3000dd2b0..e29ca2f0a5d3 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -2592,6 +2592,7 @@ source "kernel/Kconfig.preempt"
config KEXEC
bool "Kexec system call"
+ select KEXEC_CORE
help
kexec is a system call that implements the ability to shutdown your
current kernel, and to start another kernel. It is like a reboot
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index cfcb876cae6b..925ed63b0fc6 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -48,6 +48,7 @@
#define MAP_NONBLOCK 0x20000 /* do not block on IO */
#define MAP_STACK 0x40000 /* give out an address that is best suited for process/thread stacks */
#define MAP_HUGETLB 0x80000 /* create a huge page mapping */
+#define MAP_LOCKONFAULT 0x100000 /* Lock pages after they are faulted in, do not prefault */
/*
* Flags for msync
@@ -61,12 +62,20 @@
*/
#define MCL_CURRENT 1 /* lock all current mappings */
#define MCL_FUTURE 2 /* lock all future mappings */
+#define MCL_ONFAULT 4 /* lock all pages that are faulted in */
+
+/*
+ * Flags for mlock
+ */
+#define MLOCK_LOCKED 0x01 /* Lock and populate the specified range */
+#define MLOCK_ONFAULT 0x02 /* Lock pages in range after they are faulted in, do not prefault */
#define MADV_NORMAL 0 /* no further special treatment */
#define MADV_RANDOM 1 /* expect random page references */
#define MADV_SEQUENTIAL 2 /* expect sequential page references */
#define MADV_WILLNEED 3 /* will need these pages */
#define MADV_DONTNEED 4 /* don't need these pages */
+#define MADV_FREE 5 /* free pages only if memory pressure */
/* common parameters: try to keep these consistent across architectures */
#define MADV_REMOVE 9 /* remove these pages & resources */
diff --git a/arch/mips/include/uapi/asm/unistd.h b/arch/mips/include/uapi/asm/unistd.h
index c03088f9f514..101b884c8881 100644
--- a/arch/mips/include/uapi/asm/unistd.h
+++ b/arch/mips/include/uapi/asm/unistd.h
@@ -377,16 +377,19 @@
#define __NR_memfd_create (__NR_Linux + 354)
#define __NR_bpf (__NR_Linux + 355)
#define __NR_execveat (__NR_Linux + 356)
+#define __NR_mlock2 (__NR_Linux + 357)
+#define __NR_munlock2 (__NR_Linux + 358)
+#define __NR_munlockall2 (__NR_Linux + 359)
/*
* Offset of the last Linux o32 flavoured syscall
*/
-#define __NR_Linux_syscalls 356
+#define __NR_Linux_syscalls 359
#endif /* _MIPS_SIM == _MIPS_SIM_ABI32 */
#define __NR_O32_Linux 4000
-#define __NR_O32_Linux_syscalls 356
+#define __NR_O32_Linux_syscalls 359
#if _MIPS_SIM == _MIPS_SIM_ABI64
@@ -711,16 +714,19 @@
#define __NR_memfd_create (__NR_Linux + 314)
#define __NR_bpf (__NR_Linux + 315)
#define __NR_execveat (__NR_Linux + 316)
+#define __NR_mlock2 (__NR_Linux + 317)
+#define __NR_munlock2 (__NR_Linux + 318)
+#define __NR_munlockall2 (__NR_Linux + 319)
/*
* Offset of the last Linux 64-bit flavoured syscall
*/
-#define __NR_Linux_syscalls 316
+#define __NR_Linux_syscalls 319
#endif /* _MIPS_SIM == _MIPS_SIM_ABI64 */
#define __NR_64_Linux 5000
-#define __NR_64_Linux_syscalls 316
+#define __NR_64_Linux_syscalls 319
#if _MIPS_SIM == _MIPS_SIM_NABI32
@@ -1049,15 +1055,18 @@
#define __NR_memfd_create (__NR_Linux + 318)
#define __NR_bpf (__NR_Linux + 319)
#define __NR_execveat (__NR_Linux + 320)
+#define __NR_mlock2 (__NR_Linux + 321)
+#define __NR_munlock2 (__NR_Linux + 322)
+#define __NR_munlockall2 (__NR_Linux + 323)
/*
* Offset of the last N32 flavoured syscall
*/
-#define __NR_Linux_syscalls 320
+#define __NR_Linux_syscalls 323
#endif /* _MIPS_SIM == _MIPS_SIM_NABI32 */
#define __NR_N32_Linux 6000
-#define __NR_N32_Linux_syscalls 320
+#define __NR_N32_Linux_syscalls 323
#endif /* _UAPI_ASM_UNISTD_H */
diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S
index 4cc13508d967..c409d53d87c7 100644
--- a/arch/mips/kernel/scall32-o32.S
+++ b/arch/mips/kernel/scall32-o32.S
@@ -599,3 +599,6 @@ EXPORT(sys_call_table)
PTR sys_memfd_create
PTR sys_bpf /* 4355 */
PTR sys_execveat
+ PTR sys_mlock2
+ PTR sys_munlock2
+ PTR sys_munlockall2
diff --git a/arch/mips/kernel/scall64-64.S b/arch/mips/kernel/scall64-64.S
index ad4d44635c76..0aa2742313c5 100644
--- a/arch/mips/kernel/scall64-64.S
+++ b/arch/mips/kernel/scall64-64.S
@@ -436,4 +436,7 @@ EXPORT(sys_call_table)
PTR sys_memfd_create
PTR sys_bpf /* 5315 */
PTR sys_execveat
+ PTR sys_mlock2
+ PTR sys_munlock2
+ PTR sys_munlockall2
.size sys_call_table,.-sys_call_table
diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S
index 446cc654da56..eb21955e2e81 100644
--- a/arch/mips/kernel/scall64-n32.S
+++ b/arch/mips/kernel/scall64-n32.S
@@ -429,4 +429,7 @@ EXPORT(sysn32_call_table)
PTR sys_memfd_create
PTR sys_bpf
PTR compat_sys_execveat /* 6320 */
+ PTR sys_mlock2
+ PTR sys_munlock2
+ PTR sys_munlockall2
.size sysn32_call_table,.-sysn32_call_table
diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S
index f543ff4feef9..f45049c30a70 100644
--- a/arch/mips/kernel/scall64-o32.S
+++ b/arch/mips/kernel/scall64-o32.S
@@ -584,4 +584,7 @@ EXPORT(sys32_call_table)
PTR sys_memfd_create
PTR sys_bpf /* 4355 */
PTR compat_sys_execveat
+ PTR sys_mlock2
+ PTR sys_munlock2
+ PTR sys_munlockall2
.size sys32_call_table,.-sys32_call_table
diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c
index 3059f36bfc89..f7e89524e316 100644
--- a/arch/mips/kernel/signal32.c
+++ b/arch/mips/kernel/signal32.c
@@ -235,8 +235,6 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
{
- memset(to, 0, sizeof *to);
-
if (copy_from_user(to, from, 3*sizeof(int)) ||
copy_from_user(to->_sifields._pad,
from->_sifields._pad, SI_PAD_SIZE32))
diff --git a/arch/mn10300/kernel/entry.S b/arch/mn10300/kernel/entry.S
index 177d61de51c9..d34adf5c1ff3 100644
--- a/arch/mn10300/kernel/entry.S
+++ b/arch/mn10300/kernel/entry.S
@@ -767,6 +767,9 @@ ENTRY(sys_call_table)
.long sys_perf_event_open
.long sys_recvmmsg
.long sys_setns
+ .long sys_mlock2 /* 340 */
+ .long sys_munlock2
+ .long sys_munlockall2
nr_syscalls=(.-sys_call_table)/4
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h
index 294d251ca7b2..cf1f7005bdb7 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -24,6 +24,7 @@
#define MAP_NONBLOCK 0x20000 /* do not block on IO */
#define MAP_STACK 0x40000 /* give out an address that is best suited for process/thread stacks */
#define MAP_HUGETLB 0x80000 /* create a huge page mapping */
+#define MAP_LOCKONFAULT 0x100000 /* Lock pages after they are faulted in, do not prefault */
#define MS_SYNC 1 /* synchronous memory sync */
#define MS_ASYNC 2 /* sync memory asynchronously */
@@ -31,6 +32,10 @@
#define MCL_CURRENT 1 /* lock all current mappings */
#define MCL_FUTURE 2 /* lock all future mappings */
+#define MCL_ONFAULT 4 /* lock all pages that are faulted in */
+
+#define MLOCK_LOCKED 0x01 /* Lock and populate the specified range */
+#define MLOCK_ONFAULT 0x02 /* Lock pages in range after they are faulted in, do not prefault */
#define MADV_NORMAL 0 /* no further special treatment */
#define MADV_RANDOM 1 /* expect random page references */
@@ -40,6 +45,7 @@
#define MADV_SPACEAVAIL 5 /* insure that resources are reserved */
#define MADV_VPS_PURGE 6 /* Purge pages from VM page cache */
#define MADV_VPS_INHERIT 7 /* Inherit parents page size */
+#define MADV_FREE 8 /* free pages only if memory pressure */
/* common/generic parameters */
#define MADV_REMOVE 9 /* remove these pages & resources */
diff --git a/arch/parisc/include/uapi/asm/unistd.h b/arch/parisc/include/uapi/asm/unistd.h
index 2e639d7604f6..455c8a3f79ce 100644
--- a/arch/parisc/include/uapi/asm/unistd.h
+++ b/arch/parisc/include/uapi/asm/unistd.h
@@ -358,8 +358,11 @@
#define __NR_memfd_create (__NR_Linux + 340)
#define __NR_bpf (__NR_Linux + 341)
#define __NR_execveat (__NR_Linux + 342)
+#define __NR_mlock2 (__NR_Linux + 343)
+#define __NR_munlock2 (__NR_Linux + 344)
+#define __NR_munlockall2 (__NR_Linux + 345)
-#define __NR_Linux_syscalls (__NR_execveat + 1)
+#define __NR_Linux_syscalls (__NR_munlockall2 + 1)
#define __IGNORE_select /* newselect */
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index fe2f2c595fd9..b1452fa79e6f 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -419,6 +419,7 @@ config PPC64_SUPPORTS_MEMORY_FAILURE
config KEXEC
bool "kexec system call"
depends on (PPC_BOOK3S || FSL_BOOKE || (44x && !SMP))
+ select KEXEC_CORE
help
kexec is a system call that implements the ability to shutdown your
current kernel, and to start another kernel. It is like a reboot
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index 3bb7488bd24b..42886fc772df 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -507,9 +507,10 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd)
#define pmd_pfn(pmd) pte_pfn(pmd_pte(pmd))
#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
#define pmd_young(pmd) pte_young(pmd_pte(pmd))
#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd)))
#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd)))
#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd)))
+#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd)))
#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd)))
#define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd)))
diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h
index 71f2b3f02cf8..4d65499ee1c1 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -368,3 +368,4 @@ SYSCALL_SPU(memfd_create)
SYSCALL_SPU(bpf)
COMPAT_SYS(execveat)
PPC64ONLY(switch_endian)
+SYSCALL_SPU(userfaultfd)
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index f4f8b667d75b..4a055b6c2a64 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -12,7 +12,7 @@
#include <uapi/asm/unistd.h>
-#define __NR_syscalls 364
+#define __NR_syscalls 365
#define __NR__exit __NR_exit
#define NR_syscalls __NR_syscalls
diff --git a/arch/powerpc/include/uapi/asm/mman.h b/arch/powerpc/include/uapi/asm/mman.h
index 6ea26df0a73c..40a3fda54c6e 100644
--- a/arch/powerpc/include/uapi/asm/mman.h
+++ b/arch/powerpc/include/uapi/asm/mman.h
@@ -22,10 +22,15 @@
#define MCL_CURRENT 0x2000 /* lock all currently mapped pages */
#define MCL_FUTURE 0x4000 /* lock all additions to address space */
+#define MCL_ONFAULT 0x8000 /* lock all pages that are faulted in */
+
+#define MLOCK_LOCKED 0x01 /* Lock and populate the specified range */
+#define MLOCK_ONFAULT 0x02 /* Lock pages in range after they are faulted in, do not prefault */
#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */
#define MAP_NONBLOCK 0x10000 /* do not block on IO */
#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */
#define MAP_HUGETLB 0x40000 /* create a huge page mapping */
+#define MAP_LOCKONFAULT 0x80000 /* Lock pages after they are faulted in, do not prefault */
#endif /* _UAPI_ASM_POWERPC_MMAN_H */
diff --git a/arch/powerpc/include/uapi/asm/unistd.h b/arch/powerpc/include/uapi/asm/unistd.h
index e4aa173dae62..c6d8eb113fdf 100644
--- a/arch/powerpc/include/uapi/asm/unistd.h
+++ b/arch/powerpc/include/uapi/asm/unistd.h
@@ -386,5 +386,9 @@
#define __NR_bpf 361
#define __NR_execveat 362
#define __NR_switch_endian 363
+#define __NR_userfaultfd 364
+#define __NR_mlock2 365
+#define __NR_munlock2 366
+#define __NR_munlockall2 367
#endif /* _UAPI_ASM_POWERPC_UNISTD_H_ */
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index d3a831ac0f92..da50e0c9c57e 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -966,8 +966,6 @@ int copy_siginfo_to_user32(struct compat_siginfo __user *d, const siginfo_t *s)
int copy_siginfo_from_user32(siginfo_t *to, struct compat_siginfo __user *from)
{
- memset(to, 0, sizeof *to);
-
if (copy_from_user(to, from, 3*sizeof(int)) ||
copy_from_user(to->_sifields._pad,
from->_sifields._pad, SI_PAD_SIZE32))
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 91e8954f1237..45fcf2a6dbaf 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -48,6 +48,7 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC
config KEXEC
def_bool y
+ select KEXEC_CORE
config AUDIT_ARCH
def_bool y
diff --git a/arch/s390/include/uapi/asm/unistd.h b/arch/s390/include/uapi/asm/unistd.h
index 67878af257a0..d1c5b1f1978e 100644
--- a/arch/s390/include/uapi/asm/unistd.h
+++ b/arch/s390/include/uapi/asm/unistd.h
@@ -290,7 +290,10 @@
#define __NR_s390_pci_mmio_write 352
#define __NR_s390_pci_mmio_read 353
#define __NR_execveat 354
-#define NR_syscalls 355
+#define __NR_mlock2 355
+#define __NR_munlock2 356
+#define __NR_munlockall2 357
+#define NR_syscalls 358
/*
* There are some system calls that are not present on 64 bit, some
diff --git a/arch/s390/kernel/compat_wrapper.c b/arch/s390/kernel/compat_wrapper.c
index f8498dde67b1..58339e29844f 100644
--- a/arch/s390/kernel/compat_wrapper.c
+++ b/arch/s390/kernel/compat_wrapper.c
@@ -220,3 +220,6 @@ COMPAT_SYSCALL_WRAP2(memfd_create, const char __user *, uname, unsigned int, fla
COMPAT_SYSCALL_WRAP3(bpf, int, cmd, union bpf_attr *, attr, unsigned int, size);
COMPAT_SYSCALL_WRAP3(s390_pci_mmio_write, const unsigned long, mmio_addr, const void __user *, user_buffer, const size_t, length);
COMPAT_SYSCALL_WRAP3(s390_pci_mmio_read, const unsigned long, mmio_addr, void __user *, user_buffer, const size_t, length);
+COMPAT_SYSCALL_WRAP3(mlock2, unsigned long, start, size_t, len, int, flags);
+COMPAT_SYSCALL_WRAP3(munlock2, unsigned long, start, size_t, len, int, flags);
+COMPAT_SYSCALL_WRAP1(munlockall2, int, flags);
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 50057fed819d..d514df7e04dd 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -602,6 +602,7 @@ source kernel/Kconfig.hz
config KEXEC
bool "kexec system call (EXPERIMENTAL)"
depends on SUPERH32 && MMU
+ select KEXEC_CORE
help
kexec is a system call that implements the ability to shutdown your
current kernel, and to start another kernel. It is like a reboot
diff --git a/arch/sh/kernel/syscalls_32.S b/arch/sh/kernel/syscalls_32.S
index 734234be2f01..6d0786791ff0 100644
--- a/arch/sh/kernel/syscalls_32.S
+++ b/arch/sh/kernel/syscalls_32.S
@@ -386,3 +386,6 @@ ENTRY(sys_call_table)
.long sys_process_vm_writev
.long sys_kcmp
.long sys_finit_module
+ .long sys_mlock2
+ .long sys_munlock2 /* 370 */
+ .long sys_munlockall2
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 131d36fcd07a..5833dc5ee7d7 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -717,6 +717,15 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd)
return __pmd(pte_val(pte));
}
+static inline pmd_t pmd_mkclean(pmd_t pmd)
+{
+ pte_t pte = __pte(pmd_val(pmd));
+
+ pte = pte_mkclean(pte);
+
+ return __pmd(pte_val(pte));
+}
+
static inline pmd_t pmd_mkyoung(pmd_t pmd)
{
pte_t pte = __pte(pmd_val(pmd));
diff --git a/arch/sparc/include/uapi/asm/mman.h b/arch/sparc/include/uapi/asm/mman.h
index 0b14df33cffa..3d74ab727902 100644
--- a/arch/sparc/include/uapi/asm/mman.h
+++ b/arch/sparc/include/uapi/asm/mman.h
@@ -17,11 +17,16 @@
#define MCL_CURRENT 0x2000 /* lock all currently mapped pages */
#define MCL_FUTURE 0x4000 /* lock all additions to address space */
+#define MCL_ONFAULT 0x8000 /* lock all pages that are faulted in */
+
+#define MLOCK_LOCKED 0x01 /* Lock and populate the specified range */
+#define MLOCK_ONFAULT 0x02 /* Lock pages in range after they are faulted in, do not prefault */
#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */
#define MAP_NONBLOCK 0x10000 /* do not block on IO */
#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */
#define MAP_HUGETLB 0x40000 /* create a huge page mapping */
+#define MAP_LOCKONFAULT 0x80000 /* Lock pages after they are faulted in, do not prefault */
#endif /* _UAPI__SPARC_MMAN_H__ */
diff --git a/arch/sparc/include/uapi/asm/unistd.h b/arch/sparc/include/uapi/asm/unistd.h
index 6f35f4df17f2..c25bbb13e77d 100644
--- a/arch/sparc/include/uapi/asm/unistd.h
+++ b/arch/sparc/include/uapi/asm/unistd.h
@@ -416,8 +416,11 @@
#define __NR_memfd_create 348
#define __NR_bpf 349
#define __NR_execveat 350
+#define __NR_mlock2 351
+#define __NR_munlock2 352
+#define __NR_munlockall2 353
-#define NR_syscalls 351
+#define NR_syscalls 354
/* Bitmask values returned from kern_features system call. */
#define KERN_FEATURE_MIXED_MODE_STACK 0x00000001
diff --git a/arch/sparc/kernel/systbls_32.S b/arch/sparc/kernel/systbls_32.S
index e31a9056a303..72b68d49a514 100644
--- a/arch/sparc/kernel/systbls_32.S
+++ b/arch/sparc/kernel/systbls_32.S
@@ -87,4 +87,4 @@ sys_call_table:
/*335*/ .long sys_syncfs, sys_sendmmsg, sys_setns, sys_process_vm_readv, sys_process_vm_writev
/*340*/ .long sys_ni_syscall, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr
/*345*/ .long sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf
-/*350*/ .long sys_execveat
+/*350*/ .long sys_execveat, sys_mlock2, sys_munlock2, sys_munlockall2
diff --git a/arch/sparc/kernel/systbls_64.S b/arch/sparc/kernel/systbls_64.S
index d72f76ae70eb..a96bfea6edf6 100644
--- a/arch/sparc/kernel/systbls_64.S
+++ b/arch/sparc/kernel/systbls_64.S
@@ -88,7 +88,7 @@ sys_call_table32:
.word sys_syncfs, compat_sys_sendmmsg, sys_setns, compat_sys_process_vm_readv, compat_sys_process_vm_writev
/*340*/ .word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr
.word sys32_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf
-/*350*/ .word sys32_execveat
+/*350*/ .word sys32_execveat, sys_mlock2, sys_munlock2, sys_munlockall2
#endif /* CONFIG_COMPAT */
@@ -168,4 +168,4 @@ sys_call_table:
.word sys_syncfs, sys_sendmmsg, sys_setns, sys_process_vm_readv, sys_process_vm_writev
/*340*/ .word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr
.word sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf
-/*350*/ .word sys64_execveat
+/*350*/ .word sys64_execveat, sys_mlock2, sys_munlock2, sys_munlockall2
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index 9def1f52d03a..fe7e95828f8d 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -204,6 +204,7 @@ source "kernel/Kconfig.hz"
config KEXEC
bool "kexec system call"
+ select KEXEC_CORE
---help---
kexec is a system call that implements the ability to shutdown your
current kernel, and to start another kernel. It is like a reboot
diff --git a/arch/tile/include/uapi/asm/mman.h b/arch/tile/include/uapi/asm/mman.h
index 81b8fc348d63..800e5c35314b 100644
--- a/arch/tile/include/uapi/asm/mman.h
+++ b/arch/tile/include/uapi/asm/mman.h
@@ -29,6 +29,7 @@
#define MAP_DENYWRITE 0x0800 /* ETXTBSY */
#define MAP_EXECUTABLE 0x1000 /* mark it as an executable */
#define MAP_HUGETLB 0x4000 /* create a huge page mapping */
+#define MAP_LOCKONFAULT 0x100000 /* Lock pages after they are faulted in, do not prefault */
/*
@@ -36,6 +37,14 @@
*/
#define MCL_CURRENT 1 /* lock all current mappings */
#define MCL_FUTURE 2 /* lock all future mappings */
+#define MCL_ONFAULT 4 /* lock all pages that are faulted in */
+
+
+/*
+ * Flags for mlock
+ */
+#define MLOCK_LOCKED 0x01 /* Lock and populate the specified range */
+#define MLOCK_ONFAULT 0x02 /* Lock pages in range after they are faulted in, do not prefault */
#endif /* _ASM_TILE_MMAN_H */
diff --git a/arch/tile/kernel/compat_signal.c b/arch/tile/kernel/compat_signal.c
index e8c2c04143cd..c667e104a0c2 100644
--- a/arch/tile/kernel/compat_signal.c
+++ b/arch/tile/kernel/compat_signal.c
@@ -113,8 +113,6 @@ int copy_siginfo_from_user32(siginfo_t *to, struct compat_siginfo __user *from)
if (!access_ok(VERIFY_READ, from, sizeof(struct compat_siginfo)))
return -EFAULT;
- memset(to, 0, sizeof(*to));
-
err = __get_user(to->si_signo, &from->si_signo);
err |= __get_user(to->si_errno, &from->si_errno);
err |= __get_user(to->si_code, &from->si_code);
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 6c80c03d8e90..632d7c2e8286 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -41,6 +41,7 @@ config X86
select ARCH_USE_CMPXCHG_LOCKREF if X86_64
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
+ select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
select ARCH_WANT_FRAME_POINTERS
select ARCH_WANT_IPC_PARSE_VERSION if X86_32
@@ -1724,6 +1725,7 @@ source kernel/Kconfig.hz
config KEXEC
bool "kexec system call"
+ select KEXEC_CORE
---help---
kexec is a system call that implements the ability to shutdown your
current kernel, and to start another kernel. It is like a reboot
@@ -1740,8 +1742,8 @@ config KEXEC
config KEXEC_FILE
bool "kexec file based system call"
+ select KEXEC_CORE
select BUILD_BIN2C
- depends on KEXEC
depends on X86_64
depends on CRYPTO=y
depends on CRYPTO_SHA256=y
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 16ef02596db2..2d6b309c8e9a 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -414,7 +414,7 @@ xloadflags:
# define XLF23 0
#endif
-#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC)
+#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC_CORE)
# define XLF4 XLF_EFI_KEXEC
#else
# define XLF4 0
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index ef8187f9d28d..d68b13925aa4 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -365,3 +365,7 @@
356 i386 memfd_create sys_memfd_create
357 i386 bpf sys_bpf
358 i386 execveat sys_execveat stub32_execveat
+359 i386 userfaultfd sys_userfaultfd
+360 i386 mlock2 sys_mlock2
+361 i386 munlock2 sys_munlock2
+362 i386 munlockall2 sys_munlockall2
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 9ef32d5f1b19..5a5a5258f741 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -329,6 +329,11 @@
320 common kexec_file_load sys_kexec_file_load
321 common bpf sys_bpf
322 64 execveat stub_execveat
+323 common userfaultfd sys_userfaultfd
+324 common mlock2 sys_mlock2
+325 common munlock2 sys_munlock2
+326 common munlockall2 sys_munlockall2
+
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index 32ce71375b21..b130d59406fb 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -29,7 +29,7 @@ extern void show_trace(struct task_struct *t, struct pt_regs *regs,
extern void __show_regs(struct pt_regs *regs, int all);
extern unsigned long oops_begin(void);
extern void oops_end(unsigned long, struct pt_regs *, int signr);
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
extern int in_crash_kexec;
#else
/* no crash dump is ever in progress if no crash kernel can be kexec'd */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 867da5bbb4a3..b964d54300e1 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -267,6 +267,11 @@ static inline pmd_t pmd_mkold(pmd_t pmd)
return pmd_clear_flags(pmd, _PAGE_ACCESSED);
}
+static inline pmd_t pmd_mkclean(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_DIRTY);
+}
+
static inline pmd_t pmd_wrprotect(pmd_t pmd)
{
return pmd_clear_flags(pmd, _PAGE_RW);
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index cd791948b286..6df2029405a3 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -261,6 +261,12 @@ static inline void reset_lazy_tlbstate(void)
#endif /* SMP */
+/* Macro, not an inline: inc_irq_stat is not defined yet; do/while keeps it one statement */
+#define flush_tlb_local() do { \
+ inc_irq_stat(irq_tlb_count); \
+ local_flush_tlb(); \
+} while (0)
+
#ifndef CONFIG_PARAVIRT
#define flush_tlb_others(mask, mm, start, end) \
native_flush_tlb_others(mask, mm, start, end)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 523101c4780e..c4559a5e6104 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -70,8 +70,8 @@ obj-$(CONFIG_LIVEPATCH) += livepatch.o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
obj-$(CONFIG_X86_TSC) += trace_clock.o
-obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
-obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
+obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o
+obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o
obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
obj-y += kprobes/
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 49487b488061..2c7aafa70702 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -200,7 +200,7 @@ static void kvm_setup_secondary_clock(void)
* kind of shutdown from our side, we unregister the clock by writting anything
* that does not have the 'enable' bit set in the msr
*/
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
static void kvm_crash_shutdown(struct pt_regs *regs)
{
native_write_msr(msr_kvm_system_time, 0, 0);
@@ -259,7 +259,7 @@ void __init kvmclock_init(void)
x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
machine_ops.shutdown = kvm_shutdown;
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif
kvm_get_preset_lpj();
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 819ab3f9c9c7..22db575a2fec 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -337,6 +337,7 @@ void arch_crash_save_vmcoreinfo(void)
#endif
vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
kaslr_offset());
+ VMCOREINFO_PHYS_BASE(phys_base);
}
/* arch-dependent functionality related to kexec file-based syscall */
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 86db4bcd7ce5..02693dd9a079 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -673,7 +673,7 @@ struct machine_ops machine_ops = {
.emergency_restart = native_machine_emergency_restart,
.restart = native_machine_restart,
.halt = native_machine_halt,
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
.crash_shutdown = native_machine_crash_shutdown,
#endif
};
@@ -703,7 +703,7 @@ void machine_halt(void)
machine_ops.halt();
}
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
void machine_crash_shutdown(struct pt_regs *regs)
{
machine_ops.crash_shutdown(regs);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 80f874bf999e..1c0faa76c26a 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -498,7 +498,7 @@ static void __init memblock_x86_reserve_range_setup_data(void)
* --------- Crashkernel reservation ------------------------------
*/
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
/*
* Keep the crash kernel below this limit. On 32 bits earlier kernels
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 00bf300fd846..74e4bf11f562 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -364,7 +364,7 @@ INIT_PER_CPU(irq_stack_union);
#endif /* CONFIG_X86_32 */
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
#include <asm/kexec.h>
. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4bfb36e78942..0dbeec1cfe1d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1264,7 +1264,7 @@ static void vmcs_load(struct vmcs *vmcs)
vmcs, phys_addr);
}
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
/*
* This bitmap is used to indicate whether the vmclear
* operation is enabled on all cpus. All disabled by
@@ -1302,7 +1302,7 @@ static void crash_vmclear_local_loaded_vmcss(void)
#else
static inline void crash_enable_local_vmclear(int cpu) { }
static inline void crash_disable_local_vmclear(int cpu) { }
-#endif /* CONFIG_KEXEC */
+#endif /* CONFIG_KEXEC_CORE */
static void __loaded_vmcs_clear(void *arg)
{
@@ -10481,7 +10481,7 @@ static int __init vmx_init(void)
if (r)
return r;
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
rcu_assign_pointer(crash_vmclear_loaded_vmcss,
crash_vmclear_local_loaded_vmcss);
#endif
@@ -10491,7 +10491,7 @@ static int __init vmx_init(void)
static void __exit vmx_exit(void)
{
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
synchronize_rcu();
#endif
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 4053bb58bf92..c3b3f653ed0c 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -246,8 +246,10 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
bi->start = max(bi->start, low);
bi->end = min(bi->end, high);
- /* and there's no empty block */
- if (bi->start >= bi->end)
+ /* and there's no empty or non-existent block */
+ if (bi->start >= bi->end ||
+ !memblock_overlaps_region(&memblock.memory,
+ bi->start, bi->end - bi->start))
numa_remove_memblk_from(i--, mi);
}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 3250f2371aea..2da824c1c140 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -140,6 +140,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
info.flush_end = end;
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
+ trace_tlb_flush(TLB_REMOTE_SEND_IPI, end - start);
if (is_uv_system()) {
unsigned int cpu;
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index cfba30f27392..f1e54193350b 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -650,7 +650,7 @@ static void __init get_systab_virt_addr(efi_memory_desc_t *md)
static void __init save_runtime_map(void)
{
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
efi_memory_desc_t *md;
void *tmp, *p, *q = NULL;
int count = 0;
@@ -748,7 +748,7 @@ static void * __init efi_map_regions(int *count, int *pg_shift)
static void __init kexec_enter_virtual_mode(void)
{
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
efi_memory_desc_t *md;
void *p;
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c
index 020c101c255f..5c9f63fa6abf 100644
--- a/arch/x86/platform/uv/uv_nmi.c
+++ b/arch/x86/platform/uv/uv_nmi.c
@@ -492,7 +492,7 @@ static void uv_nmi_touch_watchdogs(void)
touch_nmi_watchdog();
}
-#if defined(CONFIG_KEXEC)
+#if defined(CONFIG_KEXEC_CORE)
static atomic_t uv_nmi_kexec_failed;
static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
{
@@ -519,13 +519,13 @@ static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
uv_nmi_sync_exit(0);
}
-#else /* !CONFIG_KEXEC */
+#else /* !CONFIG_KEXEC_CORE */
static inline void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
{
if (master)
pr_err("UV: NMI kdump: KEXEC not supported in this kernel\n");
}
-#endif /* !CONFIG_KEXEC */
+#endif /* !CONFIG_KEXEC_CORE */
#ifdef CONFIG_KGDB
#ifdef CONFIG_KGDB_KDB
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h
index 201aec0e0446..8146e3679845 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -55,6 +55,7 @@
#define MAP_NONBLOCK 0x20000 /* do not block on IO */
#define MAP_STACK 0x40000 /* give out an address that is best suited for process/thread stacks */
#define MAP_HUGETLB 0x80000 /* create a huge page mapping */
+#define MAP_LOCKONFAULT 0x100000 /* Lock pages after they are faulted in, do not prefault */
#ifdef CONFIG_MMAP_ALLOW_UNINITIALIZED
# define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be
* uninitialized */
@@ -74,12 +75,20 @@
*/
#define MCL_CURRENT 1 /* lock all current mappings */
#define MCL_FUTURE 2 /* lock all future mappings */
+#define MCL_ONFAULT 4 /* lock all pages that are faulted in */
+
+/*
+ * Flags for mlock
+ */
+#define MLOCK_LOCKED 0x01 /* Lock and populate the specified range */
+#define MLOCK_ONFAULT 0x02 /* Lock pages in range after they are faulted in, do not prefault */
#define MADV_NORMAL 0 /* no further special treatment */
#define MADV_RANDOM 1 /* expect random page references */
#define MADV_SEQUENTIAL 2 /* expect sequential page references */
#define MADV_WILLNEED 3 /* will need these pages */
#define MADV_DONTNEED 4 /* don't need these pages */
+#define MADV_FREE 5 /* free pages only if memory pressure */
/* common parameters: try to keep these consistent across architectures */
#define MADV_REMOVE 9 /* remove these pages & resources */
diff --git a/arch/xtensa/include/uapi/asm/unistd.h b/arch/xtensa/include/uapi/asm/unistd.h
index b95c30594355..fbd0876a6d2c 100644
--- a/arch/xtensa/include/uapi/asm/unistd.h
+++ b/arch/xtensa/include/uapi/asm/unistd.h
@@ -753,8 +753,14 @@ __SYSCALL(339, sys_memfd_create, 2)
__SYSCALL(340, sys_bpf, 3)
#define __NR_execveat 341
__SYSCALL(341, sys_execveat, 5)
-
-#define __NR_syscall_count 342
+#define __NR_mlock2 342
+__SYSCALL(342, sys_mlock2, 3)
+#define __NR_munlock2 343
+__SYSCALL(343, sys_munlock2, 3)
+#define __NR_munlockall2 344
+__SYSCALL(344, sys_munlockall2, 1)
+
+#define __NR_syscall_count 345
/*
* sysxtensa syscall handler
diff --git a/block/genhd.c b/block/genhd.c
index 0c706f33a599..3213b66515f0 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -850,7 +850,7 @@ static int show_partition(struct seq_file *seqf, void *v)
char buf[BDEVNAME_SIZE];
/* Don't show non-partitionable removeable devices or empty devices */
- if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
+ if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) &&
(sgp->flags & GENHD_FL_REMOVABLE)))
return 0;
if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index f439ad2800da..6492f4d53bea 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -388,7 +388,6 @@ static ssize_t comp_algorithm_store(struct device *dev,
static ssize_t compact_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
{
- unsigned long nr_migrated;
struct zram *zram = dev_to_zram(dev);
struct zram_meta *meta;
@@ -399,8 +398,7 @@ static ssize_t compact_store(struct device *dev,
}
meta = zram->meta;
- nr_migrated = zs_compact(meta->mem_pool);
- atomic64_add(nr_migrated, &zram->stats.num_migrated);
+ zs_compact(meta->mem_pool);
up_read(&zram->init_lock);
return len;
@@ -428,26 +426,31 @@ static ssize_t mm_stat_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct zram *zram = dev_to_zram(dev);
+ struct zs_pool_stats pool_stats;
u64 orig_size, mem_used = 0;
long max_used;
ssize_t ret;
+ memset(&pool_stats, 0x00, sizeof(struct zs_pool_stats));
+
down_read(&zram->init_lock);
- if (init_done(zram))
+ if (init_done(zram)) {
mem_used = zs_get_total_pages(zram->meta->mem_pool);
+ zs_pool_stats(zram->meta->mem_pool, &pool_stats);
+ }
orig_size = atomic64_read(&zram->stats.pages_stored);
max_used = atomic_long_read(&zram->stats.max_used_pages);
ret = scnprintf(buf, PAGE_SIZE,
- "%8llu %8llu %8llu %8lu %8ld %8llu %8llu\n",
+ "%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n",
orig_size << PAGE_SHIFT,
(u64)atomic64_read(&zram->stats.compr_data_size),
mem_used << PAGE_SHIFT,
zram->limit_pages << PAGE_SHIFT,
max_used << PAGE_SHIFT,
(u64)atomic64_read(&zram->stats.zero_pages),
- (u64)atomic64_read(&zram->stats.num_migrated));
+ pool_stats.pages_compacted);
up_read(&zram->init_lock);
return ret;
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 6dbe2df506bf..8e92339686d7 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -78,7 +78,6 @@ struct zram_stats {
atomic64_t compr_data_size; /* compressed size of pages stored */
atomic64_t num_reads; /* failed + successful */
atomic64_t num_writes; /* --do-- */
- atomic64_t num_migrated; /* no. of migrated object */
atomic64_t failed_reads; /* can happen when memory is too low */
atomic64_t failed_writes; /* can happen when memory is too low */
atomic64_t invalid_io; /* non-page-aligned I/O requests */
diff --git a/drivers/crypto/qat/qat_common/adf_transport_debug.c b/drivers/crypto/qat/qat_common/adf_transport_debug.c
index e41986967294..52340b9bb387 100644
--- a/drivers/crypto/qat/qat_common/adf_transport_debug.c
+++ b/drivers/crypto/qat/qat_common/adf_transport_debug.c
@@ -86,9 +86,7 @@ static int adf_ring_show(struct seq_file *sfile, void *v)
{
struct adf_etr_ring_data *ring = sfile->private;
struct adf_etr_bank_data *bank = ring->bank;
- uint32_t *msg = v;
void __iomem *csr = ring->bank->csr_addr;
- int i, x;
if (v == SEQ_START_TOKEN) {
int head, tail, empty;
@@ -113,18 +111,8 @@ static int adf_ring_show(struct seq_file *sfile, void *v)
seq_puts(sfile, "----------- Ring data ------------\n");
return 0;
}
- seq_printf(sfile, "%p:", msg);
- x = 0;
- i = 0;
- for (; i < (ADF_MSG_SIZE_TO_BYTES(ring->msg_size) >> 2); i++) {
- seq_printf(sfile, " %08X", *(msg + i));
- if ((ADF_MSG_SIZE_TO_BYTES(ring->msg_size) >> 2) != i + 1 &&
- (++x == 8)) {
- seq_printf(sfile, "\n%p:", msg + i + 1);
- x = 0;
- }
- }
- seq_puts(sfile, "\n");
+ seq_hex_dump(sfile, "", DUMP_PREFIX_ADDRESS, 32, 4,
+ v, ADF_MSG_SIZE_TO_BYTES(ring->msg_size), false);
return 0;
}
diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig
index 54071c148340..84533e02fbf8 100644
--- a/drivers/firmware/efi/Kconfig
+++ b/drivers/firmware/efi/Kconfig
@@ -43,7 +43,7 @@ config EFI_VARS_PSTORE_DEFAULT_DISABLE
config EFI_RUNTIME_MAP
bool "Export efi runtime maps to sysfs"
- depends on X86 && EFI && KEXEC
+ depends on X86 && EFI && KEXEC_CORE
default y
help
Export efi runtime memory maps to /sys/firmware/efi/runtime-map.
diff --git a/drivers/gpu/drm/drm_vm.c b/drivers/gpu/drm/drm_vm.c
index aab49ee4ed40..dfbcfc2f4b53 100644
--- a/drivers/gpu/drm/drm_vm.c
+++ b/drivers/gpu/drm/drm_vm.c
@@ -699,9 +699,15 @@ int drm_vma_info(struct seq_file *m, void *data)
(void *)(unsigned long)virt_to_phys(high_memory));
list_for_each_entry(pt, &dev->vmalist, head) {
+ char lock_flag = '-';
+
vma = pt->vma;
if (!vma)
continue;
+ if (vma->vm_flags & VM_LOCKED)
+ lock_flag = 'l';
+ else if (vma->vm_flags & VM_LOCKONFAULT)
+ lock_flag = 'f';
seq_printf(m,
"\n%5d 0x%pK-0x%pK %c%c%c%c%c%c 0x%08lx000",
pt->pid,
@@ -710,7 +716,7 @@ int drm_vma_info(struct seq_file *m, void *data)
vma->vm_flags & VM_WRITE ? 'w' : '-',
vma->vm_flags & VM_EXEC ? 'x' : '-',
vma->vm_flags & VM_MAYSHARE ? 's' : 'p',
- vma->vm_flags & VM_LOCKED ? 'l' : '-',
+ lock_flag,
vma->vm_flags & VM_IO ? 'i' : '-',
vma->vm_pgoff);
diff --git a/drivers/media/platform/coda/coda-common.c b/drivers/media/platform/coda/coda-common.c
index 58f65486de33..284ac4c934ba 100644
--- a/drivers/media/platform/coda/coda-common.c
+++ b/drivers/media/platform/coda/coda-common.c
@@ -2157,7 +2157,7 @@ static int coda_probe(struct platform_device *pdev)
/* Get IRAM pool from device tree or platform data */
pool = of_gen_pool_get(np, "iram", 0);
if (!pool && pdata)
- pool = gen_pool_get(pdata->iram_dev);
+ pool = gen_pool_get(pdata->iram_dev, NULL);
if (!pool) {
dev_err(&pdev->dev, "iram pool not available\n");
return -ENOMEM;
diff --git a/drivers/misc/sram.c b/drivers/misc/sram.c
index 15c33cc34a80..431e1dd528bc 100644
--- a/drivers/misc/sram.c
+++ b/drivers/misc/sram.c
@@ -186,10 +186,10 @@ static int sram_probe(struct platform_device *pdev)
if (IS_ERR(sram->virt_base))
return PTR_ERR(sram->virt_base);
- sram->pool = devm_gen_pool_create(sram->dev,
- ilog2(SRAM_GRANULARITY), -1);
- if (!sram->pool)
- return -ENOMEM;
+ sram->pool = devm_gen_pool_create(sram->dev, ilog2(SRAM_GRANULARITY),
+ NUMA_NO_NODE, NULL);
+ if (IS_ERR(sram->pool))
+ return PTR_ERR(sram->pool);
ret = sram_reserve_regions(sram, res);
if (ret)
diff --git a/drivers/net/wireless/ath/wil6210/debugfs.c b/drivers/net/wireless/ath/wil6210/debugfs.c
index 75219a1b8805..7f07cf5cd401 100644
--- a/drivers/net/wireless/ath/wil6210/debugfs.c
+++ b/drivers/net/wireless/ath/wil6210/debugfs.c
@@ -156,6 +156,12 @@ static const struct file_operations fops_vring = {
.llseek = seq_lseek,
};
+static void wil_seq_hexdump(struct seq_file *s, void *p, int len,
+ const char *prefix)
+{
+ seq_hex_dump(s, prefix, DUMP_PREFIX_NONE, 16, 1, p, len, false);
+}
+
static void wil_print_ring(struct seq_file *s, const char *prefix,
void __iomem *off)
{
@@ -212,8 +218,6 @@ static void wil_print_ring(struct seq_file *s, const char *prefix,
le16_to_cpu(hdr.seq), len,
le16_to_cpu(hdr.type), hdr.flags);
if (len <= MAX_MBOXITEM_SIZE) {
- int n = 0;
- char printbuf[16 * 3 + 2];
unsigned char databuf[MAX_MBOXITEM_SIZE];
void __iomem *src = wmi_buffer(wil, d.addr) +
sizeof(struct wil6210_mbox_hdr);
@@ -223,16 +227,7 @@ static void wil_print_ring(struct seq_file *s, const char *prefix,
* reading header
*/
wil_memcpy_fromio_32(databuf, src, len);
- while (n < len) {
- int l = min(len - n, 16);
-
- hex_dump_to_buffer(databuf + n, l,
- 16, 1, printbuf,
- sizeof(printbuf),
- false);
- seq_printf(s, " : %s\n", printbuf);
- n += l;
- }
+ wil_seq_hexdump(s, databuf, len, " : ");
}
} else {
seq_puts(s, "\n");
@@ -867,22 +862,6 @@ static const struct file_operations fops_wmi = {
.open = simple_open,
};
-static void wil_seq_hexdump(struct seq_file *s, void *p, int len,
- const char *prefix)
-{
- char printbuf[16 * 3 + 2];
- int i = 0;
-
- while (i < len) {
- int l = min(len - i, 16);
-
- hex_dump_to_buffer(p + i, l, 16, 1, printbuf,
- sizeof(printbuf), false);
- seq_printf(s, "%s%s\n", prefix, printbuf);
- i += l;
- }
-}
-
static void wil_seq_print_skb(struct seq_file *s, struct sk_buff *skb)
{
int i = 0;
diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c
index 02ff84fcfa61..957b42198328 100644
--- a/drivers/parisc/ccio-dma.c
+++ b/drivers/parisc/ccio-dma.c
@@ -1103,16 +1103,9 @@ static int ccio_proc_bitmap_info(struct seq_file *m, void *p)
struct ioc *ioc = ioc_list;
while (ioc != NULL) {
- u32 *res_ptr = (u32 *)ioc->res_map;
- int j;
-
- for (j = 0; j < (ioc->res_size / sizeof(u32)); j++) {
- if ((j & 7) == 0)
- seq_puts(m, "\n ");
- seq_printf(m, "%08x", *res_ptr);
- res_ptr++;
- }
- seq_puts(m, "\n\n");
+ seq_hex_dump(m, " ", DUMP_PREFIX_NONE, 32, 4, ioc->res_map,
+ ioc->res_size, false);
+ seq_putc(m, '\n');
ioc = ioc->next;
break; /* XXX - remove me */
}
diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c
index f1441e466c06..225049b492e5 100644
--- a/drivers/parisc/sba_iommu.c
+++ b/drivers/parisc/sba_iommu.c
@@ -1854,14 +1854,9 @@ sba_proc_bitmap_info(struct seq_file *m, void *p)
{
struct sba_device *sba_dev = sba_list;
struct ioc *ioc = &sba_dev->ioc[0]; /* FIXME: Multi-IOC support! */
- unsigned int *res_ptr = (unsigned int *)ioc->res_map;
- int i;
- for (i = 0; i < (ioc->res_size/sizeof(unsigned int)); ++i, ++res_ptr) {
- if ((i & 7) == 0)
- seq_puts(m, "\n ");
- seq_printf(m, " %08x", *res_ptr);
- }
+ seq_hex_dump(m, " ", DUMP_PREFIX_NONE, 32, 4, ioc->res_map,
+ ioc->res_size, false);
seq_putc(m, '\n');
return 0;
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 38a602cb9fb7..d4f20bfe4586 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -451,7 +451,7 @@ static void pci_device_shutdown(struct device *dev)
if (drv && drv->shutdown)
drv->shutdown(pci_dev);
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
/*
* If this is a kexec reboot, turn off Bus Master bit on the
* device to tell it to not continue to do DMA. Don't touch
diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c
index 01bf1f5cf2e9..4eb45546a3aa 100644
--- a/drivers/s390/crypto/zcrypt_api.c
+++ b/drivers/s390/crypto/zcrypt_api.c
@@ -1206,16 +1206,8 @@ static void sprinthx(unsigned char *title, struct seq_file *m,
static void sprinthx4(unsigned char *title, struct seq_file *m,
unsigned int *array, unsigned int len)
{
- int r;
-
seq_printf(m, "\n%s\n", title);
- for (r = 0; r < len; r++) {
- if ((r % 8) == 0)
- seq_printf(m, " ");
- seq_printf(m, "%08X ", array[r]);
- if ((r % 8) == 7)
- seq_putc(m, '\n');
- }
+ seq_hex_dump(m, " ", DUMP_PREFIX_NONE, 32, 4, array, len, false);
seq_putc(m, '\n');
}
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index b5b427888b24..95b330a9ea98 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -353,9 +353,16 @@ static struct sysrq_key_op sysrq_term_op = {
static void moom_callback(struct work_struct *ignored)
{
+ const gfp_t gfp_mask = GFP_KERNEL;
+ struct oom_control oc = {
+ .zonelist = node_zonelist(first_memory_node, gfp_mask),
+ .nodemask = NULL,
+ .gfp_mask = gfp_mask,
+ .order = -1,
+ };
+
mutex_lock(&oom_lock);
- if (!out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL),
- GFP_KERNEL, 0, NULL, true))
+ if (!out_of_memory(&oc))
pr_info("OOM request ignored because killer is disabled\n");
mutex_unlock(&oom_lock);
}
diff --git a/drivers/w1/masters/omap_hdq.c b/drivers/w1/masters/omap_hdq.c
index e7d448963a24..0e2f43bccf1f 100644
--- a/drivers/w1/masters/omap_hdq.c
+++ b/drivers/w1/masters/omap_hdq.c
@@ -17,6 +17,7 @@
#include <linux/io.h>
#include <linux/sched.h>
#include <linux/pm_runtime.h>
+#include <linux/of.h>
#include "../w1.h"
#include "../w1_int.h"
@@ -27,21 +28,23 @@
#define OMAP_HDQ_TX_DATA 0x04
#define OMAP_HDQ_RX_DATA 0x08
#define OMAP_HDQ_CTRL_STATUS 0x0c
-#define OMAP_HDQ_CTRL_STATUS_INTERRUPTMASK (1<<6)
-#define OMAP_HDQ_CTRL_STATUS_CLOCKENABLE (1<<5)
-#define OMAP_HDQ_CTRL_STATUS_GO (1<<4)
-#define OMAP_HDQ_CTRL_STATUS_INITIALIZATION (1<<2)
-#define OMAP_HDQ_CTRL_STATUS_DIR (1<<1)
-#define OMAP_HDQ_CTRL_STATUS_MODE (1<<0)
+#define OMAP_HDQ_CTRL_STATUS_SINGLE BIT(7)
+#define OMAP_HDQ_CTRL_STATUS_INTERRUPTMASK BIT(6)
+#define OMAP_HDQ_CTRL_STATUS_CLOCKENABLE BIT(5)
+#define OMAP_HDQ_CTRL_STATUS_GO BIT(4)
+#define OMAP_HDQ_CTRL_STATUS_PRESENCE BIT(3)
+#define OMAP_HDQ_CTRL_STATUS_INITIALIZATION BIT(2)
+#define OMAP_HDQ_CTRL_STATUS_DIR BIT(1)
#define OMAP_HDQ_INT_STATUS 0x10
-#define OMAP_HDQ_INT_STATUS_TXCOMPLETE (1<<2)
-#define OMAP_HDQ_INT_STATUS_RXCOMPLETE (1<<1)
-#define OMAP_HDQ_INT_STATUS_TIMEOUT (1<<0)
+#define OMAP_HDQ_INT_STATUS_TXCOMPLETE BIT(2)
+#define OMAP_HDQ_INT_STATUS_RXCOMPLETE BIT(1)
+#define OMAP_HDQ_INT_STATUS_TIMEOUT BIT(0)
#define OMAP_HDQ_SYSCONFIG 0x14
-#define OMAP_HDQ_SYSCONFIG_SOFTRESET (1<<1)
-#define OMAP_HDQ_SYSCONFIG_AUTOIDLE (1<<0)
+#define OMAP_HDQ_SYSCONFIG_SOFTRESET BIT(1)
+#define OMAP_HDQ_SYSCONFIG_AUTOIDLE BIT(0)
+#define OMAP_HDQ_SYSCONFIG_NOIDLE 0x0
#define OMAP_HDQ_SYSSTATUS 0x18
-#define OMAP_HDQ_SYSSTATUS_RESETDONE (1<<0)
+#define OMAP_HDQ_SYSSTATUS_RESETDONE BIT(0)
#define OMAP_HDQ_FLAG_CLEAR 0
#define OMAP_HDQ_FLAG_SET 1
@@ -67,6 +70,10 @@ struct hdq_data {
* the data wrire or read.
*/
int init_trans;
+ int rrw;
+ /* mode: 0-HDQ 1-W1 */
+ int mode;
+
};
static int omap_hdq_probe(struct platform_device *pdev);
@@ -74,6 +81,7 @@ static int omap_hdq_remove(struct platform_device *pdev);
static const struct of_device_id omap_hdq_dt_ids[] = {
{ .compatible = "ti,omap3-1w" },
+ { .compatible = "ti,am4372-hdq" },
{}
};
MODULE_DEVICE_TABLE(of, omap_hdq_dt_ids);
@@ -90,15 +98,12 @@ static struct platform_driver omap_hdq_driver = {
static u8 omap_w1_read_byte(void *_hdq);
static void omap_w1_write_byte(void *_hdq, u8 byte);
static u8 omap_w1_reset_bus(void *_hdq);
-static void omap_w1_search_bus(void *_hdq, struct w1_master *master_dev,
- u8 search_type, w1_slave_found_callback slave_found);
static struct w1_bus_master omap_w1_master = {
.read_byte = omap_w1_read_byte,
.write_byte = omap_w1_write_byte,
.reset_bus = omap_w1_reset_bus,
- .search = omap_w1_search_bus,
};
/* HDQ register I/O routines */
@@ -122,6 +127,15 @@ static inline u8 hdq_reg_merge(struct hdq_data *hdq_data, u32 offset,
return new_val;
}
+static void hdq_disable_interrupt(struct hdq_data *hdq_data, u32 offset,
+ u32 mask)
+{
+ u32 ie;
+
+ ie = readl(hdq_data->hdq_base + offset);
+ writel(ie & mask, hdq_data->hdq_base + offset);
+}
+
/*
* Wait for one or more bits in flag change.
* HDQ_FLAG_SET: wait until any bit in the flag is set.
@@ -229,13 +243,7 @@ static irqreturn_t hdq_isr(int irq, void *_hdq)
return IRQ_HANDLED;
}
-/* HDQ Mode: always return success */
-static u8 omap_w1_reset_bus(void *_hdq)
-{
- return 0;
-}
-
-/* W1 search callback function */
+/* W1 search callback function in HDQ mode */
static void omap_w1_search_bus(void *_hdq, struct w1_master *master_dev,
u8 search_type, w1_slave_found_callback slave_found)
{
@@ -262,9 +270,10 @@ static int _omap_hdq_reset(struct hdq_data *hdq_data)
int ret;
u8 tmp_status;
- hdq_reg_out(hdq_data, OMAP_HDQ_SYSCONFIG, OMAP_HDQ_SYSCONFIG_SOFTRESET);
+ hdq_reg_out(hdq_data, OMAP_HDQ_SYSCONFIG,
+ OMAP_HDQ_SYSCONFIG_SOFTRESET);
/*
- * Select HDQ mode & enable clocks.
+ * Select HDQ/1W mode & enable clocks.
* It is observed that INT flags can't be cleared via a read and GO/INIT
* won't return to zero if interrupt is disabled. So we always enable
* interrupt.
@@ -282,7 +291,8 @@ static int _omap_hdq_reset(struct hdq_data *hdq_data)
else {
hdq_reg_out(hdq_data, OMAP_HDQ_CTRL_STATUS,
OMAP_HDQ_CTRL_STATUS_CLOCKENABLE |
- OMAP_HDQ_CTRL_STATUS_INTERRUPTMASK);
+ OMAP_HDQ_CTRL_STATUS_INTERRUPTMASK |
+ hdq_data->mode);
hdq_reg_out(hdq_data, OMAP_HDQ_SYSCONFIG,
OMAP_HDQ_SYSCONFIG_AUTOIDLE);
}
@@ -334,6 +344,18 @@ static int omap_hdq_break(struct hdq_data *hdq_data)
ret = -ETIMEDOUT;
goto out;
}
+
+ /*
+ * check for the presence detect bit to get
+ * set to show that the slave is responding
+ */
+ if (!(hdq_reg_in(hdq_data, OMAP_HDQ_CTRL_STATUS) &
+ OMAP_HDQ_CTRL_STATUS_PRESENCE)) {
+ dev_dbg(hdq_data->dev, "Presence bit not set\n");
+ ret = -ETIMEDOUT;
+ goto out;
+ }
+
/*
* wait for both INIT and GO bits rerurn to zero.
* zero wait time expected for interrupt mode.
@@ -368,6 +390,8 @@ static int hdq_read_byte(struct hdq_data *hdq_data, u8 *val)
goto out;
}
+ hdq_data->hdq_irqstatus = 0;
+
if (!(hdq_data->hdq_irqstatus & OMAP_HDQ_INT_STATUS_RXCOMPLETE)) {
hdq_reg_merge(hdq_data, OMAP_HDQ_CTRL_STATUS,
OMAP_HDQ_CTRL_STATUS_DIR | OMAP_HDQ_CTRL_STATUS_GO,
@@ -400,7 +424,7 @@ rtn:
}
-/* Enable clocks and set the controller to HDQ mode */
+/* Enable clocks and set the controller to HDQ/1W mode */
static int omap_hdq_get(struct hdq_data *hdq_data)
{
int ret = 0;
@@ -422,7 +446,7 @@ static int omap_hdq_get(struct hdq_data *hdq_data)
pm_runtime_get_sync(hdq_data->dev);
- /* make sure HDQ is out of reset */
+ /* make sure HDQ/1W is out of reset */
if (!(hdq_reg_in(hdq_data, OMAP_HDQ_SYSSTATUS) &
OMAP_HDQ_SYSSTATUS_RESETDONE)) {
ret = _omap_hdq_reset(hdq_data);
@@ -430,12 +454,13 @@ static int omap_hdq_get(struct hdq_data *hdq_data)
/* back up the count */
hdq_data->hdq_usecount--;
} else {
- /* select HDQ mode & enable clocks */
+ /* select HDQ/1W mode & enable clocks */
hdq_reg_out(hdq_data, OMAP_HDQ_CTRL_STATUS,
OMAP_HDQ_CTRL_STATUS_CLOCKENABLE |
- OMAP_HDQ_CTRL_STATUS_INTERRUPTMASK);
+ OMAP_HDQ_CTRL_STATUS_INTERRUPTMASK |
+ hdq_data->mode);
hdq_reg_out(hdq_data, OMAP_HDQ_SYSCONFIG,
- OMAP_HDQ_SYSCONFIG_AUTOIDLE);
+ OMAP_HDQ_SYSCONFIG_NOIDLE);
hdq_reg_in(hdq_data, OMAP_HDQ_INT_STATUS);
}
}
@@ -456,6 +481,8 @@ static int omap_hdq_put(struct hdq_data *hdq_data)
if (ret < 0)
return -EINTR;
+ hdq_reg_out(hdq_data, OMAP_HDQ_SYSCONFIG,
+ OMAP_HDQ_SYSCONFIG_AUTOIDLE);
if (0 == hdq_data->hdq_usecount) {
dev_dbg(hdq_data->dev, "attempt to decrement use count"
" when it is zero");
@@ -471,6 +498,100 @@ static int omap_hdq_put(struct hdq_data *hdq_data)
return ret;
}
+/*
+ * W1 triplet callback function - used for searching ROM addresses.
+ * Registered only when controller is in 1-wire mode.
+ */
+static u8 omap_w1_triplet(void *_hdq, u8 bdir)
+{
+ u8 id_bit, comp_bit;
+ int err;
+ u8 ret = 0x3; /* no slaves responded */
+ struct hdq_data *hdq_data = _hdq;
+ u8 ctrl = OMAP_HDQ_CTRL_STATUS_SINGLE | OMAP_HDQ_CTRL_STATUS_GO |
+ OMAP_HDQ_CTRL_STATUS_INTERRUPTMASK;
+ u8 mask = ctrl | OMAP_HDQ_CTRL_STATUS_DIR;
+
+ omap_hdq_get(_hdq);
+
+ err = mutex_lock_interruptible(&hdq_data->hdq_mutex);
+ if (err < 0) {
+ dev_dbg(hdq_data->dev, "Could not acquire mutex\n");
+ goto rtn;
+ }
+
+ hdq_data->hdq_irqstatus = 0;
+ /* read id_bit */
+ hdq_reg_merge(_hdq, OMAP_HDQ_CTRL_STATUS,
+ ctrl | OMAP_HDQ_CTRL_STATUS_DIR, mask);
+ err = wait_event_timeout(hdq_wait_queue,
+ (hdq_data->hdq_irqstatus
+ & OMAP_HDQ_INT_STATUS_RXCOMPLETE),
+ OMAP_HDQ_TIMEOUT);
+ if (err == 0) {
+ dev_dbg(hdq_data->dev, "RX wait elapsed\n");
+ goto out;
+ }
+ id_bit = (hdq_reg_in(_hdq, OMAP_HDQ_RX_DATA) & 0x01);
+
+ hdq_data->hdq_irqstatus = 0;
+ /* read comp_bit */
+ hdq_reg_merge(_hdq, OMAP_HDQ_CTRL_STATUS,
+ ctrl | OMAP_HDQ_CTRL_STATUS_DIR, mask);
+ err = wait_event_timeout(hdq_wait_queue,
+ (hdq_data->hdq_irqstatus
+ & OMAP_HDQ_INT_STATUS_RXCOMPLETE),
+ OMAP_HDQ_TIMEOUT);
+ if (err == 0) {
+ dev_dbg(hdq_data->dev, "RX wait elapsed\n");
+ goto out;
+ }
+ comp_bit = (hdq_reg_in(_hdq, OMAP_HDQ_RX_DATA) & 0x01);
+
+ if (id_bit && comp_bit) {
+ ret = 0x03; /* no slaves responded */
+ goto out;
+ }
+ if (!id_bit && !comp_bit) {
+ /* Both bits are valid, take the direction given */
+ ret = bdir ? 0x04 : 0;
+ } else {
+ /* Only one bit is valid, take that direction */
+ bdir = id_bit;
+ ret = id_bit ? 0x05 : 0x02;
+ }
+
+ /* write bdir bit */
+ hdq_reg_out(_hdq, OMAP_HDQ_TX_DATA, bdir);
+ hdq_reg_merge(_hdq, OMAP_HDQ_CTRL_STATUS, ctrl, mask);
+ err = wait_event_timeout(hdq_wait_queue,
+ (hdq_data->hdq_irqstatus
+ & OMAP_HDQ_INT_STATUS_TXCOMPLETE),
+ OMAP_HDQ_TIMEOUT);
+ if (err == 0) {
+ dev_dbg(hdq_data->dev, "TX wait elapsed\n");
+ goto out;
+ }
+
+ hdq_reg_merge(_hdq, OMAP_HDQ_CTRL_STATUS, 0,
+ OMAP_HDQ_CTRL_STATUS_SINGLE);
+
+out:
+ mutex_unlock(&hdq_data->hdq_mutex);
+rtn:
+ omap_hdq_put(_hdq);
+ return ret;
+}
+
+/* reset callback */
+static u8 omap_w1_reset_bus(void *_hdq)
+{
+ omap_hdq_get(_hdq);
+ omap_hdq_break(_hdq);
+ omap_hdq_put(_hdq);
+ return 0;
+}
+
/* Read a byte of data from the device */
static u8 omap_w1_read_byte(void *_hdq)
{
@@ -478,6 +599,10 @@ static u8 omap_w1_read_byte(void *_hdq)
u8 val = 0;
int ret;
+ /* First write to initialize the transfer */
+ if (hdq_data->init_trans == 0)
+ omap_hdq_get(hdq_data);
+
ret = hdq_read_byte(hdq_data, &val);
if (ret) {
ret = mutex_lock_interruptible(&hdq_data->hdq_mutex);
@@ -491,6 +616,10 @@ static u8 omap_w1_read_byte(void *_hdq)
return -1;
}
+ hdq_disable_interrupt(hdq_data, OMAP_HDQ_CTRL_STATUS,
+ ~OMAP_HDQ_CTRL_STATUS_INTERRUPTMASK);
+ hdq_data->hdq_usecount = 0;
+
/* Write followed by a read, release the module */
if (hdq_data->init_trans) {
ret = mutex_lock_interruptible(&hdq_data->hdq_mutex);
@@ -517,6 +646,14 @@ static void omap_w1_write_byte(void *_hdq, u8 byte)
if (hdq_data->init_trans == 0)
omap_hdq_get(hdq_data);
+ /*
+ * We need to reset the slave before
+ * issuing the SKIP ROM command, else
+ * the slave will not work.
+ */
+ if (byte == W1_SKIP_ROM)
+ omap_hdq_break(hdq_data);
+
ret = mutex_lock_interruptible(&hdq_data->hdq_mutex);
if (ret < 0) {
dev_dbg(hdq_data->dev, "Could not acquire mutex\n");
@@ -551,6 +688,7 @@ static int omap_hdq_probe(struct platform_device *pdev)
struct resource *res;
int ret, irq;
u8 rev;
+ const char *mode;
hdq_data = devm_kzalloc(dev, sizeof(*hdq_data), GFP_KERNEL);
if (!hdq_data) {
@@ -567,10 +705,21 @@ static int omap_hdq_probe(struct platform_device *pdev)
return PTR_ERR(hdq_data->hdq_base);
hdq_data->hdq_usecount = 0;
+ hdq_data->rrw = 0;
mutex_init(&hdq_data->hdq_mutex);
pm_runtime_enable(&pdev->dev);
- pm_runtime_get_sync(&pdev->dev);
+ ret = pm_runtime_get_sync(&pdev->dev);
+ if (ret < 0) {
+ dev_dbg(&pdev->dev, "pm_runtime_get_sync failed\n");
+ goto err_w1;
+ }
+
+ ret = _omap_hdq_reset(hdq_data);
+ if (ret) {
+ dev_dbg(&pdev->dev, "reset failed\n");
+ return -EINVAL;
+ }
rev = hdq_reg_in(hdq_data, OMAP_HDQ_REVISION);
dev_info(&pdev->dev, "OMAP HDQ Hardware Rev %c.%c. Driver in %s mode\n",
@@ -594,6 +743,15 @@ static int omap_hdq_probe(struct platform_device *pdev)
pm_runtime_put_sync(&pdev->dev);
+ ret = of_property_read_string(pdev->dev.of_node, "ti,mode", &mode);
+ if (ret < 0 || !strcmp(mode, "hdq")) {
+ hdq_data->mode = 0;
+ omap_w1_master.search = omap_w1_search_bus;
+ } else {
+ hdq_data->mode = 1;
+ omap_w1_master.triplet = omap_w1_triplet;
+ }
+
omap_w1_master.data = hdq_data;
ret = w1_add_master_device(&omap_w1_master);
@@ -635,8 +793,8 @@ static int omap_hdq_remove(struct platform_device *pdev)
module_platform_driver(omap_hdq_driver);
module_param(w1_id, int, S_IRUSR);
-MODULE_PARM_DESC(w1_id, "1-wire id for the slave detection");
+MODULE_PARM_DESC(w1_id, "1-wire id for the slave detection in HDQ mode");
MODULE_AUTHOR("Texas Instruments");
-MODULE_DESCRIPTION("HDQ driver Library");
+MODULE_DESCRIPTION("HDQ-1W driver Library");
MODULE_LICENSE("GPL");
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 8aa56bb6e861..41a93bd60c46 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -116,7 +116,7 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
substring_t args[MAX_OPT_ARGS];
char *p;
int option = 0;
- char *s, *e;
+ char *s;
int ret = 0;
/* setup defaults */
@@ -269,8 +269,10 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
} else {
uid_t uid;
v9ses->flags |= V9FS_ACCESS_SINGLE;
- uid = simple_strtoul(s, &e, 10);
- if (*e != '\0') {
+ ret = parse_integer(s, 10, &uid);
+ if (ret < 0)
+ return ret;
+ if (s[ret] != '\0') {
ret = -EINVAL;
pr_info("Unknown access argument %s\n",
s);
diff --git a/fs/Makefile b/fs/Makefile
index 09e051fefc5b..f79cf4043e60 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_ANON_INODES) += anon_inodes.o
obj-$(CONFIG_SIGNALFD) += signalfd.o
obj-$(CONFIG_TIMERFD) += timerfd.o
obj-$(CONFIG_EVENTFD) += eventfd.o
+obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_AIO) += aio.o
obj-$(CONFIG_FS_DAX) += dax.o
obj-$(CONFIG_FILE_LOCKING) += locks.o
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 3f89c9e05b40..5b50c4ca43a7 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -18,6 +18,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/writeback.h>
+#include <linux/blkdev.h>
#include "affs.h"
static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
@@ -352,18 +353,19 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
* blocks, we will have to change it.
*/
- size = sb->s_bdev->bd_inode->i_size >> 9;
+ size = i_size_read(sb->s_bdev->bd_inode) >> 9;
pr_debug("initial blocksize=%d, #blocks=%d\n", 512, size);
affs_set_blocksize(sb, PAGE_SIZE);
/* Try to find root block. Its location depends on the block size. */
- i = 512;
- j = 4096;
+ i = bdev_logical_block_size(sb->s_bdev);
+ j = PAGE_SIZE;
if (blocksize > 0) {
i = j = blocksize;
size = size / (blocksize / 512);
}
+
for (blocksize = i; blocksize <= j; blocksize <<= 1, size >>= 1) {
sbi->s_root_block = root_block;
if (root_block < 0)
diff --git a/fs/aio.c b/fs/aio.c
index 480440f4701f..155f84253f33 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -308,15 +308,9 @@ static void aio_free_ring(struct kioctx *ctx)
}
}
-static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
-{
- vma->vm_flags |= VM_DONTEXPAND;
- vma->vm_ops = &generic_file_vm_ops;
- return 0;
-}
-
-static int aio_ring_remap(struct file *file, struct vm_area_struct *vma)
+static int aio_ring_mremap(struct vm_area_struct *vma)
{
+ struct file *file = vma->vm_file;
struct mm_struct *mm = vma->vm_mm;
struct kioctx_table *table;
int i, res = -EINVAL;
@@ -342,9 +336,24 @@ static int aio_ring_remap(struct file *file, struct vm_area_struct *vma)
return res;
}
+static const struct vm_operations_struct aio_ring_vm_ops = {
+ .mremap = aio_ring_mremap,
+#if IS_ENABLED(CONFIG_MMU)
+ .fault = filemap_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = filemap_page_mkwrite,
+#endif
+};
+
+static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ vma->vm_flags |= VM_DONTEXPAND;
+ vma->vm_ops = &aio_ring_vm_ops;
+ return 0;
+}
+
static const struct file_operations aio_ring_fops = {
.mmap = aio_ring_mmap,
- .mremap = aio_ring_remap,
};
#if IS_ENABLED(CONFIG_MIGRATION)
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 78f005f37847..75df4264176d 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -47,7 +47,7 @@ enum {Enabled, Magic};
typedef struct {
struct list_head list;
unsigned long flags; /* type, status, etc. */
- int offset; /* offset of magic */
+ unsigned int offset; /* offset of magic */
int size; /* size of magic/mask */
char *magic; /* magic or filename extension */
char *mask; /* mask, NULL for exact match */
@@ -370,7 +370,13 @@ static Node *create_entry(const char __user *buffer, size_t count)
if (!s)
goto einval;
*s++ = '\0';
- e->offset = simple_strtoul(p, &p, 10);
+ err = parse_integer(p, 10, &e->offset);
+ if (err < 0) {
+ kfree(e);
+ goto out;
+
+ }
+ p += err;
if (*p++)
goto einval;
pr_debug("register: offset: %#x\n", e->offset);
@@ -548,7 +554,7 @@ static void entry_status(Node *e, char *page)
if (!test_bit(Magic, &e->flags)) {
sprintf(dp, "extension .%s\n", e->magic);
} else {
- dp += sprintf(dp, "offset %i\nmagic ", e->offset);
+ dp += sprintf(dp, "offset %u\nmagic ", e->offset);
dp = bin2hex(dp, e->magic, e->size);
if (e->mask) {
dp += sprintf(dp, "\nmask ");
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 198243717da5..9be2d7eda3c3 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -28,6 +28,7 @@
#include <linux/namei.h>
#include <linux/log2.h>
#include <linux/cleancache.h>
+#include <linux/dax.h>
#include <asm/uaccess.h>
#include "internal.h"
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index f601def05bdf..cc1a9210f1a6 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -326,14 +326,15 @@ static int cachefiles_daemon_range_error(struct cachefiles_cache *cache,
*/
static int cachefiles_daemon_frun(struct cachefiles_cache *cache, char *args)
{
- unsigned long frun;
+ unsigned int frun;
+ int rv;
_enter(",%s", args);
- if (!*args)
- return -EINVAL;
-
- frun = simple_strtoul(args, &args, 10);
+ rv = parse_integer(args, 10, &frun);
+ if (rv < 0)
+ return rv;
+ args += rv;
if (args[0] != '%' || args[1] != '\0')
return -EINVAL;
@@ -350,14 +351,15 @@ static int cachefiles_daemon_frun(struct cachefiles_cache *cache, char *args)
*/
static int cachefiles_daemon_fcull(struct cachefiles_cache *cache, char *args)
{
- unsigned long fcull;
+ unsigned int fcull;
+ int rv;
_enter(",%s", args);
- if (!*args)
- return -EINVAL;
-
- fcull = simple_strtoul(args, &args, 10);
+ rv = parse_integer(args, 10, &fcull);
+ if (rv < 0)
+ return rv;
+ args += rv;
if (args[0] != '%' || args[1] != '\0')
return -EINVAL;
@@ -374,14 +376,15 @@ static int cachefiles_daemon_fcull(struct cachefiles_cache *cache, char *args)
*/
static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args)
{
- unsigned long fstop;
+ unsigned int fstop;
+ int rv;
_enter(",%s", args);
- if (!*args)
- return -EINVAL;
-
- fstop = simple_strtoul(args, &args, 10);
+ rv = parse_integer(args, 10, &fstop);
+ if (rv < 0)
+ return rv;
+ args += rv;
if (args[0] != '%' || args[1] != '\0')
return -EINVAL;
@@ -398,14 +401,15 @@ static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args)
*/
static int cachefiles_daemon_brun(struct cachefiles_cache *cache, char *args)
{
- unsigned long brun;
+ unsigned int brun;
+ int rv;
_enter(",%s", args);
- if (!*args)
- return -EINVAL;
-
- brun = simple_strtoul(args, &args, 10);
+ rv = parse_integer(args, 10, &brun);
+ if (rv < 0)
+ return rv;
+ args += rv;
if (args[0] != '%' || args[1] != '\0')
return -EINVAL;
@@ -422,14 +426,15 @@ static int cachefiles_daemon_brun(struct cachefiles_cache *cache, char *args)
*/
static int cachefiles_daemon_bcull(struct cachefiles_cache *cache, char *args)
{
- unsigned long bcull;
+ unsigned int bcull;
+ int rv;
_enter(",%s", args);
- if (!*args)
- return -EINVAL;
-
- bcull = simple_strtoul(args, &args, 10);
+ rv = parse_integer(args, 10, &bcull);
+ if (rv < 0)
+ return rv;
+ args += rv;
if (args[0] != '%' || args[1] != '\0')
return -EINVAL;
@@ -446,14 +451,15 @@ static int cachefiles_daemon_bcull(struct cachefiles_cache *cache, char *args)
*/
static int cachefiles_daemon_bstop(struct cachefiles_cache *cache, char *args)
{
- unsigned long bstop;
+ unsigned int bstop;
+ int rv;
_enter(",%s", args);
- if (!*args)
- return -EINVAL;
-
- bstop = simple_strtoul(args, &args, 10);
+ rv = parse_integer(args, 10, &bstop);
+ if (rv < 0)
+ return rv;
+ args += rv;
if (args[0] != '%' || args[1] != '\0')
return -EINVAL;
@@ -601,21 +607,21 @@ inval:
*/
static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args)
{
- unsigned long mask;
+ unsigned int mask;
+ int rv;
_enter(",%s", args);
- mask = simple_strtoul(args, &args, 0);
- if (args[0] != '\0')
- goto inval;
-
+ rv = parse_integer(args, 0, &mask);
+ if (rv < 0)
+ return rv;
+ if (args[rv] != '\0') {
+ pr_err("debug command requires mask\n");
+ return -EINVAL;
+ }
cachefiles_debug = mask;
_leave(" = 0");
return 0;
-
-inval:
- pr_err("debug command requires mask\n");
- return -EINVAL;
}
/*
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 3f50cee79df9..2ac2d8471393 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3390,13 +3390,13 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
* should have access to this page, we're safe to simply set
* PG_locked without checking it first.
*/
- __set_page_locked(page);
+ __SetPageLocked(page);
rc = add_to_page_cache_locked(page, mapping,
page->index, GFP_KERNEL);
/* give up if we can't stick it in the cache */
if (rc) {
- __clear_page_locked(page);
+ __ClearPageLocked(page);
return rc;
}
@@ -3417,10 +3417,10 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
if (*bytes + PAGE_CACHE_SIZE > rsize)
break;
- __set_page_locked(page);
+ __SetPageLocked(page);
if (add_to_page_cache_locked(page, mapping, page->index,
GFP_KERNEL)) {
- __clear_page_locked(page);
+ __ClearPageLocked(page);
break;
}
list_move_tail(&page->lru, tmplist);
diff --git a/fs/dax.c b/fs/dax.c
index c3e21ccfc358..4e27a640b5e4 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -484,6 +484,158 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
}
EXPORT_SYMBOL_GPL(dax_fault);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * The 'colour' (ie low bits) within a PMD of a page offset. This comes up
+ * more often than one might expect in the below function.
+ */
+#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
+
+int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, unsigned int flags, get_block_t get_block,
+ dax_iodone_t complete_unwritten)
+{
+ struct file *file = vma->vm_file;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ struct buffer_head bh;
+ unsigned blkbits = inode->i_blkbits;
+ unsigned long pmd_addr = address & PMD_MASK;
+ bool write = flags & FAULT_FLAG_WRITE;
+ long length;
+ void *kaddr;
+ pgoff_t size, pgoff;
+ sector_t block, sector;
+ unsigned long pfn;
+ int result = 0;
+
+ /* Fall back to PTEs if we're going to COW */
+ if (write && !(vma->vm_flags & VM_SHARED))
+ return VM_FAULT_FALLBACK;
+ /* If the PMD would extend outside the VMA */
+ if (pmd_addr < vma->vm_start)
+ return VM_FAULT_FALLBACK;
+ if ((pmd_addr + PMD_SIZE) > vma->vm_end)
+ return VM_FAULT_FALLBACK;
+
+ pgoff = ((pmd_addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (pgoff >= size)
+ return VM_FAULT_SIGBUS;
+ /* If the PMD would cover blocks out of the file */
+ if ((pgoff | PG_PMD_COLOUR) >= size)
+ return VM_FAULT_FALLBACK;
+
+ memset(&bh, 0, sizeof(bh));
+ block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
+
+ bh.b_size = PMD_SIZE;
+ length = get_block(inode, block, &bh, write);
+ if (length)
+ return VM_FAULT_SIGBUS;
+ i_mmap_lock_read(mapping);
+
+ /*
+ * If the filesystem isn't willing to tell us the length of a hole,
+ * just fall back to PTEs. Calling get_block 512 times in a loop
+ * would be silly.
+ */
+ if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE)
+ goto fallback;
+
+ /* Guard against a race with truncate */
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (pgoff >= size) {
+ result = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ if ((pgoff | PG_PMD_COLOUR) >= size)
+ goto fallback;
+
+ if (is_huge_zero_pmd(*pmd))
+ unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
+
+ if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
+ bool set;
+ spinlock_t *ptl;
+ struct mm_struct *mm = vma->vm_mm;
+ struct page *zero_page = get_huge_zero_page();
+ if (unlikely(!zero_page))
+ goto fallback;
+
+ ptl = pmd_lock(mm, pmd);
+ set = set_huge_zero_page(NULL, mm, vma, pmd_addr, pmd,
+ zero_page);
+ spin_unlock(ptl);
+ result = VM_FAULT_NOPAGE;
+ } else {
+ sector = bh.b_blocknr << (blkbits - 9);
+ length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn,
+ bh.b_size);
+ if (length < 0) {
+ result = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR))
+ goto fallback;
+
+ if (buffer_unwritten(&bh) || buffer_new(&bh)) {
+ int i;
+ for (i = 0; i < PTRS_PER_PMD; i++)
+ clear_page(kaddr + i * PAGE_SIZE);
+ count_vm_event(PGMAJFAULT);
+ mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+ result |= VM_FAULT_MAJOR;
+ }
+
+ result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write);
+ }
+
+ out:
+ i_mmap_unlock_read(mapping);
+
+ if (buffer_unwritten(&bh))
+ complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
+
+ return result;
+
+ fallback:
+ count_vm_event(THP_FAULT_FALLBACK);
+ result = VM_FAULT_FALLBACK;
+ goto out;
+}
+EXPORT_SYMBOL_GPL(__dax_pmd_fault);
+
+/**
+ * dax_pmd_fault - handle a PMD fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * When a page fault occurs, filesystems may call this helper in their
+ * pmd_fault handler for DAX files.
+ */
+int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, unsigned int flags, get_block_t get_block,
+ dax_iodone_t complete_unwritten)
+{
+ int result;
+ struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+
+ if (flags & FAULT_FLAG_WRITE) {
+ sb_start_pagefault(sb);
+ file_update_time(vma->vm_file);
+ }
+ result = __dax_pmd_fault(vma, address, pmd, flags, get_block,
+ complete_unwritten);
+ if (flags & FAULT_FLAG_WRITE)
+ sb_end_pagefault(sb);
+
+ return result;
+}
+EXPORT_SYMBOL_GPL(dax_pmd_fault);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
/**
* dax_pfn_mkwrite - handle first write to DAX page
* @vma: The virtual memory area where the fault occurred
diff --git a/fs/dcache.c b/fs/dcache.c
index 5c8ea15e73a5..258685b5e039 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -3369,7 +3369,7 @@ static int __init set_dhash_entries(char *str)
{
if (!str)
return 0;
- dhash_entries = simple_strtoul(str, &str, 0);
+ parse_integer(str, 0, &dhash_entries);
return 1;
}
__setup("dhash_entries=", set_dhash_entries);
@@ -3442,22 +3442,15 @@ void __init vfs_caches_init_early(void)
inode_init_early();
}
-void __init vfs_caches_init(unsigned long mempages)
+void __init vfs_caches_init(void)
{
- unsigned long reserve;
-
- /* Base hash sizes on available memory, with a reserve equal to
- 150% of current kernel size */
-
- reserve = min((mempages - nr_free_pages()) * 3/2, mempages - 1);
- mempages -= reserve;
-
names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
dcache_init();
inode_init();
- files_init(mempages);
+ files_init();
+ files_maxfiles_init();
mnt_init();
bdev_cache_init();
chrdev_init();
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index b795c567b5e1..d317441fe809 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -84,6 +84,7 @@ static int parse_options(char *options, struct exofs_mountopt *opts)
substring_t args[MAX_OPT_ARGS];
int option;
bool s_pid = false;
+ int rv;
EXOFS_DBGMSG("parse_options %s\n", options);
/* defaults */
@@ -92,7 +93,6 @@ static int parse_options(char *options, struct exofs_mountopt *opts)
while ((p = strsep(&options, ",")) != NULL) {
int token;
- char str[32];
if (!*p)
continue;
@@ -108,9 +108,11 @@ static int parse_options(char *options, struct exofs_mountopt *opts)
opts->is_osdname = true;
break;
case Opt_pid:
- if (0 == match_strlcpy(str, &args[0], sizeof(str)))
+ rv = parse_integer(args[0].from, 0, &opts->pid);
+ if (rv < 0)
+ return rv;
+ if (args[0].from[rv] != '\0')
return -EINVAL;
- opts->pid = simple_strtoull(str, NULL, 0);
if (opts->pid < EXOFS_MIN_PID) {
EXOFS_ERR("Partition ID must be >= %u",
EXOFS_MIN_PID);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 3b57c9f83c9b..1982c3f11aec 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -20,6 +20,7 @@
#include <linux/time.h>
#include <linux/pagemap.h>
+#include <linux/dax.h>
#include <linux/quotaops.h>
#include "ext2.h"
#include "xattr.h"
@@ -31,6 +32,12 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
return dax_fault(vma, vmf, ext2_get_block, NULL);
}
+/*
+ * ext2 ->pmd_fault handler for DAX mappings: forwards to dax_pmd_fault()
+ * with ext2_get_block as the block-mapping method and NULL as the
+ * completion callback.
+ */
+static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, unsigned int flags)
+{
+ return dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL);
+}
+
static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
return dax_mkwrite(vma, vmf, ext2_get_block, NULL);
@@ -38,6 +45,7 @@ static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
static const struct vm_operations_struct ext2_dax_vm_ops = {
.fault = ext2_dax_fault,
+ .pmd_fault = ext2_dax_pmd_fault,
.page_mkwrite = ext2_dax_mkwrite,
.pfn_mkwrite = dax_pfn_mkwrite,
};
@@ -49,7 +57,7 @@ static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
file_accessed(file);
vma->vm_ops = &ext2_dax_vm_ops;
- vma->vm_flags |= VM_MIXEDMAP;
+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
return 0;
}
#else
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index a3a404c5df2e..c60a248c640c 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -25,6 +25,7 @@
#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
+#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 900e19cf9ef6..a08ac730a38f 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -383,16 +383,18 @@ static unsigned long get_sb_block(void **data)
{
unsigned long sb_block;
char *options = (char *) *data;
+ int rv;
if (!options || strncmp(options, "sb=", 3) != 0)
return 1; /* Default location */
options += 3;
- sb_block = simple_strtoul(options, &options, 0);
- if (*options && *options != ',') {
+ rv = parse_integer(options, 0, &sb_block);
+ if (rv < 0 || (options[rv] && options[rv] != ',')) {
printk("EXT2-fs: Invalid sb specification: %s\n",
(char *) *data);
return 1;
}
+ options += rv;
if (*options == ',')
options++;
*data = (void *) options;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index bc313ac5d3fa..953d519e799c 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -22,6 +22,7 @@
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/path.h>
+#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/pagevec.h>
#include <linux/uio.h>
@@ -210,6 +211,13 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
/* Is this the right get_block? */
}
+/*
+ * ext4 ->pmd_fault handler for DAX mappings: forwards to dax_pmd_fault()
+ * with ext4_get_block_write as the block-mapping method and
+ * ext4_end_io_unwritten as the completion callback.
+ */
+static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, unsigned int flags)
+{
+ return dax_pmd_fault(vma, addr, pmd, flags, ext4_get_block_write,
+ ext4_end_io_unwritten);
+}
+
static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
@@ -217,6 +225,7 @@ static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
static const struct vm_operations_struct ext4_dax_vm_ops = {
.fault = ext4_dax_fault,
+ .pmd_fault = ext4_dax_pmd_fault,
.page_mkwrite = ext4_dax_mkwrite,
.pfn_mkwrite = dax_pfn_mkwrite,
};
@@ -244,7 +253,7 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
file_accessed(file);
if (IS_DAX(file_inode(file))) {
vma->vm_ops = &ext4_dax_vm_ops;
- vma->vm_flags |= VM_MIXEDMAP;
+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
} else {
vma->vm_ops = &ext4_file_vm_ops;
}
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 8850254136ae..7002467bfbac 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -106,7 +106,10 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
}
if (!journal) {
- ret = generic_file_fsync(file, start, end, datasync);
+ if (test_opt(inode->i_sb, BARRIER))
+ ret = generic_file_fsync(file, start, end, datasync);
+ else
+ ret = __generic_file_fsync(file, start, end, datasync);
if (!ret && !hlist_empty(&inode->i_dentry))
ret = ext4_sync_parent(inode);
goto out;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 4f6ac499f09e..2468261748b2 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -22,6 +22,7 @@
#include "ext4_jbd2.h"
#include "truncate.h"
+#include <linux/dax.h>
#include <linux/uio.h>
#include <trace/events/ext4.h>
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index fed7ee7ea6e8..85e7bf6f6533 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -22,6 +22,7 @@
#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
+#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 06b4b14e8aa0..03b484a8e132 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1234,18 +1234,19 @@ static ext4_fsblk_t get_sb_block(void **data)
{
ext4_fsblk_t sb_block;
char *options = (char *) *data;
+ int rv;
if (!options || strncmp(options, "sb=", 3) != 0)
return 1; /* Default location */
options += 3;
- /* TODO: use simple_strtoll with >32bit ext4 */
- sb_block = simple_strtoul(options, &options, 0);
- if (*options && *options != ',') {
+ rv = parse_integer(options, 0, &sb_block);
+ if (rv < 0 || (options[rv] && options[rv] != ',')) {
printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
(char *) *data);
return 1;
}
+ options += rv;
if (*options == ',')
options++;
*data = (void *) options;
@@ -2515,10 +2516,10 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
struct ext4_sb_info *sbi,
const char *buf, size_t count)
{
- unsigned long t;
+ unsigned int t;
int ret;
- ret = kstrtoul(skip_spaces(buf), 0, &t);
+ ret = kstrtouint(skip_spaces(buf), 0, &t);
if (ret)
return ret;
@@ -2542,13 +2543,11 @@ static ssize_t sbi_ui_store(struct ext4_attr *a,
const char *buf, size_t count)
{
unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
- unsigned long t;
int ret;
- ret = kstrtoul(skip_spaces(buf), 0, &t);
+ ret = kstrtouint(skip_spaces(buf), 0, ui);
if (ret)
return ret;
- *ui = t;
return count;
}
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 93fc62232ec2..5d384921524d 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -301,15 +301,59 @@ static int fat_bmap_cluster(struct inode *inode, int cluster)
return dclus;
}
-int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
- unsigned long *mapped_blocks, int create)
+/*
+ * fat_get_mapped_cluster - translate a file-relative sector to a disk block
+ *
+ * Looks up the cluster holding @sector with fat_bmap_cluster().  A negative
+ * lookup result is returned unchanged.  If a cluster is found, *@bmap
+ * receives the matching block number and *@mapped_blocks the number of
+ * blocks from @sector to the end of that cluster, clamped to
+ * @last_block - @sector.  If the lookup yields 0, neither output is written.
+ * Returns 0 in both of the latter cases.
+ */
+int fat_get_mapped_cluster(struct inode *inode, sector_t sector,
+ sector_t last_block,
+ unsigned long *mapped_blocks, sector_t *bmap)
 {
 struct super_block *sb = inode->i_sb;
 struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ int cluster, offset;
+
+ cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits);
+ offset = sector & (sbi->sec_per_clus - 1); /* position within the cluster */
+ cluster = fat_bmap_cluster(inode, cluster);
+ if (cluster < 0)
+ return cluster;
+ else if (cluster) {
+ *bmap = fat_clus_to_blknr(sbi, cluster) + offset;
+ *mapped_blocks = sbi->sec_per_clus - offset;
+ if (*mapped_blocks > last_block - sector) /* do not map past last_block */
+ *mapped_blocks = last_block - sector;
+ }
+
+ return 0;
+}
+
+/*
+ * is_exceed_eof - test whether @sector lies at or beyond the end of the file
+ *
+ * *@last_block is first computed from i_size, rounded up to whole blocks.
+ * If @sector is not below that limit and @create is set, *@last_block is
+ * recomputed from ->mmu_private and the test is repeated.
+ *
+ * Returns 1 when @sector is at or past the applicable limit, 0 otherwise.
+ */
+static int is_exceed_eof(struct inode *inode, sector_t sector,
+ sector_t *last_block, int create)
+{
+ struct super_block *sb = inode->i_sb;
 const unsigned long blocksize = sb->s_blocksize;
 const unsigned char blocksize_bits = sb->s_blocksize_bits;
+
+ *last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits;
+ if (sector >= *last_block) {
+ if (!create)
+ return 1;
+
+ /*
+ * ->mmu_private may only be accessed on the allocation path
+ * (the caller must hold ->i_mutex).
+ */
+ *last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1))
+ >> blocksize_bits;
+ if (sector >= *last_block)
+ return 1;
+ }
+
+ return 0;
+}
+
+int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
+ unsigned long *mapped_blocks, int create, bool from_bmap)
+{
+ struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
sector_t last_block;
- int cluster, offset;
*phys = 0;
*mapped_blocks = 0;
@@ -321,31 +365,16 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
return 0;
}
- last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits;
- if (sector >= last_block) {
- if (!create)
+ if (!from_bmap) {
+ if (is_exceed_eof(inode, sector, &last_block, create))
return 0;
-
- /*
- * ->mmu_private can access on only allocation path.
- * (caller must hold ->i_mutex)
- */
- last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1))
- >> blocksize_bits;
+ } else {
+ last_block = inode->i_blocks >>
+ (inode->i_sb->s_blocksize_bits - 9);
if (sector >= last_block)
return 0;
}
- cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits);
- offset = sector & (sbi->sec_per_clus - 1);
- cluster = fat_bmap_cluster(inode, cluster);
- if (cluster < 0)
- return cluster;
- else if (cluster) {
- *phys = fat_clus_to_blknr(sbi, cluster) + offset;
- *mapped_blocks = sbi->sec_per_clus - offset;
- if (*mapped_blocks > last_block - sector)
- *mapped_blocks = last_block - sector;
- }
- return 0;
+ return fat_get_mapped_cluster(inode, sector, last_block, mapped_blocks,
+ phys);
}
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 4afc4d9d2e41..4c71c8c76426 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -91,7 +91,7 @@ next:
*bh = NULL;
iblock = *pos >> sb->s_blocksize_bits;
- err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0);
+ err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0, false);
if (err || !phys)
return -1; /* beyond EOF or error */
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index be5e15323bab..4307cd4f8da0 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -285,8 +285,11 @@ static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len)
extern void fat_cache_inval_inode(struct inode *inode);
extern int fat_get_cluster(struct inode *inode, int cluster,
int *fclus, int *dclus);
+extern int fat_get_mapped_cluster(struct inode *inode, sector_t sector,
+ sector_t last_block,
+ unsigned long *mapped_blocks, sector_t *bmap);
extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
- unsigned long *mapped_blocks, int create);
+ unsigned long *mapped_blocks, int create, bool from_bmap);
/* fat/dir.c */
extern const struct file_operations fat_dir_operations;
@@ -384,6 +387,7 @@ static inline unsigned long fat_dir_hash(int logstart)
{
return hash_32(logstart, FAT_HASH_BITS);
}
+extern int fat_add_cluster(struct inode *inode);
/* fat/misc.c */
extern __printf(3, 4) __cold
diff --git a/fs/fat/file.c b/fs/fat/file.c
index a08f1039909a..43d3475da83a 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -14,8 +14,12 @@
#include <linux/backing-dev.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
+#include <linux/falloc.h>
#include "fat.h"
+static long fat_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t len);
+
static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)
{
u32 attr;
@@ -177,6 +181,7 @@ const struct file_operations fat_file_operations = {
#endif
.fsync = fat_file_fsync,
.splice_read = generic_file_splice_read,
+ .fallocate = fat_fallocate,
};
static int fat_cont_expand(struct inode *inode, loff_t size)
@@ -215,6 +220,62 @@ out:
return err;
}
+/*
+ * Preallocate space for a file. This implements fat's fallocate file
+ * operation, which gets called from the sys_fallocate system call. User
+ * space requests len bytes at offset. If FALLOC_FL_KEEP_SIZE is set
+ * we just allocate clusters without zeroing them out. Otherwise we
+ * allocate and zero out clusters via an expanding truncate.
+ *
+ * Returns -EOPNOTSUPP for any mode bit other than FALLOC_FL_KEEP_SIZE and
+ * for anything that is not a regular file; otherwise 0, or the error
+ * returned by fat_add_cluster() / fat_cont_expand().  A request that ends
+ * within the space already covered returns 0 without allocating.
+ */
+static long fat_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t len)
+{
+ int nr_cluster; /* Number of clusters to be allocated */
+ loff_t mm_bytes; /* Number of bytes to be allocated for file */
+ loff_t ondisksize; /* block aligned on-disk size in bytes */
+ struct inode *inode = file->f_mapping->host;
+ struct super_block *sb = inode->i_sb;
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ int err = 0;
+
+ /* No support for hole punch or other fallocate flags. */
+ if (mode & ~FALLOC_FL_KEEP_SIZE)
+ return -EOPNOTSUPP;
+
+ /* Only regular files are supported (this also rejects directories). */
+ if (!S_ISREG(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ mutex_lock(&inode->i_mutex);
+ if (mode & FALLOC_FL_KEEP_SIZE) {
+ ondisksize = inode->i_blocks << 9; /* i_blocks counts 512-byte units */
+ if ((offset + len) <= ondisksize)
+ goto error; /* nothing to do; err is still 0 */
+
+ /* First compute the number of clusters to be allocated */
+ mm_bytes = offset + len - ondisksize;
+ nr_cluster = (mm_bytes + (sbi->cluster_size - 1)) >>
+ sbi->cluster_bits;
+
+ /* Start the allocation. We are not zeroing out the clusters. */
+ while (nr_cluster-- > 0) {
+ err = fat_add_cluster(inode);
+ if (err)
+ goto error;
+ }
+ } else {
+ if ((offset + len) <= i_size_read(inode))
+ goto error; /* nothing to do; err is still 0 */
+
+ /* This is just an expanding truncate */
+ err = fat_cont_expand(inode, (offset + len));
+ }
+
+error: /* common exit: reached on success as well as on failure */
+ mutex_unlock(&inode->i_mutex);
+ return err;
+}
+
/* Free all clusters after the skip'th cluster. */
static int fat_free(struct inode *inode, int skip)
{
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 509411dd3698..d04c87da4255 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -93,7 +93,7 @@ static struct fat_floppy_defaults {
},
};
-static int fat_add_cluster(struct inode *inode)
+int fat_add_cluster(struct inode *inode)
{
int err, cluster;
@@ -115,10 +115,10 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
struct super_block *sb = inode->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
unsigned long mapped_blocks;
- sector_t phys;
+ sector_t phys, last_block;
int err, offset;
- err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
+ err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false);
if (err)
return err;
if (phys) {
@@ -135,8 +135,14 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
return -EIO;
}
+ last_block = inode->i_blocks >> (sb->s_blocksize_bits - 9);
offset = (unsigned long)iblock & (sbi->sec_per_clus - 1);
- if (!offset) {
+ /*
+ * allocate a cluster according to the following.
+ * 1) no more available blocks
+ * 2) not part of fallocate region
+ */
+ if (!offset && !(iblock < last_block)) {
/* TODO: multiple cluster allocation would be desirable. */
err = fat_add_cluster(inode);
if (err)
@@ -148,7 +154,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
*max_blocks = min(mapped_blocks, *max_blocks);
MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits;
- err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
+ err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false);
if (err)
return err;
@@ -273,13 +279,38 @@ static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
return ret;
}
+/*
+ * fat_get_block_bmap - lookup-only get_block callback
+ *
+ * Calls fat_bmap() with from_bmap == true.  @create must be 0 (enforced
+ * with BUG_ON).  If a block is found, @bh_result is mapped to it and the
+ * reported size is limited to the number of mapped blocks; otherwise
+ * @bh_result is left unmapped.  Returns the error from fat_bmap(), or 0.
+ */
+static int fat_get_block_bmap(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ struct super_block *sb = inode->i_sb;
+ unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
+ int err;
+ sector_t bmap;
+ unsigned long mapped_blocks;
+
+ BUG_ON(create != 0); /* this callback never allocates */
+
+ err = fat_bmap(inode, iblock, &bmap, &mapped_blocks, create, true);
+ if (err)
+ return err;
+
+ if (bmap) {
+ map_bh(bh_result, sb, bmap);
+ max_blocks = min(mapped_blocks, max_blocks);
+ }
+
+ bh_result->b_size = max_blocks << sb->s_blocksize_bits;
+
+ return 0;
+}
+
static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
{
sector_t blocknr;
/* fat_get_cluster() assumes the requested blocknr isn't truncated. */
down_read(&MSDOS_I(mapping->host)->truncate_lock);
- blocknr = generic_block_bmap(mapping, block, fat_get_block);
+ blocknr = generic_block_bmap(mapping, block, fat_get_block_bmap);
up_read(&MSDOS_I(mapping->host)->truncate_lock);
return blocknr;
@@ -553,13 +584,43 @@ out:
EXPORT_SYMBOL_GPL(fat_build_inode);
+static int __fat_write_inode(struct inode *inode, int wait);
+
+static void fat_free_eofblocks(struct inode *inode)
+{
+ /*
+ * Release unwritten fallocated blocks on inode eviction.
+ *
+ * Blocks are considered left over when the on-disk size
+ * (i_blocks * 512) exceeds ->mmu_private rounded up to a whole
+ * cluster.
+ */
+ if ((inode->i_blocks << 9) >
+ round_up(MSDOS_I(inode)->mmu_private,
+ MSDOS_SB(inode->i_sb)->cluster_size)) {
+ int err;
+
+ fat_truncate_blocks(inode, MSDOS_I(inode)->mmu_private);
+ /* Fallocate results in updating the i_start/i_logstart
+ * for the zero byte file. So, make it return to
+ * original state during evict and commit it to avoid
+ * any corruption on the next access to the cluster
+ * chain for the file.
+ */
+ err = __fat_write_inode(inode, inode_needs_sync(inode));
+ if (err) {
+ /* Failure is only reported; eviction continues. */
+ fat_msg(inode->i_sb, KERN_WARNING, "Failed to "
+ "update on disk inode for unused "
+ "fallocated blocks, inode could be "
+ "corrupted. Please run fsck");
+ }
+
+ }
+}
+
static void fat_evict_inode(struct inode *inode)
{
truncate_inode_pages_final(&inode->i_data);
if (!inode->i_nlink) {
inode->i_size = 0;
fat_truncate_blocks(inode, 0);
- }
+ } else
+ fat_free_eofblocks(inode);
+
invalidate_inode_buffers(inode);
clear_inode(inode);
fat_cache_inval_inode(inode);
diff --git a/fs/file_table.c b/fs/file_table.c
index 7f9d407c7595..ad17e05ebf95 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -25,6 +25,7 @@
#include <linux/hardirq.h>
#include <linux/task_work.h>
#include <linux/ima.h>
+#include <linux/swap.h>
#include <linux/atomic.h>
@@ -308,19 +309,24 @@ void put_filp(struct file *file)
}
}
-void __init files_init(unsigned long mempages)
+void __init files_init(void)
{
- unsigned long n;
-
filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+ percpu_counter_init(&nr_files, 0, GFP_KERNEL);
+}
- /*
- * One file with associated inode and dcache is very roughly 1K.
- * Per default don't use more than 10% of our memory for files.
- */
+/*
+ * One file with associated inode and dcache is very roughly 1K. Per default
+ * do not use more than 10% of our memory for files.
+ */
+void __init files_maxfiles_init(void)
+{
+ unsigned long n;
+ unsigned long memreserve = (totalram_pages - nr_free_pages()) * 3/2;
+
+ memreserve = min(memreserve, totalram_pages - 1);
+ n = ((totalram_pages - memreserve) * (PAGE_SIZE / 1024)) / 10;
- n = (mempages * (PAGE_SIZE / 1024)) / 10;
files_stat.max_files = max_t(unsigned long, n, NR_FILE);
- percpu_counter_init(&nr_files, 0, GFP_KERNEL);
}
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index d3fa6bd9503e..221719eac5de 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -288,7 +288,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
page_cache_release(page);
goto fail;
}
- page_cache_release(page);
node->page[i] = page;
}
@@ -398,11 +397,11 @@ node_error:
void hfs_bnode_free(struct hfs_bnode *node)
{
- //int i;
+ int i;
- //for (i = 0; i < node->tree->pages_per_bnode; i++)
- // if (node->page[i])
- // page_cache_release(node->page[i]);
+ for (i = 0; i < node->tree->pages_per_bnode; i++)
+ if (node->page[i])
+ page_cache_release(node->page[i]);
kfree(node);
}
diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c
index 9f4ee7f52026..6fc766df0461 100644
--- a/fs/hfs/brec.c
+++ b/fs/hfs/brec.c
@@ -131,13 +131,16 @@ skip:
hfs_bnode_write(node, entry, data_off + key_len, entry_len);
hfs_bnode_dump(node);
- if (new_node) {
- /* update parent key if we inserted a key
- * at the start of the first node
- */
- if (!rec && new_node != node)
- hfs_brec_update_parent(fd);
+ /*
+ * update parent key if we inserted a key
+ * at the start of the node and it is not the new node
+ */
+ if (!rec && new_node != node) {
+ hfs_bnode_read_key(node, fd->search_key, data_off + size);
+ hfs_brec_update_parent(fd);
+ }
+ if (new_node) {
hfs_bnode_put(fd->bnode);
if (!new_node->parent) {
hfs_btree_inc_height(tree);
@@ -166,9 +169,6 @@ skip:
goto again;
}
- if (!rec)
- hfs_brec_update_parent(fd);
-
return 0;
}
@@ -366,6 +366,8 @@ again:
if (IS_ERR(parent))
return PTR_ERR(parent);
__hfs_brec_find(parent, fd);
+ if (fd->record < 0)
+ return -ENOENT;
hfs_bnode_dump(parent);
rec = fd->record;
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 759708fd9331..63924662aaf3 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -454,7 +454,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
page_cache_release(page);
goto fail;
}
- page_cache_release(page);
node->page[i] = page;
}
@@ -566,13 +565,11 @@ node_error:
void hfs_bnode_free(struct hfs_bnode *node)
{
-#if 0
int i;
for (i = 0; i < node->tree->pages_per_bnode; i++)
if (node->page[i])
page_cache_release(node->page[i]);
-#endif
kfree(node);
}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 0cf74df68617..d977cae89d29 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -12,6 +12,7 @@
#include <linux/thread_info.h>
#include <asm/current.h>
#include <linux/sched.h> /* remove ASAP */
+#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/file.h>
@@ -293,26 +294,61 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
return -EINVAL;
}
-static void truncate_huge_page(struct page *page)
+static void remove_huge_page(struct page *page)
{
ClearPageDirty(page);
ClearPageUptodate(page);
delete_from_page_cache(page);
}
-static void truncate_hugepages(struct inode *inode, loff_t lstart)
+
+/*
+ * remove_inode_hugepages handles two distinct cases: truncation and hole
+ * punch. There are subtle differences in operation for each case.
+ *
+ * Truncation is indicated by the end of range being LLONG_MAX.
+ * In this case, we first scan the range and release found pages.
+ * After releasing pages, hugetlb_unreserve_pages cleans up region/reserve
+ * maps and global counts.
+ * Hole punch is indicated if end is not LLONG_MAX.
+ * In the hole punch case we scan the range and release found pages.
+ * Only when releasing a page is the associated region/reserve map
+ * deleted. The region/reserve map for ranges without associated
+ * pages is not modified.
+ * Note: If the passed end of range value is beyond the end of file, but
+ * not LLONG_MAX, this routine still performs a hole punch operation.
+ */
+static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
+ loff_t lend)
{
struct hstate *h = hstate_inode(inode);
struct address_space *mapping = &inode->i_data;
const pgoff_t start = lstart >> huge_page_shift(h);
+ const pgoff_t end = lend >> huge_page_shift(h);
+ struct vm_area_struct pseudo_vma;
struct pagevec pvec;
pgoff_t next;
int i, freed = 0;
+ long lookup_nr = PAGEVEC_SIZE;
+ bool truncate_op = (lend == LLONG_MAX);
+ memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
+ pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
pagevec_init(&pvec, 0);
next = start;
- while (1) {
- if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+ while (next < end) {
+ /*
+ * Make sure to never grab more pages than we
+ * might possibly need.
+ */
+ if (end - next < lookup_nr)
+ lookup_nr = end - next;
+
+ /*
+ * This pagevec_lookup() may return pages past 'end',
+ * so we must check for page->index >= end.
+ */
+ if (!pagevec_lookup(&pvec, mapping, next, lookup_nr)) {
if (next == start)
break;
next = start;
@@ -321,26 +357,69 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
for (i = 0; i < pagevec_count(&pvec); ++i) {
struct page *page = pvec.pages[i];
+ u32 hash;
+
+ hash = hugetlb_fault_mutex_hash(h, current->mm,
+ &pseudo_vma,
+ mapping, next, 0);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
lock_page(page);
+ if (page->index >= end) {
+ unlock_page(page);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ next = end; /* we are done */
+ break;
+ }
+
+ /*
+ * If page is mapped, it was faulted in after being
+ * unmapped. Do nothing in this race case. In the
+ * normal case page is not mapped.
+ */
+ if (!page_mapped(page)) {
+ bool rsv_on_error = !PagePrivate(page);
+ /*
+ * We must free the huge page and remove
+ * from page cache (remove_huge_page) BEFORE
+ * removing the region/reserve map
+ * (hugetlb_unreserve_pages). In rare out
+ * of memory conditions, removal of the
+ * region/reserve map could fail. Before
+ * free'ing the page, note PagePrivate which
+ * is used in case of error.
+ */
+ remove_huge_page(page);
+ freed++;
+ if (!truncate_op) {
+ if (unlikely(hugetlb_unreserve_pages(
+ inode, next,
+ next + 1, 1)))
+ hugetlb_fix_reserve_counts(
+ inode, rsv_on_error);
+ }
+ }
+
if (page->index > next)
next = page->index;
+
++next;
- truncate_huge_page(page);
unlock_page(page);
- freed++;
+
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
}
huge_pagevec_release(&pvec);
}
- BUG_ON(!lstart && mapping->nrpages);
- hugetlb_unreserve_pages(inode, start, freed);
+
+ if (truncate_op)
+ (void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
}
static void hugetlbfs_evict_inode(struct inode *inode)
{
struct resv_map *resv_map;
- truncate_hugepages(inode, 0);
+ remove_inode_hugepages(inode, 0, LLONG_MAX);
resv_map = (struct resv_map *)inode->i_mapping->private_data;
/* root inode doesn't have the resv_map, so we should check it */
if (resv_map)
@@ -349,11 +428,15 @@ static void hugetlbfs_evict_inode(struct inode *inode)
}
static inline void
-hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
+hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
{
struct vm_area_struct *vma;
- vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) {
+ /*
+ * end == 0 indicates that the entire range after
+ * start should be unmapped.
+ */
+ vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
unsigned long v_offset;
/*
@@ -362,13 +445,20 @@ hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
* which overlap the truncated area starting at pgoff,
* and no vma on a 32-bit arch can span beyond the 4GB.
*/
- if (vma->vm_pgoff < pgoff)
- v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT;
+ if (vma->vm_pgoff < start)
+ v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
else
v_offset = 0;
- unmap_hugepage_range(vma, vma->vm_start + v_offset,
- vma->vm_end, NULL);
+ if (end) {
+ end = ((end - start) << PAGE_SHIFT) +
+ vma->vm_start + v_offset;
+ if (end > vma->vm_end)
+ end = vma->vm_end;
+ } else
+ end = vma->vm_end;
+
+ unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
}
}
@@ -384,12 +474,166 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
i_size_write(inode, offset);
i_mmap_lock_write(mapping);
if (!RB_EMPTY_ROOT(&mapping->i_mmap))
- hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
+ hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
i_mmap_unlock_write(mapping);
- truncate_hugepages(inode, offset);
+ remove_inode_hugepages(inode, offset, LLONG_MAX);
+ return 0;
+}
+
+/*
+ * hugetlbfs_punch_hole - remove the whole huge pages inside a byte range
+ *
+ * The range [@offset, @offset + @len) is shrunk to huge page boundaries;
+ * if nothing remains, no work is done.  Otherwise, with i_mutex held,
+ * hugetlb_vmdelete_list() is called for the range (under the i_mmap write
+ * lock) and then remove_inode_hugepages() is called for it.
+ * Always returns 0.
+ */
+static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+{
+ struct hstate *h = hstate_inode(inode);
+ loff_t hpage_size = huge_page_size(h);
+ loff_t hole_start, hole_end;
+
+ /*
+ * For hole punch round up the beginning offset of the hole and
+ * round down the end, so only whole huge pages are affected.
+ */
+ hole_start = round_up(offset, hpage_size);
+ hole_end = round_down(offset + len, hpage_size);
+
+ if (hole_end > hole_start) {
+ struct address_space *mapping = inode->i_mapping;
+
+ mutex_lock(&inode->i_mutex);
+ i_mmap_lock_write(mapping);
+ if (!RB_EMPTY_ROOT(&mapping->i_mmap))
+ hugetlb_vmdelete_list(&mapping->i_mmap,
+ hole_start >> PAGE_SHIFT,
+ hole_end >> PAGE_SHIFT);
+ i_mmap_unlock_write(mapping);
+ remove_inode_hugepages(inode, hole_start, hole_end);
+ mutex_unlock(&inode->i_mutex);
+ }
+
 return 0;
 }
+/*
+ * hugetlbfs_fallocate - fallocate file operation for hugetlbfs
+ *
+ * Only FALLOC_FL_KEEP_SIZE and FALLOC_FL_PUNCH_HOLE are accepted; any other
+ * mode bit yields -EOPNOTSUPP.  Hole punch is delegated to
+ * hugetlbfs_punch_hole().  Otherwise every huge page index covering
+ * [@offset, @offset + @len) that is not already in the page cache gets a
+ * newly allocated, cleared huge page added to the page cache.
+ *
+ * Returns 0 on success, -EINTR if a signal is pending, or the error from
+ * inode_newsize_ok(), alloc_huge_page() or huge_add_to_page_cache().
+ */
+static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ struct address_space *mapping = inode->i_mapping;
+ struct hstate *h = hstate_inode(inode);
+ struct vm_area_struct pseudo_vma;
+ struct mm_struct *mm = current->mm;
+ loff_t hpage_size = huge_page_size(h);
+ unsigned long hpage_shift = huge_page_shift(h);
+ pgoff_t start, index, end;
+ int error;
+ u32 hash;
+
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ return -EOPNOTSUPP;
+
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ return hugetlbfs_punch_hole(inode, offset, len);
+
+ /*
+ * Default preallocate case.
+ * For this range, start is rounded down and end is rounded up
+ * as well as being converted to huge page offsets.
+ */
+ start = offset >> hpage_shift;
+ end = (offset + len + hpage_size - 1) >> hpage_shift;
+
+ mutex_lock(&inode->i_mutex);
+
+ /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
+ error = inode_newsize_ok(inode, offset + len);
+ if (error)
+ goto out;
+
+ /*
+ * Initialize a pseudo vma that just contains the policy used
+ * when allocating the huge pages. The actual policy field
+ * (vm_policy) is determined based on the index in the loop below.
+ */
+ memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
+ pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
+ pseudo_vma.vm_file = file;
+
+ for (index = start; index < end; index++) {
+ /*
+ * addr (set below) is supposed to be the vaddr where the
+ * page is being faulted in, but we have no vaddr here.
+ */
+ struct page *page;
+ unsigned long addr;
+ int avoid_reserve = 0;
+
+ cond_resched();
+
+ /*
+ * fallocate(2) manpage permits EINTR; we may have been
+ * interrupted because we are using up too much memory.
+ * NOTE(review): this leaves the loop but still runs the
+ * i_size/ctime update after it - confirm that is intended.
+ */
+ if (signal_pending(current)) {
+ error = -EINTR;
+ break;
+ }
+
+ /* Get policy based on index */
+ pseudo_vma.vm_policy =
+ mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy,
+ index);
+
+ /* addr is the offset within the file (zero based) */
+ addr = index * hpage_size;
+
+ /* The same mutex is taken here, in the fault path and in hole punch */
+ hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
+ index, addr);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+ /* See if already present in mapping to avoid alloc/free */
+ page = find_get_page(mapping, index);
+ if (page) {
+ put_page(page);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ mpol_cond_put(pseudo_vma.vm_policy);
+ continue;
+ }
+
+ /* Allocate page and add to page cache */
+ page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve);
+ mpol_cond_put(pseudo_vma.vm_policy);
+ if (IS_ERR(page)) {
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ error = PTR_ERR(page);
+ goto out;
+ }
+ clear_huge_page(page, addr, pages_per_huge_page(h));
+ __SetPageUptodate(page);
+ error = huge_add_to_page_cache(page, mapping, index);
+ if (unlikely(error)) {
+ put_page(page);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ goto out;
+ }
+
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+
+ /*
+ * put_page due to reference from alloc_huge_page()
+ * unlock_page because locked by add_to_page_cache()
+ */
+ put_page(page);
+ unlock_page(page);
+ }
+
+ /* Grow i_size only when the caller did not ask to keep it. */
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
+ i_size_write(inode, offset + len);
+ inode->i_ctime = CURRENT_TIME;
+ /* NOTE(review): why i_private is cleared here is not evident from this function - confirm */
+ spin_lock(&inode->i_lock);
+ inode->i_private = NULL;
+ spin_unlock(&inode->i_lock);
+out:
+ mutex_unlock(&inode->i_mutex);
+ return error;
+}
+
static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
@@ -701,7 +945,8 @@ const struct file_operations hugetlbfs_file_operations = {
.mmap = hugetlbfs_file_mmap,
.fsync = noop_fsync,
.get_unmapped_area = hugetlb_get_unmapped_area,
- .llseek = default_llseek,
+ .llseek = default_llseek,
+ .fallocate = hugetlbfs_fallocate,
};
static const struct inode_operations hugetlbfs_dir_inode_operations = {
@@ -1010,6 +1255,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0);
if (!inode)
goto out_dentry;
+ if (creat_flags == HUGETLB_SHMFS_INODE)
+ inode->i_flags |= S_PRIVATE;
file = ERR_PTR(-ENOMEM);
if (hugetlb_reserve_pages(inode, 0,
diff --git a/fs/inode.c b/fs/inode.c
index d30640f7a193..e560535706ff 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1828,7 +1828,7 @@ static int __init set_ihash_entries(char *str)
{
if (!str)
return 0;
- ihash_entries = simple_strtoul(str, &str, 0);
+ parse_integer(str, 0, &ihash_entries);
return 1;
}
__setup("ihash_entries=", set_ihash_entries);
diff --git a/fs/libfs.c b/fs/libfs.c
index 102edfd39000..0b01e7c4dd83 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -747,7 +747,6 @@ struct simple_attr {
int (*get)(void *, u64 *);
int (*set)(void *, u64);
char get_buf[24]; /* enough to store a u64 and "\n\0" */
- char set_buf[24];
void *data;
const char *fmt; /* format for read operation */
struct mutex mutex; /* protects access to these buffers */
@@ -825,31 +824,26 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf,
size_t len, loff_t *ppos)
{
struct simple_attr *attr;
- u64 val;
- size_t size;
- ssize_t ret;
+ s64 val;
+ int ret;
attr = file->private_data;
if (!attr->set)
return -EACCES;
+ ret = kstrtos64_from_user(buf, len, 0, &val);
+ if (ret < 0)
+ return ret;
+
ret = mutex_lock_interruptible(&attr->mutex);
if (ret)
return ret;
-
- ret = -EFAULT;
- size = min(sizeof(attr->set_buf) - 1, len);
- if (copy_from_user(attr->set_buf, buf, size))
- goto out;
-
- attr->set_buf[size] = '\0';
- val = simple_strtoll(attr->set_buf, NULL, 0);
ret = attr->set(attr->data, val);
- if (ret == 0)
- ret = len; /* on success, claim we got the whole input */
-out:
mutex_unlock(&attr->mutex);
- return ret;
+ if (ret < 0)
+ return ret;
+ /* on success, claim we got the whole input */
+ return len;
}
EXPORT_SYMBOL_GPL(simple_attr_write);
diff --git a/fs/mpage.c b/fs/mpage.c
index ca0244b69de8..dde689d0759d 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -482,6 +482,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
struct buffer_head map_bh;
loff_t i_size = i_size_read(inode);
int ret = 0;
+ int wr = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
if (page_has_buffers(page)) {
struct buffer_head *head = page_buffers(page);
@@ -590,7 +591,7 @@ page_is_mapped:
* This page will go to BIO. Do we need to send this BIO off first?
*/
if (bio && mpd->last_block_in_bio != blocks[0] - 1)
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
alloc_new:
if (bio == NULL) {
@@ -617,7 +618,7 @@ alloc_new:
wbc_account_io(wbc, page, PAGE_SIZE);
length = first_unmapped << blkbits;
if (bio_add_page(bio, page, length, 0) < length) {
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
goto alloc_new;
}
@@ -627,7 +628,7 @@ alloc_new:
set_page_writeback(page);
unlock_page(page);
if (boundary || (first_unmapped != blocks_per_page)) {
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
if (boundary_block) {
write_boundary_block(boundary_bdev,
boundary_block, 1 << blkbits);
@@ -639,7 +640,7 @@ alloc_new:
confused:
if (bio)
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
if (mpd->use_writepage) {
ret = mapping->a_ops->writepage(page, wbc);
@@ -695,8 +696,11 @@ mpage_writepages(struct address_space *mapping,
};
ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
- if (mpd.bio)
- mpage_bio_submit(WRITE, mpd.bio);
+ if (mpd.bio) {
+ int wr = (wbc->sync_mode == WB_SYNC_ALL ?
+ WRITE_SYNC : WRITE);
+ mpage_bio_submit(wr, mpd.bio);
+ }
}
blk_finish_plug(&plug);
return ret;
@@ -713,8 +717,11 @@ int mpage_writepage(struct page *page, get_block_t get_block,
.use_writepage = 0,
};
int ret = __mpage_writepage(page, wbc, &mpd);
- if (mpd.bio)
- mpage_bio_submit(WRITE, mpd.bio);
+ if (mpd.bio) {
+ int wr = (wbc->sync_mode == WB_SYNC_ALL ?
+ WRITE_SYNC : WRITE);
+ mpage_bio_submit(wr, mpd.bio);
+ }
return ret;
}
EXPORT_SYMBOL(mpage_writepage);
diff --git a/fs/namespace.c b/fs/namespace.c
index 0570729c87fd..68d7c07c3c2e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -37,7 +37,7 @@ static int __init set_mhash_entries(char *str)
{
if (!str)
return 0;
- mhash_entries = simple_strtoul(str, &str, 0);
+ parse_integer(str, 0, &mhash_entries);
return 1;
}
__setup("mhash_entries=", set_mhash_entries);
@@ -47,7 +47,7 @@ static int __init set_mphash_entries(char *str)
{
if (!str)
return 0;
- mphash_entries = simple_strtoul(str, &str, 0);
+ parse_integer(str, 0, &mphash_entries);
return 1;
}
__setup("mphash_entries=", set_mphash_entries);
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 58b7cdb63da9..6b6f0d472ae8 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -76,7 +76,8 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
struct inotify_inode_mark *inode_mark;
struct inode *inode;
- if (!(mark->flags & (FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_INODE)))
+ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE) ||
+ !(mark->flags & FSNOTIFY_MARK_FLAG_INODE))
return;
inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index dd3fb0b17be7..d675e76251d3 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -205,6 +205,16 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
mnt = NULL;
/*
+ * Optimization: srcu_read_lock() has a memory barrier which can
+ * be expensive. It protects walking the *_fsnotify_marks lists.
+ * However, if we do not walk the lists, we do not have to do
+ * SRCU because we have no references to any objects and do not
+ * need SRCU to keep them "alive".
+ */
+ if (hlist_empty(&to_tell->i_fsnotify_marks) &&
+ (!mnt || hlist_empty(&mnt->mnt_fsnotify_marks)))
+ return 0;
+ /*
* if this is a modify event we may need to clear the ignored masks
* otherwise return if neither the inode nor the vfsmount care about
* this type of event.
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 92e48c70f0f0..39ddcaf0918f 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -412,16 +412,36 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
unsigned int flags)
{
struct fsnotify_mark *lmark, *mark;
+ LIST_HEAD(to_free);
+ /*
+ * We have to be really careful here. Any time we drop mark_mutex,
+ * e.g. fsnotify_clear_marks_by_inode() can come and free marks, even
+ * ones on our to_free list, so we must hold mark_mutex even when
+ * accessing that list. Freeing a mark requires us to drop mark_mutex,
+ * so we can reliably free only the first mark in the list. That's why
+ * we first move the marks to be freed onto the to_free list in one go
+ * and then free the marks on the to_free list one by one.
+ */
mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
- if (mark->flags & flags) {
- fsnotify_get_mark(mark);
- fsnotify_destroy_mark_locked(mark, group);
- fsnotify_put_mark(mark);
- }
+ if (mark->flags & flags)
+ list_move(&mark->g_list, &to_free);
}
mutex_unlock(&group->mark_mutex);
+
+ while (1) {
+ mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
+ if (list_empty(&to_free)) {
+ mutex_unlock(&group->mark_mutex);
+ break;
+ }
+ mark = list_first_entry(&to_free, struct fsnotify_mark, g_list);
+ fsnotify_get_mark(mark);
+ fsnotify_destroy_mark_locked(mark, group);
+ mutex_unlock(&group->mark_mutex);
+ fsnotify_put_mark(mark);
+ }
}
/*
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 9e1e112074fb..99503710d4bd 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2204,17 +2204,12 @@ get_ctx_vol_failed:
return true;
#ifdef NTFS_RW
iput_usnjrnl_err_out:
- if (vol->usnjrnl_j_ino)
- iput(vol->usnjrnl_j_ino);
- if (vol->usnjrnl_max_ino)
- iput(vol->usnjrnl_max_ino);
- if (vol->usnjrnl_ino)
- iput(vol->usnjrnl_ino);
+ iput(vol->usnjrnl_j_ino);
+ iput(vol->usnjrnl_max_ino);
+ iput(vol->usnjrnl_ino);
iput_quota_err_out:
- if (vol->quota_q_ino)
- iput(vol->quota_q_ino);
- if (vol->quota_ino)
- iput(vol->quota_ino);
+ iput(vol->quota_q_ino);
+ iput(vol->quota_ino);
iput(vol->extend_ino);
#endif /* NTFS_RW */
iput_sec_err_out:
@@ -2223,8 +2218,7 @@ iput_root_err_out:
iput(vol->root_ino);
iput_logfile_err_out:
#ifdef NTFS_RW
- if (vol->logfile_ino)
- iput(vol->logfile_ino);
+ iput(vol->logfile_ino);
iput_vol_err_out:
#endif /* NTFS_RW */
iput(vol->vol_ino);
@@ -2254,8 +2248,7 @@ iput_mftbmp_err_out:
iput(vol->mftbmp_ino);
iput_mirr_err_out:
#ifdef NTFS_RW
- if (vol->mftmirr_ino)
- iput(vol->mftmirr_ino);
+ iput(vol->mftmirr_ino);
#endif /* NTFS_RW */
return false;
}
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index ce210d4951a1..e27e6527912b 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -41,7 +41,8 @@ ocfs2-objs := \
quota_local.o \
quota_global.o \
xattr.o \
- acl.o
+ acl.o \
+ filecheck.o
ocfs2_stackglue-objs := stackglue.o
ocfs2_stack_o2cb-objs := stack_o2cb.o
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index c58a1bcfda0f..0cdf497c91ef 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -284,7 +284,19 @@ int ocfs2_set_acl(handle_t *handle,
int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
- return ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
+ struct buffer_head *bh = NULL;
+ int status = 0;
+
+ status = ocfs2_inode_lock(inode, &bh, 1);
+ if (status < 0) {
+ if (status != -ENOENT)
+ mlog_errno(status);
+ return status;
+ }
+ status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL);
+ ocfs2_inode_unlock(inode, 1);
+ brelse(bh);
+ return status;
}
struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
@@ -292,19 +304,21 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
struct ocfs2_super *osb;
struct buffer_head *di_bh = NULL;
struct posix_acl *acl;
- int ret = -EAGAIN;
+ int ret;
osb = OCFS2_SB(inode->i_sb);
if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
return NULL;
-
- ret = ocfs2_read_inode_block(inode, &di_bh);
- if (ret < 0)
+ ret = ocfs2_inode_lock(inode, &di_bh, 0);
+ if (ret < 0) {
+ if (ret != -ENOENT)
+ mlog_errno(ret);
return ERR_PTR(ret);
+ }
acl = ocfs2_get_acl_nolock(inode, type, di_bh);
+ ocfs2_inode_unlock(inode, 0);
brelse(di_bh);
-
return acl;
}
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 5997c00a1515..0afb4cb7ce1b 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -908,32 +908,30 @@ static int ocfs2_validate_extent_block(struct super_block *sb,
*/
if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- ocfs2_error(sb,
- "Extent block #%llu has bad signature %.*s",
- (unsigned long long)bh->b_blocknr, 7,
- eb->h_signature);
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Extent block #%llu has bad signature %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ eb->h_signature);
+ goto bail;
}
if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
- ocfs2_error(sb,
- "Extent block #%llu has an invalid h_blkno "
- "of %llu",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)le64_to_cpu(eb->h_blkno));
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Extent block #%llu has an invalid h_blkno of %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(eb->h_blkno));
+ goto bail;
}
if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
- ocfs2_error(sb,
- "Extent block #%llu has an invalid "
- "h_fs_generation of #%u",
- (unsigned long long)bh->b_blocknr,
- le32_to_cpu(eb->h_fs_generation));
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Extent block #%llu has an invalid h_fs_generation of #%u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(eb->h_fs_generation));
+ goto bail;
}
-
- return 0;
+bail:
+ return rc;
}
int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
@@ -1446,8 +1444,7 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
while(le16_to_cpu(el->l_tree_depth) > 1) {
if (le16_to_cpu(el->l_next_free_rec) == 0) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has empty "
- "extent list (next_free_rec == 0)",
+ "Owner %llu has empty extent list (next_free_rec == 0)\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
status = -EIO;
goto bail;
@@ -1456,9 +1453,7 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
blkno = le64_to_cpu(el->l_recs[i].e_blkno);
if (!blkno) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has extent "
- "list where extent # %d has no physical "
- "block start",
+ "Owner %llu has extent list where extent # %d has no physical block start\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
status = -EIO;
goto bail;
@@ -1788,8 +1783,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
while (el->l_tree_depth) {
if (le16_to_cpu(el->l_next_free_rec) == 0) {
ocfs2_error(ocfs2_metadata_cache_get_super(ci),
- "Owner %llu has empty extent list at "
- "depth %u\n",
+ "Owner %llu has empty extent list at depth %u\n",
(unsigned long long)ocfs2_metadata_cache_owner(ci),
le16_to_cpu(el->l_tree_depth));
ret = -EROFS;
@@ -1814,8 +1808,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
blkno = le64_to_cpu(el->l_recs[i].e_blkno);
if (blkno == 0) {
ocfs2_error(ocfs2_metadata_cache_get_super(ci),
- "Owner %llu has bad blkno in extent list "
- "at depth %u (index %d)\n",
+ "Owner %llu has bad blkno in extent list at depth %u (index %d)\n",
(unsigned long long)ocfs2_metadata_cache_owner(ci),
le16_to_cpu(el->l_tree_depth), i);
ret = -EROFS;
@@ -1836,8 +1829,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
if (le16_to_cpu(el->l_next_free_rec) >
le16_to_cpu(el->l_count)) {
ocfs2_error(ocfs2_metadata_cache_get_super(ci),
- "Owner %llu has bad count in extent list "
- "at block %llu (next free=%u, count=%u)\n",
+ "Owner %llu has bad count in extent list at block %llu (next free=%u, count=%u)\n",
(unsigned long long)ocfs2_metadata_cache_owner(ci),
(unsigned long long)bh->b_blocknr,
le16_to_cpu(el->l_next_free_rec),
@@ -2116,8 +2108,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle,
if (left_el->l_next_free_rec != left_el->l_count) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Inode %llu has non-full interior leaf node %llu"
- "(next free = %u)",
+ "Inode %llu has non-full interior leaf node %llu (next free = %u)\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
(unsigned long long)left_leaf_bh->b_blocknr,
le16_to_cpu(left_el->l_next_free_rec));
@@ -2256,8 +2247,7 @@ int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
* If we got here, we never found a valid node where
* the tree indicated one should be.
*/
- ocfs2_error(sb,
- "Invalid extent tree at extent block %llu\n",
+ ocfs2_error(sb, "Invalid extent tree at extent block %llu\n",
(unsigned long long)blkno);
ret = -EROFS;
goto out;
@@ -2526,21 +2516,6 @@ static int ocfs2_update_edge_lengths(handle_t *handle,
struct ocfs2_extent_block *eb;
u32 range;
- /*
- * In normal tree rotation process, we will never touch the
- * tree branch above subtree_index and ocfs2_extend_rotate_transaction
- * doesn't reserve the credits for them either.
- *
- * But we do have a special case here which will update the rightmost
- * records for all the bh in the path.
- * So we have to allocate extra credits and access them.
- */
- ret = ocfs2_extend_trans(handle, subtree_index);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
ret = ocfs2_journal_access_path(et->et_ci, handle, path);
if (ret) {
mlog_errno(ret);
@@ -2872,8 +2847,7 @@ int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
* If we got here, we never found a valid node where
* the tree indicated one should be.
*/
- ocfs2_error(sb,
- "Invalid extent tree at extent block %llu\n",
+ ocfs2_error(sb, "Invalid extent tree at extent block %llu\n",
(unsigned long long)blkno);
ret = -EROFS;
goto out;
@@ -2967,7 +2941,7 @@ static int __ocfs2_rotate_tree_left(handle_t *handle,
right_path->p_node[subtree_root].bh->b_blocknr,
right_path->p_tree_depth);
- ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
orig_credits, left_path);
if (ret) {
mlog_errno(ret);
@@ -3040,21 +3014,9 @@ static int ocfs2_remove_rightmost_path(handle_t *handle,
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
-
ret = ocfs2_et_sanity_check(et);
if (ret)
goto out;
- /*
- * There's two ways we handle this depending on
- * whether path is the only existing one.
- */
- ret = ocfs2_extend_rotate_transaction(handle, 0,
- handle->h_buffer_credits,
- path);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
ret = ocfs2_journal_access_path(et->et_ci, handle, path);
if (ret) {
@@ -3131,6 +3093,30 @@ out:
return ret;
}
+static int ocfs2_remove_rightmost_empty_extent(struct ocfs2_super *osb,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_path *path,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ handle_t *handle;
+ int ret;
+ int credits = path->p_tree_depth * 2 + 1;
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ return ret;
+ }
+
+ ret = ocfs2_remove_rightmost_path(handle, et, path, dealloc);
+ if (ret)
+ mlog_errno(ret);
+
+ ocfs2_commit_trans(osb, handle);
+ return ret;
+}
+
/*
* Left rotation of btree records.
*
@@ -3200,7 +3186,7 @@ rightmost_no_delete:
if (le16_to_cpu(el->l_next_free_rec) == 0) {
ret = -EIO;
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has empty extent block at %llu",
+ "Owner %llu has empty extent block at %llu\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
(unsigned long long)le64_to_cpu(eb->h_blkno));
goto out;
@@ -3628,6 +3614,14 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
*/
if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
le16_to_cpu(el->l_next_free_rec) == 1) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ right_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
ret = ocfs2_remove_rightmost_path(handle, et,
right_path,
@@ -3666,6 +3660,14 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
/*
* The merge code will need to create an empty
* extent to take the place of the newly
@@ -3714,6 +3716,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
*/
BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
/* The merge left us with an empty extent, remove it. */
ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
if (ret) {
@@ -3735,6 +3746,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
goto out;
}
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
/*
* Error from this last rotate is not critical, so
@@ -3770,6 +3790,16 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
}
if (ctxt->c_split_covers_rec) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ ret = 0;
+ goto out;
+ }
+
/*
* The merge may have left an empty extent in
* our leaf. Try to rotate it away.
@@ -3930,7 +3960,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle,
next_free = le16_to_cpu(el->l_next_free_rec);
if (next_free == 0) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has a bad extent list",
+ "Owner %llu has a bad extent list\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
ret = -EIO;
return;
@@ -4355,10 +4385,7 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
bh = path_leaf_bh(left_path);
eb = (struct ocfs2_extent_block *)bh->b_data;
ocfs2_error(sb,
- "Extent block #%llu has an "
- "invalid l_next_free_rec of "
- "%d. It should have "
- "matched the l_count of %d",
+ "Extent block #%llu has an invalid l_next_free_rec of %d. It should have matched the l_count of %d\n",
(unsigned long long)le64_to_cpu(eb->h_blkno),
le16_to_cpu(new_el->l_next_free_rec),
le16_to_cpu(new_el->l_count));
@@ -4413,8 +4440,7 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
bh = path_leaf_bh(right_path);
eb = (struct ocfs2_extent_block *)bh->b_data;
ocfs2_error(sb,
- "Extent block #%llu has an "
- "invalid l_next_free_rec of %d",
+ "Extent block #%llu has an invalid l_next_free_rec of %d\n",
(unsigned long long)le64_to_cpu(eb->h_blkno),
le16_to_cpu(new_el->l_next_free_rec));
status = -EINVAL;
@@ -4970,10 +4996,9 @@ leftright:
split_index = ocfs2_search_extent_list(el, cpos);
if (split_index == -1) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has an extent at cpos %u "
- "which can no longer be found.\n",
- (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
- cpos);
+ "Owner %llu has an extent at cpos %u which can no longer be found\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ cpos);
ret = -EROFS;
goto out;
}
@@ -5158,10 +5183,9 @@ int ocfs2_change_extent_flag(handle_t *handle,
index = ocfs2_search_extent_list(el, cpos);
if (index == -1) {
ocfs2_error(sb,
- "Owner %llu has an extent at cpos %u which can no "
- "longer be found.\n",
- (unsigned long long)
- ocfs2_metadata_cache_owner(et->et_ci), cpos);
+ "Owner %llu has an extent at cpos %u which can no longer be found\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ cpos);
ret = -EROFS;
goto out;
}
@@ -5228,9 +5252,7 @@ int ocfs2_mark_extent_written(struct inode *inode,
cpos, len, phys);
if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
- ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
- "that are being written to, but the feature bit "
- "is not set in the super block.",
+ ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents that are being written to, but the feature bit is not set in the super block\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
ret = -EROFS;
goto out;
@@ -5337,6 +5359,15 @@ static int ocfs2_truncate_rec(handle_t *handle,
struct ocfs2_extent_block *eb;
if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
if (ret) {
mlog_errno(ret);
@@ -5514,8 +5545,7 @@ int ocfs2_remove_extent(handle_t *handle,
index = ocfs2_search_extent_list(el, cpos);
if (index == -1) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has an extent at cpos %u which can no "
- "longer be found.\n",
+ "Owner %llu has an extent at cpos %u which can no longer be found\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
cpos);
ret = -EROFS;
@@ -5580,7 +5610,7 @@ int ocfs2_remove_extent(handle_t *handle,
index = ocfs2_search_extent_list(el, cpos);
if (index == -1) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu: split at cpos %u lost record.",
+ "Owner %llu: split at cpos %u lost record\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
cpos);
ret = -EROFS;
@@ -5596,8 +5626,7 @@ int ocfs2_remove_extent(handle_t *handle,
ocfs2_rec_clusters(el, rec);
if (rec_range != trunc_range) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu: error after split at cpos %u"
- "trunc len %u, existing record is (%u,%u)",
+ "Owner %llu: error after split at cpos %u trunc len %u, existing record is (%u,%u)\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
cpos, len, le32_to_cpu(rec->e_cpos),
ocfs2_rec_clusters(el, rec));
@@ -5925,16 +5954,6 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
ocfs2_journal_dirty(handle, tl_bh);
- /* TODO: Perhaps we can calculate the bulk of the
- * credits up front rather than extending like
- * this. */
- status = ocfs2_extend_trans(handle,
- OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
rec = tl->tl_recs[i];
start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
le32_to_cpu(rec.t_start));
@@ -5955,6 +5974,13 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
goto bail;
}
}
+
+ status = ocfs2_extend_trans(handle,
+ OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
i--;
}
@@ -6013,7 +6039,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
goto out_mutex;
}
- handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
+ handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
@@ -7111,12 +7137,20 @@ start:
ocfs2_error(inode->i_sb, "Inode %lu has an empty "
"extent record, depth %u\n", inode->i_ino,
le16_to_cpu(root_el->l_tree_depth));
- status = -EROFS;
- goto bail;
+ status = ocfs2_remove_rightmost_empty_extent(osb,
+ &et, path, &dealloc);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ ocfs2_reinit_path(path, 1);
+ goto start;
+ } else {
+ trunc_cpos = le32_to_cpu(rec->e_cpos);
+ trunc_len = 0;
+ blkno = 0;
}
- trunc_cpos = le32_to_cpu(rec->e_cpos);
- trunc_len = 0;
- blkno = 0;
} else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) {
/*
* Truncate entire record.
@@ -7204,8 +7238,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
!ocfs2_supports_inline_data(osb)) {
ocfs2_error(inode->i_sb,
- "Inline data flags for inode %llu don't agree! "
- "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
+ "Inline data flags for inode %llu don't agree! Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
le16_to_cpu(di->i_dyn_features),
OCFS2_I(inode)->ip_dyn_features,
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 1a35c6139656..4ebc265fa66f 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -227,7 +227,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) {
- ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag",
+ ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
return -EROFS;
}
@@ -237,7 +237,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
if (size > PAGE_CACHE_SIZE ||
size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) {
ocfs2_error(inode->i_sb,
- "Inode %llu has with inline data has bad size: %Lu",
+ "Inode %llu has with inline data has bad size: %Lu\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)size);
return -EROFS;
@@ -533,10 +533,14 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
/* This figures out the size of the next contiguous block, and
* our logical offset */
ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
&contig_blocks, &ext_flags);
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
if (ret) {
mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
(unsigned long long)iblock);
@@ -557,6 +561,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
alloc_locked = 1;
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
/* fill hole, allocate blocks can't be larger than the size
* of the hole */
clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
@@ -569,6 +575,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
ret = ocfs2_extend_allocation(inode, cpos,
clusters_to_alloc, 0);
if (ret < 0) {
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
mlog_errno(ret);
goto bail;
}
@@ -576,11 +583,13 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
&contig_blocks, &ext_flags);
if (ret < 0) {
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
(unsigned long long)iblock);
ret = -EIO;
goto bail;
}
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
}
/*
@@ -627,10 +636,13 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
}
- ocfs2_iocb_clear_rw_locked(iocb);
+ /* Defer the rw unlock for appending direct io writes to keep them protected */
+ if (offset + bytes <= i_size_read(inode)) {
+ ocfs2_iocb_clear_rw_locked(iocb);
- level = ocfs2_iocb_rw_locked_level(iocb);
- ocfs2_rw_unlock(inode, level);
+ level = ocfs2_iocb_rw_locked_level(iocb);
+ ocfs2_rw_unlock(inode, level);
+ }
}
static int ocfs2_releasepage(struct page *page, gfp_t wait)
@@ -832,12 +844,17 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
/* zeroing out the previously allocated cluster tail
* that but not zeroed */
- if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+ if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
ret = ocfs2_direct_IO_zero_extend(osb, inode, offset,
zero_len_tail, cluster_align_tail);
- else
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
+ } else {
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
ret = ocfs2_direct_IO_extend_no_holes(osb, inode,
offset);
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
+ }
if (ret < 0) {
mlog_errno(ret);
ocfs2_inode_unlock(inode, 1);
@@ -857,7 +874,8 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
offset, ocfs2_direct_IO_get_blocks,
ocfs2_dio_end_io, NULL, 0);
- if (unlikely(written < 0)) {
+ /* overwrite aio may return -EIOCBQUEUED, and it is not an error */
+ if ((written < 0) && (written != -EIOCBQUEUED)) {
loff_t i_size = i_size_read(inode);
if (offset + count > i_size) {
@@ -876,12 +894,14 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
ocfs2_inode_unlock(inode, 1);
brelse(di_bh);
+ di_bh = NULL;
goto clean_orphan;
}
}
ocfs2_inode_unlock(inode, 1);
brelse(di_bh);
+ di_bh = NULL;
ret = jbd2_journal_force_commit(journal);
if (ret < 0)
@@ -936,10 +956,12 @@ clean_orphan:
if (tmp_ret < 0) {
ret = tmp_ret;
mlog_errno(ret);
+ brelse(di_bh);
goto out;
}
ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
tmp_ret = jbd2_journal_force_commit(journal);
if (tmp_ret < 0) {
@@ -2185,10 +2207,7 @@ try_again:
if (ret)
goto out_commit;
}
- /*
- * We don't want this to fail in ocfs2_write_end(), so do it
- * here.
- */
+
ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
@@ -2345,7 +2364,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
- int i;
+ int i, ret;
unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
struct inode *inode = mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -2354,6 +2373,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
handle_t *handle = wc->w_handle;
struct page *tmppage;
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ copied = ret;
+ mlog_errno(ret);
+ goto out;
+ }
+
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
ocfs2_write_end_inline(inode, pos, len, &copied, di, wc);
goto out_write_size;
@@ -2409,6 +2436,7 @@ out_write_size:
ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_journal_dirty(handle, wc->w_di_bh);
+out:
/* unlock pages before dealloc since it needs acquiring j_trans_barrier
* lock, or it will cause a deadlock since journal commit threads holds
* this lock and will ask for the page lock when flushing the data.
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 1edcb141f639..fe50ded1b4ce 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -316,6 +316,12 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
bh = bhs[i];
if (!(flags & OCFS2_BH_READAHEAD)) {
+ if (status) {
+ /* Clear the rest of the buffers on error */
+ put_bh(bh);
+ bhs[i] = NULL;
+ continue;
+ }
/* We know this can't have changed as we hold the
* owner sem. Avoid doing any work on the bh if the
* journal has it. */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 16eff45727ee..a20d490455b2 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -36,7 +36,7 @@
#include <linux/debugfs.h>
#include <linux/slab.h>
#include <linux/bitmap.h>
-
+#include <linux/ktime.h>
#include "heartbeat.h"
#include "tcp.h"
#include "nodemanager.h"
@@ -1061,37 +1061,6 @@ bail:
return ret;
}
-/* Subtract b from a, storing the result in a. a *must* have a larger
- * value than b. */
-static void o2hb_tv_subtract(struct timeval *a,
- struct timeval *b)
-{
- /* just return 0 when a is after b */
- if (a->tv_sec < b->tv_sec ||
- (a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec)) {
- a->tv_sec = 0;
- a->tv_usec = 0;
- return;
- }
-
- a->tv_sec -= b->tv_sec;
- a->tv_usec -= b->tv_usec;
- while ( a->tv_usec < 0 ) {
- a->tv_sec--;
- a->tv_usec += 1000000;
- }
-}
-
-static unsigned int o2hb_elapsed_msecs(struct timeval *start,
- struct timeval *end)
-{
- struct timeval res = *end;
-
- o2hb_tv_subtract(&res, start);
-
- return res.tv_sec * 1000 + res.tv_usec / 1000;
-}
-
/*
* we ride the region ref that the region dir holds. before the region
* dir is removed and drops it ref it will wait to tear down this
@@ -1102,7 +1071,7 @@ static int o2hb_thread(void *data)
int i, ret;
struct o2hb_region *reg = data;
struct o2hb_bio_wait_ctxt write_wc;
- struct timeval before_hb, after_hb;
+ ktime_t before_hb, after_hb;
unsigned int elapsed_msec;
mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n");
@@ -1119,18 +1088,18 @@ static int o2hb_thread(void *data)
* hr_timeout_ms between disk writes. On busy systems
* this should result in a heartbeat which is less
* likely to time itself out. */
- do_gettimeofday(&before_hb);
+ before_hb = ktime_get_real();
ret = o2hb_do_disk_heartbeat(reg);
- do_gettimeofday(&after_hb);
- elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
+ after_hb = ktime_get_real();
+
+ elapsed_msec = (unsigned int)
+ ktime_ms_delta(after_hb, before_hb);
mlog(ML_HEARTBEAT,
- "start = %lu.%lu, end = %lu.%lu, msec = %u, ret = %d\n",
- before_hb.tv_sec, (unsigned long) before_hb.tv_usec,
- after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
- elapsed_msec, ret);
+ "start = %lld, end = %lld, msec = %u, ret = %d\n",
+ before_hb.tv64, after_hb.tv64, elapsed_msec, ret);
if (!kthread_should_stop() &&
elapsed_msec < reg->hr_timeout_ms) {
@@ -1484,13 +1453,12 @@ static int o2hb_read_block_input(struct o2hb_region *reg,
unsigned long *ret_bytes,
unsigned int *ret_bits)
{
- unsigned long bytes;
- char *p = (char *)page;
-
- bytes = simple_strtoul(p, &p, 0);
- if (!p || (*p && (*p != '\n')))
- return -EINVAL;
+ unsigned int bytes;
+ int rv;
+ rv = kstrtouint(page, 0, &bytes);
+ if (rv < 0)
+ return rv;
/* Heartbeat and fs min / max block sizes are the same. */
if (bytes > 4096 || bytes < 512)
return -ERANGE;
@@ -1543,18 +1511,14 @@ static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg,
const char *page,
size_t count)
{
- unsigned long long tmp;
- char *p = (char *)page;
+ int rv;
if (reg->hr_bdev)
return -EINVAL;
- tmp = simple_strtoull(p, &p, 0);
- if (!p || (*p && (*p != '\n')))
- return -EINVAL;
-
- reg->hr_start_block = tmp;
-
+ rv = kstrtoull(page, 0, &reg->hr_start_block);
+ if (rv < 0)
+ return rv;
return count;
}
@@ -1568,20 +1532,19 @@ static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg,
const char *page,
size_t count)
{
- unsigned long tmp;
- char *p = (char *)page;
+ unsigned int tmp;
+ int rv;
if (reg->hr_bdev)
return -EINVAL;
- tmp = simple_strtoul(p, &p, 0);
- if (!p || (*p && (*p != '\n')))
- return -EINVAL;
-
+ rv = kstrtouint(page, 0, &tmp);
+ if (rv < 0)
+ return rv;
if (tmp > O2NM_MAX_NODES || tmp == 0)
return -ERANGE;
- reg->hr_blocks = (unsigned int)tmp;
+ reg->hr_blocks = tmp;
return count;
}
@@ -1620,17 +1583,13 @@ static int o2hb_map_slot_data(struct o2hb_region *reg)
struct o2hb_disk_slot *slot;
reg->hr_tmp_block = kmalloc(reg->hr_block_bytes, GFP_KERNEL);
- if (reg->hr_tmp_block == NULL) {
- mlog_errno(-ENOMEM);
+ if (reg->hr_tmp_block == NULL)
return -ENOMEM;
- }
reg->hr_slots = kcalloc(reg->hr_blocks,
sizeof(struct o2hb_disk_slot), GFP_KERNEL);
- if (reg->hr_slots == NULL) {
- mlog_errno(-ENOMEM);
+ if (reg->hr_slots == NULL)
return -ENOMEM;
- }
for(i = 0; i < reg->hr_blocks; i++) {
slot = &reg->hr_slots[i];
@@ -1646,17 +1605,13 @@ static int o2hb_map_slot_data(struct o2hb_region *reg)
reg->hr_slot_data = kcalloc(reg->hr_num_pages, sizeof(struct page *),
GFP_KERNEL);
- if (!reg->hr_slot_data) {
- mlog_errno(-ENOMEM);
+ if (!reg->hr_slot_data)
return -ENOMEM;
- }
for(i = 0; i < reg->hr_num_pages; i++) {
page = alloc_page(GFP_KERNEL);
- if (!page) {
- mlog_errno(-ENOMEM);
+ if (!page)
return -ENOMEM;
- }
reg->hr_slot_data[i] = page;
@@ -1688,10 +1643,8 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg)
struct o2hb_disk_heartbeat_block *hb_block;
ret = o2hb_read_slots(reg, reg->hr_blocks);
- if (ret) {
- mlog_errno(ret);
+ if (ret)
goto out;
- }
/* We only want to get an idea of the values initially in each
* slot, so we do no verification - o2hb_check_slot will
@@ -1717,9 +1670,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
size_t count)
{
struct task_struct *hb_task;
- long fd;
+ int fd;
int sectsize;
- char *p = (char *)page;
struct fd f;
struct inode *inode;
ssize_t ret = -EINVAL;
@@ -1733,10 +1685,9 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
if (o2nm_this_node() == O2NM_MAX_NODES)
goto out;
- fd = simple_strtol(p, &p, 0);
- if (!p || (*p && (*p != '\n')))
+ ret = kstrtoint(page, 0, &fd);
+ if (ret < 0)
goto out;
-
if (fd < 0 || fd >= INT_MAX)
goto out;
@@ -2210,12 +2161,12 @@ static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group
const char *page,
size_t count)
{
- unsigned long tmp;
- char *p = (char *)page;
+ unsigned int tmp;
+ int rv;
- tmp = simple_strtoul(p, &p, 10);
- if (!p || (*p && (*p != '\n')))
- return -EINVAL;
+ rv = kstrtouint(page, 10, &tmp);
+ if (rv < 0)
+ return rv;
/* this will validate ranges for us. */
o2hb_dead_threshold_set((unsigned int) tmp);
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 441c84e169e6..0381ada38534 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -195,13 +195,12 @@ static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page,
size_t count)
{
struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
- unsigned long tmp;
- char *p = (char *)page;
-
- tmp = simple_strtoul(p, &p, 0);
- if (!p || (*p && (*p != '\n')))
- return -EINVAL;
+ unsigned int tmp;
+ int rv;
+ rv = parse_integer(page, 0, &tmp);
+ if (rv < 0)
+ return rv;
if (tmp >= O2NM_MAX_NODES)
return -ERANGE;
@@ -215,16 +214,15 @@ static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page,
write_lock(&cluster->cl_nodes_lock);
if (cluster->cl_nodes[tmp])
- p = NULL;
+ rv = -EEXIST;
else {
cluster->cl_nodes[tmp] = node;
node->nd_num = tmp;
set_bit(tmp, cluster->cl_nodes_bitmap);
}
write_unlock(&cluster->cl_nodes_lock);
- if (p == NULL)
- return -EEXIST;
-
+ if (rv < 0)
+ return rv;
return count;
}
static ssize_t o2nm_node_ipv4_port_read(struct o2nm_node *node, char *page)
@@ -235,13 +233,12 @@ static ssize_t o2nm_node_ipv4_port_read(struct o2nm_node *node, char *page)
static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node,
const char *page, size_t count)
{
- unsigned long tmp;
- char *p = (char *)page;
-
- tmp = simple_strtoul(p, &p, 0);
- if (!p || (*p && (*p != '\n')))
- return -EINVAL;
+ u16 tmp;
+ int rv;
+ rv = kstrtou16(page, 0, &tmp);
+ if (rv < 0)
+ return rv;
if (tmp == 0)
return -EINVAL;
if (tmp >= (u16)-1)
@@ -305,13 +302,11 @@ static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page,
{
struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
unsigned long tmp;
- char *p = (char *)page;
ssize_t ret;
- tmp = simple_strtoul(p, &p, 0);
- if (!p || (*p && (*p != '\n')))
- return -EINVAL;
-
+ ret = kstrtoul(page, 0, &tmp);
+ if (ret < 0)
+ return ret;
tmp = !!tmp; /* boolean of whether this node wants to be local */
/* setting local turns on networking rx for now so we require having
@@ -484,16 +479,15 @@ struct o2nm_cluster_attribute {
static ssize_t o2nm_cluster_attr_write(const char *page, ssize_t count,
unsigned int *val)
{
- unsigned long tmp;
- char *p = (char *)page;
-
- tmp = simple_strtoul(p, &p, 0);
- if (!p || (*p && (*p != '\n')))
- return -EINVAL;
+ unsigned int tmp;
+ int rv;
+ rv = kstrtouint(page, 0, &tmp);
+ if (rv < 0)
+ return rv;
if (tmp == 0)
return -EINVAL;
- if (tmp >= (u32)-1)
+ if (tmp >= (unsigned int)-1)
return -ERANGE;
*val = tmp;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 02878a83f0b4..ffecf89c8c1c 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -480,33 +480,26 @@ static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh)
trailer = ocfs2_trailer_from_bh(bh, dir->i_sb);
if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
- rc = -EINVAL;
- ocfs2_error(dir->i_sb,
- "Invalid dirblock #%llu: "
- "signature = %.*s\n",
- (unsigned long long)bh->b_blocknr, 7,
- trailer->db_signature);
+ rc = ocfs2_error(dir->i_sb,
+ "Invalid dirblock #%llu: signature = %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ trailer->db_signature);
goto out;
}
if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) {
- rc = -EINVAL;
- ocfs2_error(dir->i_sb,
- "Directory block #%llu has an invalid "
- "db_blkno of %llu",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)le64_to_cpu(trailer->db_blkno));
+ rc = ocfs2_error(dir->i_sb,
+ "Directory block #%llu has an invalid db_blkno of %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(trailer->db_blkno));
goto out;
}
if (le64_to_cpu(trailer->db_parent_dinode) !=
OCFS2_I(dir)->ip_blkno) {
- rc = -EINVAL;
- ocfs2_error(dir->i_sb,
- "Directory block #%llu on dinode "
- "#%llu has an invalid parent_dinode "
- "of %llu",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)OCFS2_I(dir)->ip_blkno,
- (unsigned long long)le64_to_cpu(trailer->db_blkno));
+ rc = ocfs2_error(dir->i_sb,
+ "Directory block #%llu on dinode #%llu has an invalid parent_dinode of %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)OCFS2_I(dir)->ip_blkno,
+ (unsigned long long)le64_to_cpu(trailer->db_blkno));
goto out;
}
out:
@@ -604,14 +597,13 @@ static int ocfs2_validate_dx_root(struct super_block *sb,
}
if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) {
- ocfs2_error(sb,
- "Dir Index Root # %llu has bad signature %.*s",
- (unsigned long long)le64_to_cpu(dx_root->dr_blkno),
- 7, dx_root->dr_signature);
- return -EINVAL;
+ ret = ocfs2_error(sb,
+ "Dir Index Root # %llu has bad signature %.*s\n",
+ (unsigned long long)le64_to_cpu(dx_root->dr_blkno),
+ 7, dx_root->dr_signature);
}
- return 0;
+ return ret;
}
static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di,
@@ -648,12 +640,11 @@ static int ocfs2_validate_dx_leaf(struct super_block *sb,
}
if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) {
- ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s",
- 7, dx_leaf->dl_signature);
- return -EROFS;
+ ret = ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s\n",
+ 7, dx_leaf->dl_signature);
}
- return 0;
+ return ret;
}
static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno,
@@ -812,11 +803,10 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
el = &eb->h_list;
if (el->l_tree_depth) {
- ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "btree tree block %llu\n", inode->i_ino,
- (unsigned long long)eb_bh->b_blocknr);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %lu has non zero tree depth in btree tree block %llu\n",
+ inode->i_ino,
+ (unsigned long long)eb_bh->b_blocknr);
goto out;
}
}
@@ -832,11 +822,11 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
}
if (!found) {
- ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
- "record (%u, %u, 0) in btree", inode->i_ino,
- le32_to_cpu(rec->e_cpos),
- ocfs2_rec_clusters(el, rec));
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %lu has bad extent record (%u, %u, 0) in btree\n",
+ inode->i_ino,
+ le32_to_cpu(rec->e_cpos),
+ ocfs2_rec_clusters(el, rec));
goto out;
}
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index fdf4b41d0609..46b8b2bbc95a 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -498,16 +498,6 @@ static void dlm_lockres_release(struct kref *kref)
mlog(0, "destroying lockres %.*s\n", res->lockname.len,
res->lockname.name);
- spin_lock(&dlm->track_lock);
- if (!list_empty(&res->tracking))
- list_del_init(&res->tracking);
- else {
- mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
- res->lockname.len, res->lockname.name);
- dlm_print_one_lock_resource(res);
- }
- spin_unlock(&dlm->track_lock);
-
atomic_dec(&dlm->res_cur_count);
if (!hlist_unhashed(&res->hash_node) ||
@@ -795,8 +785,18 @@ lookup:
dlm_lockres_grab_inflight_ref(dlm, tmpres);
spin_unlock(&tmpres->spinlock);
- if (res)
+ if (res) {
+ spin_lock(&dlm->track_lock);
+ if (!list_empty(&res->tracking))
+ list_del_init(&res->tracking);
+ else
+ mlog(ML_ERROR, "Resource %.*s not "
+ "on the Tracking list\n",
+ res->lockname.len,
+ res->lockname.name);
+ spin_unlock(&dlm->track_lock);
dlm_lockres_put(res);
+ }
res = tmpres;
goto leave;
}
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 69aac6f088ad..2e5e6d5fffe8 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -211,6 +211,16 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
__dlm_unhash_lockres(dlm, res);
+ spin_lock(&dlm->track_lock);
+ if (!list_empty(&res->tracking))
+ list_del_init(&res->tracking);
+ else {
+ mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
+ res->lockname.len, res->lockname.name);
+ __dlm_print_one_lock_resource(res);
+ }
+ spin_unlock(&dlm->track_lock);
+
/* lockres is not in the hash now. drop the flag and wake up
* any processes waiting in dlm_get_lock_resource. */
if (!master) {
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 8b23aa2f52dd..23157e40dd74 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -4025,9 +4025,13 @@ static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
osb->dc_work_sequence = osb->dc_wake_sequence;
processed = osb->blocked_lock_count;
- while (processed) {
- BUG_ON(list_empty(&osb->blocked_lock_list));
-
+ /*
+ * blocked lock processing in this loop might call iput which can
+ * remove items off osb->blocked_lock_list. Downconvert up to
+ * 'processed' number of locks, but stop short if we had some
+ * removed in ocfs2_mark_lockres_freeing when downconverting.
+ */
+ while (processed && !list_empty(&osb->blocked_lock_list)) {
lockres = list_entry(osb->blocked_lock_list.next,
struct ocfs2_lock_res, l_blocked_list);
list_del_init(&lockres->l_blocked_list);
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 767370b656ca..e4719e0a3f99 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -305,8 +305,8 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
if (el->l_tree_depth) {
ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "leaf block %llu\n", inode->i_ino,
+ "Inode %lu has non zero tree depth in leaf block %llu\n",
+ inode->i_ino,
(unsigned long long)eb_bh->b_blocknr);
ret = -EROFS;
goto out;
@@ -441,8 +441,8 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
if (el->l_tree_depth) {
ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "leaf block %llu\n", inode->i_ino,
+ "Inode %lu has non zero tree depth in leaf block %llu\n",
+ inode->i_ino,
(unsigned long long)eb_bh->b_blocknr);
ret = -EROFS;
goto out;
@@ -475,8 +475,9 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
if (!rec->e_blkno) {
- ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
- "record (%u, %u, 0)", inode->i_ino,
+ ocfs2_error(inode->i_sb,
+ "Inode %lu has bad extent record (%u, %u, 0)\n",
+ inode->i_ino,
le32_to_cpu(rec->e_cpos),
ocfs2_rec_clusters(el, rec));
ret = -EROFS;
@@ -564,8 +565,8 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
if (el->l_tree_depth) {
ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "xattr leaf block %llu\n", inode->i_ino,
+ "Inode %lu has non zero tree depth in xattr leaf block %llu\n",
+ inode->i_ino,
(unsigned long long)eb_bh->b_blocknr);
ret = -EROFS;
goto out;
@@ -582,8 +583,9 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
if (!rec->e_blkno) {
- ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
- "record (%u, %u, 0) in xattr", inode->i_ino,
+ ocfs2_error(inode->i_sb,
+ "Inode %lu has bad extent record (%u, %u, 0) in xattr\n",
+ inode->i_ino,
le32_to_cpu(rec->e_cpos),
ocfs2_rec_clusters(el, rec));
ret = -EROFS;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 7210583b472f..373a34f97452 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1130,6 +1130,7 @@ out:
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
{
int status = 0, size_change;
+ int inode_locked = 0;
struct inode *inode = d_inode(dentry);
struct super_block *sb = inode->i_sb;
struct ocfs2_super *osb = OCFS2_SB(sb);
@@ -1178,6 +1179,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
mlog_errno(status);
goto bail_unlock_rw;
}
+ inode_locked = 1;
if (size_change) {
status = inode_newsize_ok(inode, attr->ia_size);
@@ -1258,7 +1260,10 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
bail_commit:
ocfs2_commit_trans(osb, handle);
bail_unlock:
- ocfs2_inode_unlock(inode, 1);
+ if (status) {
+ ocfs2_inode_unlock(inode, 1);
+ inode_locked = 0;
+ }
bail_unlock_rw:
if (size_change)
ocfs2_rw_unlock(inode, 1);
@@ -1274,6 +1279,8 @@ bail:
if (status < 0)
mlog_errno(status);
}
+ if (inode_locked)
+ ocfs2_inode_unlock(inode, 1);
return status;
}
@@ -2262,8 +2269,6 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
ssize_t written = 0;
ssize_t ret;
size_t count = iov_iter_count(from), orig_count;
- loff_t old_size;
- u32 old_clusters;
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -2271,6 +2276,8 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
OCFS2_MOUNT_COHERENCY_BUFFERED);
int unaligned_dio = 0;
int dropped_dio = 0;
+ int append_write = ((iocb->ki_pos + count) >=
+ i_size_read(inode) ? 1 : 0);
trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -2290,8 +2297,9 @@ relock:
/*
* Concurrent O_DIRECT writes are allowed with
* mount_option "coherency=buffered".
+ * For append write, we must take rw EX.
*/
- rw_level = (!direct_io || full_coherency);
+ rw_level = (!direct_io || full_coherency || append_write);
ret = ocfs2_rw_lock(inode, rw_level);
if (ret < 0) {
@@ -2364,13 +2372,6 @@ relock:
ocfs2_iocb_set_unaligned_aio(iocb);
}
- /*
- * To later detect whether a journal commit for sync writes is
- * necessary, we sample i_size, and cluster count here.
- */
- old_size = i_size_read(inode);
- old_clusters = OCFS2_I(inode)->ip_clusters;
-
/* communicate with ocfs2_dio_end_io */
ocfs2_iocb_set_rw_locked(iocb, rw_level);
@@ -2416,7 +2417,7 @@ no_sync:
unaligned_dio = 0;
}
- if (unaligned_dio) {
+ if (unaligned_dio && ocfs2_iocb_is_unaligned_aio(iocb)) {
ocfs2_iocb_clear_unaligned_aio(iocb);
mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
}
diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c
new file mode 100644
index 000000000000..3332af147bb4
--- /dev/null
+++ b/fs/ocfs2/filecheck.c
@@ -0,0 +1,569 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * filecheck.c
+ *
+ * Code which implements online file check.
+ *
+ * Copyright (C) 2007, 2009 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/kmod.h>
+#include <linux/fs.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/sysctl.h>
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "ocfs2_fs.h"
+#include "stackglue.h"
+#include "inode.h"
+
+#include "filecheck.h"
+
+
+/*
+ * File check error strings, laid out the way ocfs2_filecheck_error()
+ * indexes them: slot 0 is the success string and slot
+ * (err - OCFS2_FILECHECK_ERR_START + 1) is the string for error number err.
+ * The order must therefore correspond with the error numbers in the
+ * header file.
+ */
+static const char * const ocfs2_filecheck_errs[] = {
+ "SUCCESS",
+ "FAILED",
+ "INPROGRESS",
+ "READONLY",
+ "INVALIDINO",
+ "BLOCKECC",
+ "BLOCKNO",
+ "VALIDFLAG",
+ "GENERATION",
+ "UNSUPPORTED"
+};
+
+static DEFINE_SPINLOCK(ocfs2_filecheck_sysfs_lock);
+static LIST_HEAD(ocfs2_filecheck_sysfs_list);
+
+/* Per-device queue of file check requests and their results. */
+struct ocfs2_filecheck {
+ struct list_head fc_head; /* File check entry list head */
+ spinlock_t fc_lock; /* Taken around accesses to the list and counters */
+ unsigned int fc_max; /* Maximum number of entry in list */
+ unsigned int fc_size; /* Current entry count in list */
+ unsigned int fc_done; /* File check entries are done in list */
+};
+
+/*
+ * One node of ocfs2_filecheck_sysfs_list, tying a super block to its
+ * sysfs kset and its file check queue.
+ */
+struct ocfs2_filecheck_sysfs_entry {
+ struct list_head fs_list; /* Link in ocfs2_filecheck_sysfs_list */
+ atomic_t fs_count; /* Starts at 1; +1 per sysfs_get, -1 per sysfs_put */
+ struct super_block *fs_sb; /* s_id is the lookup key */
+ struct kset *fs_kset; /* kset created under ocfs2_kset */
+ struct ocfs2_filecheck *fs_fcheck; /* Request queue, freed with the entry */
+};
+
+#define OCFS2_FILECHECK_MAXSIZE 100
+#define OCFS2_FILECHECK_MINSIZE 10
+
+/*
+ * File check operation type, selected by the keyword written to the
+ * filecheck attribute ("CHECK"/"check", "FIX"/"fix" or "SET"/"set").
+ */
+enum {
+ OCFS2_FILECHECK_TYPE_CHK = 0, /* Check a file */
+ OCFS2_FILECHECK_TYPE_FIX, /* Fix a file */
+ OCFS2_FILECHECK_TYPE_SET = 100 /* Set file check options */
+};
+
+/* One queued file check request and, once handled, its result. */
+struct ocfs2_filecheck_entry {
+ struct list_head fe_list; /* Link in ocfs2_filecheck.fc_head */
+ unsigned long fe_ino; /* Inode number to check or fix */
+ unsigned int fe_type; /* OCFS2_FILECHECK_TYPE_CHK or _FIX */
+ unsigned short fe_done:1; /* Set to 1 once the request was handled */
+ unsigned short fe_status:15; /* An OCFS2_FILECHECK_ERR_* value */
+};
+
+/* Parsed form of one command written to the filecheck attribute. */
+struct ocfs2_filecheck_args {
+ unsigned int fa_type; /* OCFS2_FILECHECK_TYPE_* */
+ union {
+ unsigned long fa_ino; /* Filled in for the CHK and FIX types */
+ unsigned int fa_len; /* Filled in for the SET type */
+ };
+};
+
+/*
+ * Translate a file check error number into its display string.
+ *
+ * Zero maps to the first (success) string. Any other value must lie in
+ * [OCFS2_FILECHECK_ERR_START, OCFS2_FILECHECK_ERR_END]; anything else
+ * trips BUG_ON().
+ */
+static const char *
+ocfs2_filecheck_error(int errno)
+{
+	int slot;
+
+	if (errno == 0)
+		return ocfs2_filecheck_errs[0];
+
+	BUG_ON(errno < OCFS2_FILECHECK_ERR_START ||
+	       errno > OCFS2_FILECHECK_ERR_END);
+
+	/* Slot 0 is the success string, so real errors start at slot 1. */
+	slot = errno - OCFS2_FILECHECK_ERR_START + 1;
+	return ocfs2_filecheck_errs[slot];
+}
+
+static ssize_t ocfs2_filecheck_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf);
+static ssize_t ocfs2_filecheck_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count);
+/*
+ * The "filecheck" attribute: owner read/write only (S_IRUSR | S_IWUSR).
+ * Reads go to ocfs2_filecheck_show(), writes to ocfs2_filecheck_store().
+ */
+static struct kobj_attribute ocfs2_attr_filecheck =
+ __ATTR(filecheck, S_IRUSR | S_IWUSR,
+ ocfs2_filecheck_show,
+ ocfs2_filecheck_store);
+
+/*
+ * Action callback handed to wait_on_atomic_t() by
+ * ocfs2_filecheck_sysfs_free(): calls schedule() and returns 0.
+ * The atomic_t argument is unused.
+ */
+static int ocfs2_filecheck_sysfs_wait(atomic_t *p)
+{
+ schedule();
+ return 0;
+}
+
+/*
+ * Drop the creation reference on @entry, wait until fs_count reaches
+ * zero if other holders remain, then release every queued request, the
+ * kset, the queue and the entry itself.
+ *
+ * Every request still on the list must already be done; an undone one
+ * trips BUG_ON().
+ */
+static void
+ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry)
+{
+	struct ocfs2_filecheck *fcheck = entry->fs_fcheck;
+	struct ocfs2_filecheck_entry *req;
+
+	if (!atomic_dec_and_test(&entry->fs_count))
+		wait_on_atomic_t(&entry->fs_count, ocfs2_filecheck_sysfs_wait,
+				 TASK_UNINTERRUPTIBLE);
+
+	spin_lock(&fcheck->fc_lock);
+	for (;;) {
+		if (list_empty(&fcheck->fc_head))
+			break;
+
+		req = list_first_entry(&fcheck->fc_head,
+				       struct ocfs2_filecheck_entry, fe_list);
+		list_del(&req->fe_list);
+		BUG_ON(!req->fe_done); /* To free a undone file check entry */
+		kfree(req);
+	}
+	spin_unlock(&fcheck->fc_lock);
+
+	kset_unregister(entry->fs_kset);
+	kfree(fcheck);
+	kfree(entry);
+}
+
+/*
+ * Append @entry to the global ocfs2_filecheck_sysfs_list while holding
+ * ocfs2_filecheck_sysfs_lock.
+ */
+static void
+ocfs2_filecheck_sysfs_add(struct ocfs2_filecheck_sysfs_entry *entry)
+{
+ spin_lock(&ocfs2_filecheck_sysfs_lock);
+ list_add_tail(&entry->fs_list, &ocfs2_filecheck_sysfs_list);
+ spin_unlock(&ocfs2_filecheck_sysfs_lock);
+}
+
+/*
+ * Unlink the entry whose super block s_id equals @devname from the
+ * global list and free it.
+ *
+ * The entry is freed only after ocfs2_filecheck_sysfs_lock has been
+ * dropped. Returns 0 when an entry was found and freed, 1 otherwise.
+ */
+static int ocfs2_filecheck_sysfs_del(const char *devname)
+{
+	struct ocfs2_filecheck_sysfs_entry *cur;
+	struct ocfs2_filecheck_sysfs_entry *match = NULL;
+
+	spin_lock(&ocfs2_filecheck_sysfs_lock);
+	list_for_each_entry(cur, &ocfs2_filecheck_sysfs_list, fs_list) {
+		if (strcmp(cur->fs_sb->s_id, devname) != 0)
+			continue;
+
+		list_del(&cur->fs_list);
+		match = cur;
+		break;
+	}
+	spin_unlock(&ocfs2_filecheck_sysfs_lock);
+
+	if (match == NULL)
+		return 1;
+
+	ocfs2_filecheck_sysfs_free(match);
+	return 0;
+}
+
+/*
+ * Drop one reference on @entry. When the count reaches zero, wake any
+ * waiter sleeping on fs_count via wake_up_atomic_t()
+ * (ocfs2_filecheck_sysfs_free() waits on it with wait_on_atomic_t()).
+ */
+static void
+ocfs2_filecheck_sysfs_put(struct ocfs2_filecheck_sysfs_entry *entry)
+{
+ if (atomic_dec_and_test(&entry->fs_count))
+ wake_up_atomic_t(&entry->fs_count);
+}
+
+/*
+ * Look up the entry whose super block s_id equals @devname and take a
+ * reference on it.
+ *
+ * Returns the entry with fs_count incremented, or NULL if no entry
+ * matches. The caller releases it with ocfs2_filecheck_sysfs_put().
+ */
+static struct ocfs2_filecheck_sysfs_entry *
+ocfs2_filecheck_sysfs_get(const char *devname)
+{
+	struct ocfs2_filecheck_sysfs_entry *cur;
+	struct ocfs2_filecheck_sysfs_entry *match = NULL;
+
+	spin_lock(&ocfs2_filecheck_sysfs_lock);
+	list_for_each_entry(cur, &ocfs2_filecheck_sysfs_list, fs_list) {
+		if (strcmp(cur->fs_sb->s_id, devname) == 0) {
+			/* Take the reference while the list lock is held. */
+			atomic_inc(&cur->fs_count);
+			match = cur;
+			break;
+		}
+	}
+	spin_unlock(&ocfs2_filecheck_sysfs_lock);
+
+	return match;
+}
+
+/*
+ * Create the per-device file check sysfs directory for @sb.
+ *
+ * A kset named after sb->s_id is created under ocfs2_kset, the
+ * "filecheck" attribute is added to it, and a tracking entry holding an
+ * empty request queue (limit OCFS2_FILECHECK_MINSIZE) is put on the
+ * global list.
+ *
+ * Returns 0 on success, -ENOMEM when ocfs2_kset is missing or an
+ * allocation fails, -ENODEV when sb->s_id is empty, or the error from
+ * sysfs_create_group().
+ */
+int ocfs2_filecheck_create_sysfs(struct super_block *sb)
+{
+	int ret = 0;
+	struct kset *kset = NULL;
+	struct ocfs2_filecheck *fcheck = NULL;
+	struct ocfs2_filecheck_sysfs_entry *entry = NULL;
+	struct attribute **attrs = NULL;
+	struct attribute_group attrgp;
+
+	if (!ocfs2_kset)
+		return -ENOMEM;
+
+	/* NULL-terminated attribute array, only needed during creation. */
+	attrs = kmalloc(sizeof(struct attribute *) * 2, GFP_NOFS);
+	if (!attrs) {
+		ret = -ENOMEM;
+		goto error;
+	}
+	attrs[0] = &ocfs2_attr_filecheck.attr;
+	attrs[1] = NULL;
+	memset(&attrgp, 0, sizeof(attrgp));
+	attrgp.attrs = attrs;
+
+	fcheck = kmalloc(sizeof(struct ocfs2_filecheck), GFP_NOFS);
+	if (!fcheck) {
+		ret = -ENOMEM;
+		goto error;
+	}
+	INIT_LIST_HEAD(&fcheck->fc_head);
+	spin_lock_init(&fcheck->fc_lock);
+	fcheck->fc_max = OCFS2_FILECHECK_MINSIZE;
+	fcheck->fc_size = 0;
+	fcheck->fc_done = 0;
+
+	if (sb->s_id[0] == '\0') {
+		mlog(ML_ERROR,
+		     "Cannot get device basename when create filecheck sysfs\n");
+		ret = -ENODEV;
+		goto error;
+	}
+
+	kset = kset_create_and_add(sb->s_id, NULL, &ocfs2_kset->kobj);
+	if (!kset) {
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	ret = sysfs_create_group(&kset->kobj, &attrgp);
+	if (ret)
+		goto error;
+
+	entry = kmalloc(sizeof(struct ocfs2_filecheck_sysfs_entry), GFP_NOFS);
+	if (!entry) {
+		ret = -ENOMEM;
+		goto error;
+	}
+	atomic_set(&entry->fs_count, 1);
+	entry->fs_sb = sb;
+	entry->fs_kset = kset;
+	entry->fs_fcheck = fcheck;
+	ocfs2_filecheck_sysfs_add(entry);
+
+	kfree(attrs);
+	return 0;
+
+error:
+	kfree(attrs);
+	kfree(entry);
+	kfree(fcheck);
+	kset_unregister(kset);
+	return ret;
+}
+
+/*
+ * Remove the file check sysfs entry registered for @sb, looked up by
+ * sb->s_id. Returns the result of ocfs2_filecheck_sysfs_del(): 0 when an
+ * entry was found and freed, 1 when none matched.
+ */
+int ocfs2_filecheck_remove_sysfs(struct super_block *sb)
+{
+ return ocfs2_filecheck_sysfs_del(sb->s_id);
+}
+
+static int
+ocfs2_filecheck_erase_entries(struct ocfs2_filecheck_sysfs_entry *ent,
+ unsigned int count);
+/*
+ * Change the maximum number of entries kept in the file check list.
+ *
+ * @len must lie in [OCFS2_FILECHECK_MINSIZE, OCFS2_FILECHECK_MAXSIZE],
+ * otherwise -EINVAL is returned. If more entries are still pending
+ * (fc_size - fc_done) than @len allows, -EBUSY is returned. Otherwise
+ * enough done entries are erased to fit and 0 is returned.
+ */
+static int
+ocfs2_filecheck_adjust_max(struct ocfs2_filecheck_sysfs_entry *ent,
+			   unsigned int len)
+{
+	struct ocfs2_filecheck *fcheck = ent->fs_fcheck;
+	unsigned int pending;
+	int ret = 0;
+
+	if ((len < OCFS2_FILECHECK_MINSIZE) || (len > OCFS2_FILECHECK_MAXSIZE))
+		return -EINVAL;
+
+	spin_lock(&fcheck->fc_lock);
+	pending = fcheck->fc_size - fcheck->fc_done;
+	if (len < pending) {
+		mlog(ML_ERROR,
+		     "Cannot set online file check maximum entry number "
+		     "to %u due to too many pending entries(%u)\n",
+		     len, pending);
+		ret = -EBUSY;
+		goto unlock;
+	}
+
+	if (len < fcheck->fc_size) {
+		/* Enough done entries exist here, so erasing must succeed. */
+		int erased = ocfs2_filecheck_erase_entries(ent,
+						fcheck->fc_size - len);
+
+		BUG_ON(!erased);
+	}
+	fcheck->fc_max = len;
+
+unlock:
+	spin_unlock(&fcheck->fc_lock);
+
+	return ret;
+}
+
+#define OCFS2_FILECHECK_ARGS_LEN 32
+/*
+ * Parse the first @count bytes of @buf as an unsigned long.
+ *
+ * The bytes are copied into a NUL-terminated stack buffer and handed to
+ * kstrtoul() with base 0. @count must leave room for the terminator, so
+ * an input of OCFS2_FILECHECK_ARGS_LEN bytes or more is rejected here
+ * rather than relying on every caller to bound it.
+ *
+ * Returns 0 and stores the value in @val on success, 1 on any failure.
+ */
+static int
+ocfs2_filecheck_args_get_long(const char *buf, size_t count,
+			      unsigned long *val)
+{
+	char buffer[OCFS2_FILECHECK_ARGS_LEN];
+
+	/* Need at least one byte, and space for the trailing '\0'. */
+	if (count < 1 || count >= sizeof(buffer))
+		return 1;
+
+	memcpy(buffer, buf, count);
+	buffer[count] = '\0';
+
+	if (kstrtoul(buffer, 0, val))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Parse a command written to the filecheck attribute.
+ *
+ * Accepted forms are "FIX <n>", "CHECK <n>" and "SET <n>", with the
+ * keyword all upper case or all lower case. For FIX and CHECK the
+ * number is stored in args->fa_ino, for SET in args->fa_len.
+ *
+ * Returns 0 on success, or 1 if @count is outside
+ * [5, OCFS2_FILECHECK_ARGS_LEN], the keyword is unknown, or the number
+ * does not parse. @args is left untouched on failure.
+ */
+static int
+ocfs2_filecheck_args_parse(const char *buf, size_t count,
+			   struct ocfs2_filecheck_args *args)
+{
+	unsigned long val = 0;
+	unsigned int type;
+	size_t skip;
+
+	/* too short/long args length */
+	if ((count < 5) || (count > OCFS2_FILECHECK_ARGS_LEN))
+		return 1;
+
+	if (!strncmp(buf, "FIX ", 4) || !strncmp(buf, "fix ", 4)) {
+		type = OCFS2_FILECHECK_TYPE_FIX;
+		skip = 4;
+	} else if (!strncmp(buf, "CHECK ", 6) || !strncmp(buf, "check ", 6)) {
+		type = OCFS2_FILECHECK_TYPE_CHK;
+		skip = 6;
+	} else if (!strncmp(buf, "SET ", 4) || !strncmp(buf, "set ", 4)) {
+		type = OCFS2_FILECHECK_TYPE_SET;
+		skip = 4;
+	} else {
+		/* invalid args */
+		return 1;
+	}
+
+	if (ocfs2_filecheck_args_get_long(buf + skip, count - skip, &val))
+		return 1;
+
+	args->fa_type = type;
+	if (type == OCFS2_FILECHECK_TYPE_SET)
+		args->fa_len = (unsigned int)val;
+	else
+		args->fa_ino = val;
+
+	return 0;
+}
+
+/*
+ * sysfs read handler: print one header line and then one line per
+ * queued file check entry (inode, type, done flag, status string).
+ *
+ * Returns the number of bytes written to @buf, -ENODEV when no entry is
+ * registered under kobj->name, -E2BIG when the listing does not fit in
+ * PAGE_SIZE, or a negative snprintf() result.
+ */
+static ssize_t ocfs2_filecheck_show(struct kobject *kobj,
+				    struct kobj_attribute *attr,
+				    char *buf)
+{
+	ssize_t ret = 0, total = 0, remain = PAGE_SIZE;
+	struct ocfs2_filecheck_entry *p;
+	struct ocfs2_filecheck_sysfs_entry *ent;
+
+	ent = ocfs2_filecheck_sysfs_get(kobj->name);
+	if (!ent) {
+		mlog(ML_ERROR,
+		     "Cannot get the corresponding entry via device basename %s\n",
+		     kobj->name);
+		return -ENODEV;
+	}
+
+	spin_lock(&ent->fs_fcheck->fc_lock);
+	ret = snprintf(buf, remain, "INO\t\tTYPE\tDONE\tERROR\n");
+	total += ret;
+	remain -= ret;
+
+	list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) {
+		ret = snprintf(buf + total, remain, "%lu\t\t%u\t%u\t%s\n",
+			       p->fe_ino, p->fe_type, p->fe_done,
+			       ocfs2_filecheck_error(p->fe_status));
+		if (ret < 0) {
+			total = ret;
+			break;
+		}
+		/*
+		 * snprintf() returns the length the output would have had,
+		 * so any value >= remain means the line was truncated.
+		 * Testing only for equality would let remain go negative
+		 * and the next call write past the page.
+		 */
+		if (ret >= remain) {
+			/* snprintf() didn't fit */
+			total = -E2BIG;
+			break;
+		}
+		total += ret;
+		remain -= ret;
+	}
+	spin_unlock(&ent->fs_fcheck->fc_lock);
+
+	ocfs2_filecheck_sysfs_put(ent);
+	return total;
+}
+
+/*
+ * Remove and free the first done entry found on the file check list,
+ * decrementing both fc_size and fc_done.
+ *
+ * Returns 1 if an entry was erased, 0 if no entry is marked done.
+ * Takes no lock itself; the visible callers hold fc_lock.
+ */
+static int
+ocfs2_filecheck_erase_entry(struct ocfs2_filecheck_sysfs_entry *ent)
+{
+	struct ocfs2_filecheck *fcheck = ent->fs_fcheck;
+	struct ocfs2_filecheck_entry *cur;
+	struct ocfs2_filecheck_entry *victim = NULL;
+
+	list_for_each_entry(cur, &fcheck->fc_head, fe_list) {
+		if (cur->fe_done) {
+			victim = cur;
+			break;
+		}
+	}
+
+	if (victim == NULL)
+		return 0;
+
+	list_del(&victim->fe_list);
+	kfree(victim);
+	fcheck->fc_size--;
+	fcheck->fc_done--;
+	return 1;
+}
+
+/*
+ * Erase up to @count done entries from the file check list, stopping at
+ * the first failure.
+ *
+ * Returns 1 when exactly @count entries were erased (including the
+ * trivial @count == 0 case), 0 otherwise.
+ */
+static int
+ocfs2_filecheck_erase_entries(struct ocfs2_filecheck_sysfs_entry *ent,
+			      unsigned int count)
+{
+	unsigned int erased;
+
+	for (erased = 0; erased < count; erased++) {
+		if (!ocfs2_filecheck_erase_entry(ent))
+			break;
+	}
+
+	return erased == count;
+}
+
+/*
+ * Mark @entry as done and account for it in fc_done.
+ *
+ * The flag and the counter are updated in one fc_lock critical section.
+ * Readers such as ocfs2_filecheck_show() and
+ * ocfs2_filecheck_erase_entry() test fe_done under that lock, so they
+ * must never see a done entry that fc_done does not count yet.
+ */
+static void
+ocfs2_filecheck_done_entry(struct ocfs2_filecheck_sysfs_entry *ent,
+			   struct ocfs2_filecheck_entry *entry)
+{
+	spin_lock(&ent->fs_fcheck->fc_lock);
+	entry->fe_done = 1;
+	ent->fs_fcheck->fc_done++;
+	spin_unlock(&ent->fs_fcheck->fc_lock);
+}
+
+/*
+ * Run one file check by reading inode @ino through ocfs2_iget() with
+ * the given file check @flags.
+ *
+ * Returns OCFS2_FILECHECK_ERR_SUCCESS when the inode was obtained (it is
+ * released again with iput()). On failure the negated error is returned
+ * if it lies in [OCFS2_FILECHECK_ERR_START, OCFS2_FILECHECK_ERR_END);
+ * every other error, OCFS2_FILECHECK_ERR_END itself included, is
+ * reported as OCFS2_FILECHECK_ERR_FAILED.
+ */
+static unsigned short
+ocfs2_filecheck_handle(struct super_block *sb,
+		       unsigned long ino, unsigned int flags)
+{
+	struct inode *inode;
+	int rc;
+
+	inode = ocfs2_iget(OCFS2_SB(sb), ino, flags, 0);
+	if (!IS_ERR(inode)) {
+		iput(inode);
+		return OCFS2_FILECHECK_ERR_SUCCESS;
+	}
+
+	/* The error pointer carries the negated file check error number. */
+	rc = (int)(-PTR_ERR(inode));
+	if (rc < OCFS2_FILECHECK_ERR_START || rc >= OCFS2_FILECHECK_ERR_END)
+		return OCFS2_FILECHECK_ERR_FAILED;
+
+	return rc;
+}
+
+/*
+ * Execute the request described by @entry and record its outcome.
+ *
+ * CHK and FIX requests are passed to ocfs2_filecheck_handle() with the
+ * matching OCFS2_FI_FLAG_FILECHECK_* flag; any other type is recorded
+ * as OCFS2_FILECHECK_ERR_UNSUPPORTED. The entry is then marked done.
+ */
+static void
+ocfs2_filecheck_handle_entry(struct ocfs2_filecheck_sysfs_entry *ent,
+			     struct ocfs2_filecheck_entry *entry)
+{
+	switch (entry->fe_type) {
+	case OCFS2_FILECHECK_TYPE_CHK:
+		entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb,
+				entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_CHK);
+		break;
+	case OCFS2_FILECHECK_TYPE_FIX:
+		entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb,
+				entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_FIX);
+		break;
+	default:
+		entry->fe_status = OCFS2_FILECHECK_ERR_UNSUPPORTED;
+		break;
+	}
+
+	ocfs2_filecheck_done_entry(ent, entry);
+}
+
+/*
+ * sysfs write handler for the filecheck attribute.
+ *
+ * "SET <n>" adjusts the queue limit. "CHECK <ino>" and "FIX <ino>"
+ * queue a request and run it before returning. When the queue is full,
+ * the oldest done entry is dropped to make room.
+ *
+ * Returns @count on success, -EINVAL for a malformed command, -ENODEV
+ * when no entry is registered under kobj->name, -ENOMEM when the
+ * request cannot be allocated, -EBUSY when the queue is full of pending
+ * requests, or the error from ocfs2_filecheck_adjust_max().
+ */
+static ssize_t ocfs2_filecheck_store(struct kobject *kobj,
+				     struct kobj_attribute *attr,
+				     const char *buf, size_t count)
+{
+	struct ocfs2_filecheck_args args;
+	struct ocfs2_filecheck_entry *entry;
+	struct ocfs2_filecheck_sysfs_entry *ent;
+	ssize_t ret = 0;
+
+	if (count == 0)
+		return count;
+
+	if (ocfs2_filecheck_args_parse(buf, count, &args)) {
+		mlog(ML_ERROR, "Invalid arguments for online file check\n");
+		return -EINVAL;
+	}
+
+	ent = ocfs2_filecheck_sysfs_get(kobj->name);
+	if (!ent) {
+		mlog(ML_ERROR,
+		     "Cannot get the corresponding entry via device basename %s\n",
+		     kobj->name);
+		return -ENODEV;
+	}
+
+	if (args.fa_type == OCFS2_FILECHECK_TYPE_SET) {
+		ret = ocfs2_filecheck_adjust_max(ent, args.fa_len);
+		goto exit;
+	}
+
+	/*
+	 * A GFP_NOFS allocation may sleep, so it must happen before
+	 * fc_lock (a spinlock) is taken, not inside the critical section.
+	 */
+	entry = kmalloc(sizeof(*entry), GFP_NOFS);
+	if (!entry) {
+		ret = -ENOMEM;
+		goto exit;
+	}
+
+	spin_lock(&ent->fs_fcheck->fc_lock);
+	if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) &&
+	    (ent->fs_fcheck->fc_done == 0)) {
+		mlog(ML_ERROR,
+		     "Online file check queue(%u) is full\n",
+		     ent->fs_fcheck->fc_max);
+		ret = -EBUSY;
+		kfree(entry);
+		entry = NULL;
+	} else {
+		if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) &&
+		    (ent->fs_fcheck->fc_done > 0)) {
+			/* Delete the oldest entry which was done,
+			 * make sure the entry size in list does
+			 * not exceed maximum value
+			 */
+			BUG_ON(!ocfs2_filecheck_erase_entry(ent));
+		}
+
+		entry->fe_ino = args.fa_ino;
+		entry->fe_type = args.fa_type;
+		entry->fe_done = 0;
+		entry->fe_status = OCFS2_FILECHECK_ERR_INPROGRESS;
+		list_add_tail(&entry->fe_list, &ent->fs_fcheck->fc_head);
+		ent->fs_fcheck->fc_size++;
+		ret = count;
+	}
+	spin_unlock(&ent->fs_fcheck->fc_lock);
+
+	/* Run the check outside the spinlock; it reads the inode. */
+	if (entry)
+		ocfs2_filecheck_handle_entry(ent, entry);
+
+exit:
+	ocfs2_filecheck_sysfs_put(ent);
+	return (!ret ? count : ret);
+}
diff --git a/fs/ocfs2/filecheck.h b/fs/ocfs2/filecheck.h
new file mode 100644
index 000000000000..c65fee927387
--- /dev/null
+++ b/fs/ocfs2/filecheck.h
@@ -0,0 +1,48 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * filecheck.h
+ *
+ * Online file check.
+ *
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+
+#ifndef FILECHECK_H
+#define FILECHECK_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+
+
+/*
+ * File check result codes.
+ *
+ * These are filecheck-specific status values, not regular errno numbers.
+ * The inode block checkers in inode.c return them negated (for example
+ * -OCFS2_FILECHECK_ERR_BLOCKECC), and a freshly queued check entry starts
+ * out as OCFS2_FILECHECK_ERR_INPROGRESS.
+ * NOTE(review): the failure codes start at 1000, presumably to keep them
+ * apart from ordinary errno values - confirm.
+ */
+enum {
+ OCFS2_FILECHECK_ERR_SUCCESS = 0, /* Success */
+ OCFS2_FILECHECK_ERR_FAILED = 1000, /* Other failure */
+ OCFS2_FILECHECK_ERR_INPROGRESS, /* In progress */
+ OCFS2_FILECHECK_ERR_READONLY, /* Repair refused: filesystem is read only */
+ OCFS2_FILECHECK_ERR_INVALIDINO, /* Invalid ino (bad dinode signature) */
+ OCFS2_FILECHECK_ERR_BLOCKECC, /* Block ecc (metadata checksum) mismatch */
+ OCFS2_FILECHECK_ERR_BLOCKNO, /* Block number stored in dinode is wrong */
+ OCFS2_FILECHECK_ERR_VALIDFLAG, /* Inode valid flag is not set */
+ OCFS2_FILECHECK_ERR_GENERATION, /* Inode generation mismatch */
+ OCFS2_FILECHECK_ERR_UNSUPPORTED /* Unsupported */
+};
+
+/* First and last value of the failure code range defined above. */
+#define OCFS2_FILECHECK_ERR_START OCFS2_FILECHECK_ERR_FAILED
+#define OCFS2_FILECHECK_ERR_END OCFS2_FILECHECK_ERR_UNSUPPORTED
+
+/*
+ * NOTE(review): judging by the names, these create and remove the
+ * filecheck sysfs entries for a mounted super block; the implementation
+ * is not part of this header - confirm against filecheck.c.
+ */
+int ocfs2_filecheck_create_sysfs(struct super_block *sb);
+int ocfs2_filecheck_remove_sysfs(struct super_block *sb);
+
+#endif /* FILECHECK_H */
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index b254416dc8d9..62a47ad3a8fb 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -53,6 +53,7 @@
#include "xattr.h"
#include "refcounttree.h"
#include "ocfs2_trace.h"
+#include "filecheck.h"
#include "buffer_head_io.h"
@@ -74,6 +75,13 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
struct inode *inode,
struct buffer_head *fe_bh);
+static int ocfs2_filecheck_read_inode_block_full(struct inode *inode,
+ struct buffer_head **bh, int flags, int type);
+static int ocfs2_filecheck_validate_inode_block(struct super_block *sb,
+ struct buffer_head *bh);
+static int ocfs2_filecheck_repair_inode_block(struct super_block *sb,
+ struct buffer_head *bh);
+
void ocfs2_set_inode_flags(struct inode *inode)
{
unsigned int flags = OCFS2_I(inode)->ip_attr;
@@ -127,6 +135,7 @@ struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno)
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
int sysfile_type)
{
+ int rc = 0;
struct inode *inode = NULL;
struct super_block *sb = osb->sb;
struct ocfs2_find_inode_args args;
@@ -161,12 +170,17 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
}
trace_ocfs2_iget5_locked(inode->i_state);
if (inode->i_state & I_NEW) {
- ocfs2_read_locked_inode(inode, &args);
+ rc = ocfs2_read_locked_inode(inode, &args);
unlock_new_inode(inode);
}
if (is_bad_inode(inode)) {
iput(inode);
- inode = ERR_PTR(-ESTALE);
+ if ((flags & OCFS2_FI_FLAG_FILECHECK_CHK) ||
+ (flags & OCFS2_FI_FLAG_FILECHECK_FIX))
+ /* Return OCFS2_FILECHECK_ERR_XXX related errno */
+ inode = ERR_PTR(rc);
+ else
+ inode = ERR_PTR(-ESTALE);
goto bail;
}
@@ -494,16 +508,32 @@ static int ocfs2_read_locked_inode(struct inode *inode,
}
if (can_lock) {
- status = ocfs2_read_inode_block_full(inode, &bh,
- OCFS2_BH_IGNORE_CACHE);
+ if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK)
+ status = ocfs2_filecheck_read_inode_block_full(inode,
+ &bh, OCFS2_BH_IGNORE_CACHE, 0);
+ else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX)
+ status = ocfs2_filecheck_read_inode_block_full(inode,
+ &bh, OCFS2_BH_IGNORE_CACHE, 1);
+ else
+ status = ocfs2_read_inode_block_full(inode,
+ &bh, OCFS2_BH_IGNORE_CACHE);
} else {
status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
/*
* If buffer is in jbd, then its checksum may not have been
* computed as yet.
*/
- if (!status && !buffer_jbd(bh))
- status = ocfs2_validate_inode_block(osb->sb, bh);
+ if (!status && !buffer_jbd(bh)) {
+ if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK)
+ status = ocfs2_filecheck_validate_inode_block(
+ osb->sb, bh);
+ else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX)
+ status = ocfs2_filecheck_repair_inode_block(
+ osb->sb, bh);
+ else
+ status = ocfs2_validate_inode_block(
+ osb->sb, bh);
+ }
}
if (status < 0) {
mlog_errno(status);
@@ -531,6 +561,14 @@ static int ocfs2_read_locked_inode(struct inode *inode,
BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
+ if (buffer_dirty(bh)) {
+ status = ocfs2_write_block(osb, bh, INODE_CACHE(inode));
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
status = 0;
bail:
@@ -971,6 +1009,7 @@ static void ocfs2_delete_inode(struct inode *inode)
int wipe, status;
sigset_t oldset;
struct buffer_head *di_bh = NULL;
+ struct ocfs2_dinode *di = NULL;
trace_ocfs2_delete_inode(inode->i_ino,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -1025,6 +1064,14 @@ static void ocfs2_delete_inode(struct inode *inode)
goto bail_unlock_nfs_sync;
}
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+ /* Skip inode deletion and wait for dio orphan entry recovered
+ * first */
+ if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
+ ocfs2_cleanup_delete_inode(inode, 0);
+ goto bail_unlock_inode;
+ }
+
/* Query the cluster. This will be the final decision made
* before we go ahead and wipe the inode. */
status = ocfs2_query_inode_wipe(inode, di_bh, &wipe);
@@ -1191,17 +1238,19 @@ void ocfs2_evict_inode(struct inode *inode)
int ocfs2_drop_inode(struct inode *inode)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- int res;
trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno,
inode->i_nlink, oi->ip_flags);
- if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
- res = 1;
- else
- res = generic_drop_inode(inode);
+ assert_spin_locked(&inode->i_lock);
+ inode->i_state |= I_WILL_FREE;
+ spin_unlock(&inode->i_lock);
+ write_inode_now(inode, 1);
+ spin_lock(&inode->i_lock);
+ WARN_ON(inode->i_state & I_NEW);
+ inode->i_state &= ~I_WILL_FREE;
- return res;
+ return 1;
}
/*
@@ -1350,32 +1399,32 @@ int ocfs2_validate_inode_block(struct super_block *sb,
rc = -EINVAL;
if (!OCFS2_IS_VALID_DINODE(di)) {
- ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n",
- (unsigned long long)bh->b_blocknr, 7,
- di->i_signature);
+ rc = ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ di->i_signature);
goto bail;
}
if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
- ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)le64_to_cpu(di->i_blkno));
+ rc = ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(di->i_blkno));
goto bail;
}
if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
- ocfs2_error(sb,
- "Invalid dinode #%llu: OCFS2_VALID_FL not set\n",
- (unsigned long long)bh->b_blocknr);
+ rc = ocfs2_error(sb,
+ "Invalid dinode #%llu: OCFS2_VALID_FL not set\n",
+ (unsigned long long)bh->b_blocknr);
goto bail;
}
if (le32_to_cpu(di->i_fs_generation) !=
OCFS2_SB(sb)->fs_generation) {
- ocfs2_error(sb,
- "Invalid dinode #%llu: fs_generation is %u\n",
- (unsigned long long)bh->b_blocknr,
- le32_to_cpu(di->i_fs_generation));
+ rc = ocfs2_error(sb,
+ "Invalid dinode #%llu: fs_generation is %u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(di->i_fs_generation));
goto bail;
}
@@ -1385,6 +1434,152 @@ bail:
return rc;
}
+/*
+ * Filecheck flavour of inode block validation.
+ *
+ * The checks run in a fixed order (signature, metadata checksum, block
+ * number, valid flag, fs generation) and stop at the first failure.  Each
+ * failure is logged with mlog(ML_ERROR) and reported as the matching
+ * negated OCFS2_FILECHECK_ERR_* code.  Returns 0 when every check passes.
+ * The buffer must already be uptodate.
+ */
+static int ocfs2_filecheck_validate_inode_block(struct super_block *sb,
+						struct buffer_head *bh)
+{
+	struct ocfs2_dinode *dinode = (struct ocfs2_dinode *)bh->b_data;
+	unsigned long long blkno = (unsigned long long)bh->b_blocknr;
+
+	trace_ocfs2_filecheck_validate_inode_block(blkno);
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	/* Nothing else is trustworthy if the signature is wrong. */
+	if (!OCFS2_IS_VALID_DINODE(dinode)) {
+		mlog(ML_ERROR,
+		     "Filecheck: invalid dinode #%llu: signature = %.*s\n",
+		     blkno, 7, dinode->i_signature);
+		return -OCFS2_FILECHECK_ERR_INVALIDINO;
+	}
+
+	if (ocfs2_validate_meta_ecc(sb, bh->b_data, &dinode->i_check)) {
+		mlog(ML_ERROR,
+		     "Filecheck: checksum failed for dinode %llu\n",
+		     blkno);
+		return -OCFS2_FILECHECK_ERR_BLOCKECC;
+	}
+
+	/* The block number recorded in the dinode must match its location. */
+	if (le64_to_cpu(dinode->i_blkno) != bh->b_blocknr) {
+		mlog(ML_ERROR,
+		     "Filecheck: invalid dinode #%llu: i_blkno is %llu\n",
+		     blkno,
+		     (unsigned long long)le64_to_cpu(dinode->i_blkno));
+		return -OCFS2_FILECHECK_ERR_BLOCKNO;
+	}
+
+	if (!(dinode->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
+		mlog(ML_ERROR,
+		     "Filecheck: invalid dinode #%llu: OCFS2_VALID_FL not set\n",
+		     blkno);
+		return -OCFS2_FILECHECK_ERR_VALIDFLAG;
+	}
+
+	if (le32_to_cpu(dinode->i_fs_generation) !=
+	    OCFS2_SB(sb)->fs_generation) {
+		mlog(ML_ERROR,
+		     "Filecheck: invalid dinode #%llu: fs_generation is %u\n",
+		     blkno,
+		     le32_to_cpu(dinode->i_fs_generation));
+		return -OCFS2_FILECHECK_ERR_GENERATION;
+	}
+
+	return 0;
+}
+
+/*
+ * Try to repair an inode block that failed filecheck validation.
+ *
+ * The block is validated first.  If it is already valid (0), or its
+ * signature is wrong (-OCFS2_FILECHECK_ERR_INVALIDINO), that result is
+ * returned unchanged and nothing is modified.  On a hard or soft read-only
+ * filesystem the repair is refused with -OCFS2_FILECHECK_ERR_READONLY.
+ * Otherwise i_blkno, OCFS2_VALID_FL and i_fs_generation are reset in the
+ * buffer where they are wrong, the metadata ecc is recomputed when needed,
+ * the buffer is marked dirty, and 0 is returned.
+ *
+ * This function only dirties the buffer; ocfs2_read_locked_inode() writes
+ * a dirty buffer out with ocfs2_write_block().
+ */
+static int ocfs2_filecheck_repair_inode_block(struct super_block *sb,
+ struct buffer_head *bh)
+{
+ int rc;
+ int changed = 0;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
+
+ rc = ocfs2_filecheck_validate_inode_block(sb, bh);
+ /* Nothing to fix if valid; can't fix an invalid (bad signature) block */
+ if (!rc || rc == -OCFS2_FILECHECK_ERR_INVALIDINO)
+ return rc;
+
+ trace_ocfs2_filecheck_repair_inode_block(
+ (unsigned long long)bh->b_blocknr);
+
+ /* Repairing modifies the block, so refuse on a read-only filesystem */
+ if (ocfs2_is_hard_readonly(OCFS2_SB(sb)) ||
+ ocfs2_is_soft_readonly(OCFS2_SB(sb))) {
+ mlog(ML_ERROR,
+ "Filecheck: try to repair dinode #%llu on readonly filesystem\n",
+ (unsigned long long)bh->b_blocknr);
+ return -OCFS2_FILECHECK_ERR_READONLY;
+ }
+
+ /* Each fix below is logged after the field has been rewritten */
+ if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
+ di->i_blkno = cpu_to_le64(bh->b_blocknr);
+ changed = 1;
+ mlog(ML_ERROR,
+ "Filecheck: reset dinode #%llu: i_blkno to %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(di->i_blkno));
+ }
+
+ if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
+ di->i_flags |= cpu_to_le32(OCFS2_VALID_FL);
+ changed = 1;
+ mlog(ML_ERROR,
+ "Filecheck: reset dinode #%llu: OCFS2_VALID_FL is set\n",
+ (unsigned long long)bh->b_blocknr);
+ }
+
+ if (le32_to_cpu(di->i_fs_generation) !=
+ OCFS2_SB(sb)->fs_generation) {
+ di->i_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
+ changed = 1;
+ mlog(ML_ERROR,
+ "Filecheck: reset dinode #%llu: fs_generation to %u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(di->i_fs_generation));
+ }
+
+ /*
+ * Recompute the ecc if a field was rewritten above, or if the stored
+ * ecc does not validate; the ecc check is skipped when changed is set.
+ */
+ if (changed ||
+ ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check)) {
+ ocfs2_compute_meta_ecc(sb, bh->b_data, &di->i_check);
+ mark_buffer_dirty(bh);
+ mlog(ML_ERROR,
+ "Filecheck: reset dinode #%llu: compute meta ecc\n",
+ (unsigned long long)bh->b_blocknr);
+ }
+
+ return 0;
+}
+
+/*
+ * Read the inode's own block (OCFS2_I(inode)->ip_blkno) through
+ * ocfs2_read_blocks(), passing the filecheck validator as the per-block
+ * callback when @type is 0 and the filecheck repair routine otherwise.
+ *
+ * If *@bh is NULL on entry and the read succeeds, the buffer obtained by
+ * ocfs2_read_blocks() is stored in *@bh.  Returns the status of
+ * ocfs2_read_blocks().
+ */
+static int
+ocfs2_filecheck_read_inode_block_full(struct inode *inode,
+		struct buffer_head **bh, int flags, int type)
+{
+	int status;
+	struct buffer_head *found = *bh;
+	int (*check)(struct super_block *sb, struct buffer_head *blk);
+
+	/* type == 0: check only; anything else: check and repair */
+	check = type ? ocfs2_filecheck_repair_inode_block :
+		       ocfs2_filecheck_validate_inode_block;
+
+	status = ocfs2_read_blocks(INODE_CACHE(inode),
+				   OCFS2_I(inode)->ip_blkno,
+				   1, &found, flags, check);
+
+	/* If ocfs2_read_blocks() got us a new bh, pass it up. */
+	if (status == 0 && *bh == NULL)
+		*bh = found;
+
+	return status;
+}
+
int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
int flags)
{
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 5e86b247c821..2152a72123bc 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -81,8 +81,6 @@ struct ocfs2_inode_info
tid_t i_sync_tid;
tid_t i_datasync_tid;
- wait_queue_head_t append_dio_wq;
-
struct dquot *i_dquot[MAXQUOTAS];
};
@@ -139,6 +137,9 @@ int ocfs2_drop_inode(struct inode *inode);
/* Flags for ocfs2_iget() */
#define OCFS2_FI_FLAG_SYSFILE 0x1
#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2
+#define OCFS2_FI_FLAG_FILECHECK_CHK 0x4
+#define OCFS2_FI_FLAG_FILECHECK_FIX 0x8
+
struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff);
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
int sysfile_type);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 7c099f7032fd..ff82b28462a6 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -374,7 +374,7 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
mlog_errno(PTR_ERR(handle));
if (is_journal_aborted(journal)) {
- ocfs2_abort(osb->sb, "Detected aborted journal");
+ ocfs2_abort(osb->sb, "Detected aborted journal\n");
handle = ERR_PTR(-EROFS);
}
} else {
@@ -668,7 +668,23 @@ static int __ocfs2_journal_access(handle_t *handle,
mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n");
mlog(ML_ERROR, "b_blocknr=%llu\n",
(unsigned long long)bh->b_blocknr);
- BUG();
+
+ lock_buffer(bh);
+ /*
+ * A previous attempt to write this buffer head failed.
+ * Nothing we can do but to retry the write and hope for
+ * the best.
+ */
+ if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) {
+ clear_buffer_write_io_error(bh);
+ set_buffer_uptodate(bh);
+ }
+
+ if (!buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ return -EIO;
+ }
+ unlock_buffer(bh);
}
/* Set the current transaction information on the ci so
@@ -2170,6 +2186,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
iter = oi->ip_next_orphan;
oi->ip_next_orphan = NULL;
+ mutex_lock(&inode->i_mutex);
ret = ocfs2_rw_lock(inode, 1);
if (ret < 0) {
mlog_errno(ret);
@@ -2193,7 +2210,9 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
* ocfs2_delete_inode. */
oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
spin_unlock(&oi->ip_lock);
- } else if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) &&
+ }
+
+ if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) &&
(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
ret = ocfs2_truncate_file(inode, di_bh,
i_size_read(inode));
@@ -2206,17 +2225,16 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0);
if (ret)
mlog_errno(ret);
-
- wake_up(&OCFS2_I(inode)->append_dio_wq);
} /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */
unlock_inode:
ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+ di_bh = NULL;
unlock_rw:
ocfs2_rw_unlock(inode, 1);
next:
+ mutex_unlock(&inode->i_mutex);
iput(inode);
- brelse(di_bh);
- di_bh = NULL;
inode = iter;
}
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 857bbbcd39f3..0a4457fb0711 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -665,8 +665,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
#ifdef CONFIG_OCFS2_DEBUG_FS
if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
ocfs2_local_alloc_count_bits(alloc)) {
- ocfs2_error(osb->sb, "local alloc inode %llu says it has "
- "%u used bits, but a count shows %u",
+ ocfs2_error(osb->sb, "local alloc inode %llu says it has %u used bits, but a count shows %u\n",
(unsigned long long)le64_to_cpu(alloc->i_blkno),
le32_to_cpu(alloc->id1.bitmap1.i_used),
ocfs2_local_alloc_count_bits(alloc));
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 56a768d06aa6..124471d26a73 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -99,11 +99,9 @@ static int __ocfs2_move_extent(handle_t *handle,
index = ocfs2_search_extent_list(el, cpos);
if (index == -1) {
- ocfs2_error(inode->i_sb,
- "Inode %llu has an extent at cpos %u which can no "
- "longer be found.\n",
- (unsigned long long)ino, cpos);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %llu has an extent at cpos %u which can no longer be found\n",
+ (unsigned long long)ino, cpos);
goto out;
}
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 948681e37cfd..5de241708c87 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1309,6 +1309,15 @@ static int ocfs2_rename(struct inode *old_dir,
}
parents_locked = 1;
+ if (!new_dir->i_nlink) {
+ mlog(ML_ERROR, "new dir %llu has been removed, inode %llu "
+ "can not be moved into it.",
+ (unsigned long long)new_dir->i_ino,
+ (unsigned long long)old_inode->i_ino);
+ status = -EACCES;
+ goto bail;
+ }
+
/* make sure both dirs have bhs
* get an extra ref on old_dir_bh if old==new */
if (!new_dir_bh) {
@@ -1569,12 +1578,25 @@ static int ocfs2_rename(struct inode *old_dir,
status = ocfs2_find_entry(old_dentry->d_name.name,
old_dentry->d_name.len, old_dir,
&old_entry_lookup);
- if (status)
+ if (status) {
+ if (!is_journal_aborted(osb->journal->j_journal)) {
+ ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s "
+ "is not deleted.",
+ new_dentry->d_name.len, new_dentry->d_name.name,
+ old_dentry->d_name.len, old_dentry->d_name.name);
+ }
goto bail;
+ }
status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup);
if (status < 0) {
mlog_errno(status);
+ if (!is_journal_aborted(osb->journal->j_journal)) {
+ ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s "
+ "is not deleted.",
+ new_dentry->d_name.len, new_dentry->d_name.name,
+ old_dentry->d_name.len, old_dentry->d_name.name);
+ }
goto bail;
}
@@ -2601,27 +2623,6 @@ leave:
return status;
}
-static int ocfs2_dio_orphan_recovered(struct inode *inode)
-{
- int ret;
- struct buffer_head *di_bh = NULL;
- struct ocfs2_dinode *di = NULL;
-
- ret = ocfs2_inode_lock(inode, &di_bh, 1);
- if (ret < 0) {
- mlog_errno(ret);
- return 0;
- }
-
- di = (struct ocfs2_dinode *) di_bh->b_data;
- ret = !(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL));
- ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
-
- return ret;
-}
-
-#define OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL 10000
int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
struct inode *inode)
{
@@ -2633,7 +2634,6 @@ int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
handle_t *handle = NULL;
struct ocfs2_dinode *di = NULL;
-restart:
status = ocfs2_inode_lock(inode, &di_bh, 1);
if (status < 0) {
mlog_errno(status);
@@ -2643,15 +2643,21 @@ restart:
di = (struct ocfs2_dinode *) di_bh->b_data;
/*
* Another append dio crashed?
- * If so, wait for recovery first.
+ * If so, manually recover it first.
*/
if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
- ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
- wait_event_interruptible_timeout(OCFS2_I(inode)->append_dio_wq,
- ocfs2_dio_orphan_recovered(inode),
- msecs_to_jiffies(OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL));
- goto restart;
+ status = ocfs2_truncate_file(inode, di_bh, i_size_read(inode));
+ if (status < 0) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto bail_unlock_inode;
+ }
+
+ status = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail_unlock_inode;
+ }
}
status = ocfs2_prepare_orphan_dir(osb, &orphan_dir_inode,
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 690ddc60189b..7a0126267847 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -286,6 +286,8 @@ enum ocfs2_mount_options
OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */
+ OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */
+ OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */
};
#define OCFS2_OSB_SOFT_RO 0x0001
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 6cb019b7c6a8..d9205e07aaef 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -1540,6 +1540,8 @@ DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_read_locked_inode);
DEFINE_OCFS2_INT_INT_EVENT(ocfs2_check_orphan_recovery_state);
DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_inode_block);
+DEFINE_OCFS2_ULL_EVENT(ocfs2_filecheck_validate_inode_block);
+DEFINE_OCFS2_ULL_EVENT(ocfs2_filecheck_repair_inode_block);
TRACE_EVENT(ocfs2_inode_is_valid_to_delete,
TP_PROTO(void *task, void *dc_task, unsigned long long ino,
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index bb07004df72a..8a54fd8a4fa5 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -138,8 +138,7 @@ static int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) {
ocfs2_error(inode->i_sb,
- "Quota file %llu is probably corrupted! Requested "
- "to read block %Lu but file has size only %Lu\n",
+ "Quota file %llu is probably corrupted! Requested to read block %Lu but file has size only %Lu\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)v_block,
(unsigned long long)i_size_read(inode));
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 7dc818b87cd8..e5d57cd32505 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -102,32 +102,30 @@ static int ocfs2_validate_refcount_block(struct super_block *sb,
if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) {
- ocfs2_error(sb,
- "Refcount block #%llu has bad signature %.*s",
- (unsigned long long)bh->b_blocknr, 7,
- rb->rf_signature);
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Refcount block #%llu has bad signature %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ rb->rf_signature);
+ goto out;
}
if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) {
- ocfs2_error(sb,
- "Refcount block #%llu has an invalid rf_blkno "
- "of %llu",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)le64_to_cpu(rb->rf_blkno));
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Refcount block #%llu has an invalid rf_blkno of %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(rb->rf_blkno));
+ goto out;
}
if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) {
- ocfs2_error(sb,
- "Refcount block #%llu has an invalid "
- "rf_fs_generation of #%u",
- (unsigned long long)bh->b_blocknr,
- le32_to_cpu(rb->rf_fs_generation));
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Refcount block #%llu has an invalid rf_fs_generation of #%u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(rb->rf_fs_generation));
+ goto out;
}
-
- return 0;
+out:
+ return rc;
}
static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci,
@@ -1102,12 +1100,10 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
el = &eb->h_list;
if (el->l_tree_depth) {
- ocfs2_error(sb,
- "refcount tree %llu has non zero tree "
- "depth in leaf btree tree block %llu\n",
- (unsigned long long)ocfs2_metadata_cache_owner(ci),
- (unsigned long long)eb_bh->b_blocknr);
- ret = -EROFS;
+ ret = ocfs2_error(sb,
+ "refcount tree %llu has non zero tree depth in leaf btree tree block %llu\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)eb_bh->b_blocknr);
goto out;
}
}
@@ -2359,10 +2355,8 @@ static int ocfs2_mark_extent_refcounted(struct inode *inode,
cpos, len, phys);
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
- ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
- "tree, but the feature bit is not set in the "
- "super block.", inode->i_ino);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
+ inode->i_ino);
goto out;
}
@@ -2545,10 +2539,8 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno);
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
- ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
- "tree, but the feature bit is not set in the "
- "super block.", inode->i_ino);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
+ inode->i_ino);
goto out;
}
@@ -2672,11 +2664,10 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
el = &eb->h_list;
if (el->l_tree_depth) {
- ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "leaf block %llu\n", inode->i_ino,
- (unsigned long long)eb_bh->b_blocknr);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %lu has non zero tree depth in leaf block %llu\n",
+ inode->i_ino,
+ (unsigned long long)eb_bh->b_blocknr);
goto out;
}
}
@@ -3106,11 +3097,9 @@ static int ocfs2_clear_ext_refcount(handle_t *handle,
index = ocfs2_search_extent_list(el, cpos);
if (index == -1) {
- ocfs2_error(sb,
- "Inode %llu has an extent at cpos %u which can no "
- "longer be found.\n",
- (unsigned long long)ino, cpos);
- ret = -EROFS;
+ ret = ocfs2_error(sb,
+ "Inode %llu has an extent at cpos %u which can no longer be found\n",
+ (unsigned long long)ino, cpos);
goto out;
}
@@ -3376,10 +3365,8 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
- ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
- "tree, but the feature bit is not set in the "
- "super block.", inode->i_ino);
- return -EROFS;
+ return ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
+ inode->i_ino);
}
ocfs2_init_dealloc_ctxt(&context->dealloc);
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 2768eb1da2b8..eaabdd259f3b 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -368,9 +368,9 @@ static int ocfs2_control_get_this_node(void)
static int ocfs2_control_do_setnode_msg(struct file *file,
struct ocfs2_control_message_setn *msg)
{
- long nodenum;
- char *ptr = NULL;
struct ocfs2_control_private *p = file->private_data;
+ int nodenum;
+ int rv;
if (ocfs2_control_get_handshake_state(file) !=
OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
@@ -384,12 +384,12 @@ static int ocfs2_control_do_setnode_msg(struct file *file,
return -EINVAL;
msg->space = msg->newline = '\0';
- nodenum = simple_strtol(msg->nodestr, &ptr, 16);
- if (!ptr || *ptr)
+ rv = parse_integer(msg->nodestr, 16, &nodenum);
+ if (rv < 0)
+ return rv;
+ if (msg->nodestr[rv])
return -EINVAL;
-
- if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) ||
- (nodenum > INT_MAX) || (nodenum < 0))
+ if (nodenum < 0)
return -ERANGE;
p->op_this_node = nodenum;
@@ -399,11 +399,11 @@ static int ocfs2_control_do_setnode_msg(struct file *file,
static int ocfs2_control_do_setversion_msg(struct file *file,
struct ocfs2_control_message_setv *msg)
{
- long major, minor;
- char *ptr = NULL;
+ u8 major, minor;
struct ocfs2_control_private *p = file->private_data;
struct ocfs2_protocol_version *max =
&ocfs2_user_plugin.sp_max_proto;
+ int rv;
if (ocfs2_control_get_handshake_state(file) !=
OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
@@ -418,11 +418,15 @@ static int ocfs2_control_do_setversion_msg(struct file *file,
return -EINVAL;
msg->space1 = msg->space2 = msg->newline = '\0';
- major = simple_strtol(msg->major, &ptr, 16);
- if (!ptr || *ptr)
+ rv = parse_integer(msg->major, 16, &major);
+ if (rv < 0)
+ return rv;
+ if (msg->major[rv])
return -EINVAL;
- minor = simple_strtol(msg->minor, &ptr, 16);
- if (!ptr || *ptr)
+ rv = parse_integer(msg->minor, 16, &minor);
+ if (rv < 0)
+ return rv;
+ if (msg->minor[rv])
return -EINVAL;
/*
@@ -430,11 +434,7 @@ static int ocfs2_control_do_setversion_msg(struct file *file,
* must be between 0 and 255, inclusive. The version passed in
* must be within the maximum version supported by the filesystem.
*/
- if ((major == LONG_MIN) || (major == LONG_MAX) ||
- (major > (u8)-1) || (major < 1))
- return -ERANGE;
- if ((minor == LONG_MIN) || (minor == LONG_MAX) ||
- (minor > (u8)-1) || (minor < 0))
+ if (major < 1)
return -ERANGE;
if ((major != max->pv_major) ||
(minor > max->pv_minor))
@@ -449,8 +449,8 @@ static int ocfs2_control_do_setversion_msg(struct file *file,
static int ocfs2_control_do_down_msg(struct file *file,
struct ocfs2_control_message_down *msg)
{
- long nodenum;
- char *p = NULL;
+ int nodenum;
+ int rv;
if (ocfs2_control_get_handshake_state(file) !=
OCFS2_CONTROL_HANDSHAKE_VALID)
@@ -465,12 +465,12 @@ static int ocfs2_control_do_down_msg(struct file *file,
return -EINVAL;
msg->space1 = msg->space2 = msg->newline = '\0';
- nodenum = simple_strtol(msg->nodestr, &p, 16);
- if (!p || *p)
+ rv = parse_integer(msg->nodestr, 16, &nodenum);
+ if (rv < 0)
+ return rv;
+ if (msg->nodestr[rv])
return -EINVAL;
-
- if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) ||
- (nodenum > INT_MAX) || (nodenum < 0))
+ if (nodenum < 0)
return -ERANGE;
ocfs2_control_send_down(msg->uuid, nodenum);
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 5d965e83bd43..13219ed73e1d 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -629,7 +629,8 @@ static struct attribute_group ocfs2_attr_group = {
.attrs = ocfs2_attrs,
};
-static struct kset *ocfs2_kset;
+struct kset *ocfs2_kset;
+EXPORT_SYMBOL_GPL(ocfs2_kset);
static void ocfs2_sysfs_exit(void)
{
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 66334a30cea8..f2dce10fae54 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -298,4 +298,6 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p
int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
+extern struct kset *ocfs2_kset;
+
#endif /* STACKGLUE_H */
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 4479029630bb..0456ae399bf7 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -167,12 +167,12 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
}
#define do_error(fmt, ...) \
- do{ \
- if (resize) \
- mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \
- else \
- ocfs2_error(sb, fmt, ##__VA_ARGS__); \
- } while (0)
+do { \
+ if (resize) \
+ mlog(ML_ERROR, fmt, ##__VA_ARGS__); \
+ else \
+ return ocfs2_error(sb, fmt, ##__VA_ARGS__); \
+} while (0)
static int ocfs2_validate_gd_self(struct super_block *sb,
struct buffer_head *bh,
@@ -181,44 +181,35 @@ static int ocfs2_validate_gd_self(struct super_block *sb,
struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
- do_error("Group descriptor #%llu has bad signature %.*s",
+ do_error("Group descriptor #%llu has bad signature %.*s\n",
(unsigned long long)bh->b_blocknr, 7,
gd->bg_signature);
- return -EINVAL;
}
if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
- do_error("Group descriptor #%llu has an invalid bg_blkno "
- "of %llu",
+ do_error("Group descriptor #%llu has an invalid bg_blkno of %llu\n",
(unsigned long long)bh->b_blocknr,
(unsigned long long)le64_to_cpu(gd->bg_blkno));
- return -EINVAL;
}
if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
- do_error("Group descriptor #%llu has an invalid "
- "fs_generation of #%u",
+ do_error("Group descriptor #%llu has an invalid fs_generation of #%u\n",
(unsigned long long)bh->b_blocknr,
le32_to_cpu(gd->bg_generation));
- return -EINVAL;
}
if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
- do_error("Group descriptor #%llu has bit count %u but "
- "claims that %u are free",
+ do_error("Group descriptor #%llu has bit count %u but claims that %u are free\n",
(unsigned long long)bh->b_blocknr,
le16_to_cpu(gd->bg_bits),
le16_to_cpu(gd->bg_free_bits_count));
- return -EINVAL;
}
if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
- do_error("Group descriptor #%llu has bit count %u but "
- "max bitmap bits of %u",
+ do_error("Group descriptor #%llu has bit count %u but max bitmap bits of %u\n",
(unsigned long long)bh->b_blocknr,
le16_to_cpu(gd->bg_bits),
8 * le16_to_cpu(gd->bg_size));
- return -EINVAL;
}
return 0;
@@ -233,20 +224,17 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
if (di->i_blkno != gd->bg_parent_dinode) {
- do_error("Group descriptor #%llu has bad parent "
- "pointer (%llu, expected %llu)",
+ do_error("Group descriptor #%llu has bad parent pointer (%llu, expected %llu)\n",
(unsigned long long)bh->b_blocknr,
(unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
(unsigned long long)le64_to_cpu(di->i_blkno));
- return -EINVAL;
}
max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
if (le16_to_cpu(gd->bg_bits) > max_bits) {
- do_error("Group descriptor #%llu has bit count of %u",
+ do_error("Group descriptor #%llu has bit count of %u\n",
(unsigned long long)bh->b_blocknr,
le16_to_cpu(gd->bg_bits));
- return -EINVAL;
}
/* In resize, we may meet the case bg_chain == cl_next_free_rec. */
@@ -254,10 +242,9 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
((le16_to_cpu(gd->bg_chain) ==
le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
- do_error("Group descriptor #%llu has bad chain %u",
+ do_error("Group descriptor #%llu has bad chain %u\n",
(unsigned long long)bh->b_blocknr,
le16_to_cpu(gd->bg_chain));
- return -EINVAL;
}
return 0;
@@ -384,11 +371,10 @@ static int ocfs2_block_group_fill(handle_t *handle,
struct super_block * sb = alloc_inode->i_sb;
if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
- ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
- "b_blocknr (%llu)",
- (unsigned long long)group_blkno,
- (unsigned long long) bg_bh->b_blocknr);
- status = -EIO;
+ status = ocfs2_error(alloc_inode->i_sb,
+ "group block (%llu) != b_blocknr (%llu)\n",
+ (unsigned long long)group_blkno,
+ (unsigned long long) bg_bh->b_blocknr);
goto bail;
}
@@ -834,9 +820,9 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
- ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
- (unsigned long long)le64_to_cpu(fe->i_blkno));
- status = -EIO;
+ status = ocfs2_error(alloc_inode->i_sb,
+ "Invalid chain allocator %llu\n",
+ (unsigned long long)le64_to_cpu(fe->i_blkno));
goto bail;
}
@@ -1370,12 +1356,11 @@ int ocfs2_block_group_set_bits(handle_t *handle,
le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
- ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
- " count %u but claims %u are freed. num_bits %d",
- (unsigned long long)le64_to_cpu(bg->bg_blkno),
- le16_to_cpu(bg->bg_bits),
- le16_to_cpu(bg->bg_free_bits_count), num_bits);
- return -EROFS;
+ return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
+ (unsigned long long)le64_to_cpu(bg->bg_blkno),
+ le16_to_cpu(bg->bg_bits),
+ le16_to_cpu(bg->bg_free_bits_count),
+ num_bits);
}
while(num_bits--)
ocfs2_set_bit(bit_off++, bitmap);
@@ -1905,13 +1890,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
le32_to_cpu(fe->id1.bitmap1.i_total)) {
- ocfs2_error(ac->ac_inode->i_sb,
- "Chain allocator dinode %llu has %u used "
- "bits but only %u total.",
- (unsigned long long)le64_to_cpu(fe->i_blkno),
- le32_to_cpu(fe->id1.bitmap1.i_used),
- le32_to_cpu(fe->id1.bitmap1.i_total));
- status = -EIO;
+ status = ocfs2_error(ac->ac_inode->i_sb,
+ "Chain allocator dinode %llu has %u used bits but only %u total\n",
+ (unsigned long long)le64_to_cpu(fe->i_blkno),
+ le32_to_cpu(fe->id1.bitmap1.i_used),
+ le32_to_cpu(fe->id1.bitmap1.i_total));
goto bail;
}
@@ -2429,12 +2412,11 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
}
le16_add_cpu(&bg->bg_free_bits_count, num_bits);
if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
- ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
- " count %u but claims %u are freed. num_bits %d",
- (unsigned long long)le64_to_cpu(bg->bg_blkno),
- le16_to_cpu(bg->bg_bits),
- le16_to_cpu(bg->bg_free_bits_count), num_bits);
- return -EROFS;
+ return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
+ (unsigned long long)le64_to_cpu(bg->bg_blkno),
+ le16_to_cpu(bg->bg_bits),
+ le16_to_cpu(bg->bg_free_bits_count),
+ num_bits);
}
if (undo_fn)
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 403c5660b306..f56a5458f01e 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -74,6 +74,7 @@
#include "suballoc.h"
#include "buffer_head_io.h"
+#include "filecheck.h"
static struct kmem_cache *ocfs2_inode_cachep;
struct kmem_cache *ocfs2_dquot_cachep;
@@ -192,6 +193,7 @@ enum {
Opt_resv_level,
Opt_dir_resv_level,
Opt_journal_async_commit,
+ Opt_err_cont,
Opt_err,
};
@@ -224,6 +226,7 @@ static const match_table_t tokens = {
{Opt_resv_level, "resv_level=%u"},
{Opt_dir_resv_level, "dir_resv_level=%u"},
{Opt_journal_async_commit, "journal_async_commit"},
+ {Opt_err_cont, "errors=continue"},
{Opt_err, NULL}
};
@@ -1202,6 +1205,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
/* Start this when the mount is almost sure of being successful */
ocfs2_orphan_scan_start(osb);
+ /* Create filecheck sysfile /sys/fs/ocfs2/<devname>/filecheck */
+ ocfs2_filecheck_create_sysfs(sb);
+
return status;
read_super_error:
@@ -1330,10 +1336,19 @@ static int ocfs2_parse_options(struct super_block *sb,
mopt->mount_opt |= OCFS2_MOUNT_NOINTR;
break;
case Opt_err_panic:
+ mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT;
+ mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS;
mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
break;
case Opt_err_ro:
+ mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT;
+ mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
+ mopt->mount_opt |= OCFS2_MOUNT_ERRORS_ROFS;
+ break;
+ case Opt_err_cont:
+ mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS;
mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
+ mopt->mount_opt |= OCFS2_MOUNT_ERRORS_CONT;
break;
case Opt_data_ordered:
mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK;
@@ -1530,6 +1545,8 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
if (opts & OCFS2_MOUNT_ERRORS_PANIC)
seq_printf(s, ",errors=panic");
+ else if (opts & OCFS2_MOUNT_ERRORS_CONT)
+ seq_printf(s, ",errors=continue");
else
seq_printf(s, ",errors=remount-ro");
@@ -1658,6 +1675,7 @@ static void ocfs2_put_super(struct super_block *sb)
ocfs2_sync_blockdev(sb);
ocfs2_dismount_volume(sb, 0);
+ ocfs2_filecheck_remove_sysfs(sb);
}
static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -1746,8 +1764,6 @@ static void ocfs2_inode_init_once(void *data)
ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
ocfs2_lock_res_init_once(&oi->ip_open_lockres);
- init_waitqueue_head(&oi->append_dio_wq);
-
ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode),
&ocfs2_inode_caching_ops);
@@ -2541,31 +2557,43 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
memset(osb, 0, sizeof(struct ocfs2_super));
}
-/* Put OCFS2 into a readonly state, or (if the user specifies it),
- * panic(). We do not support continue-on-error operation. */
-static void ocfs2_handle_error(struct super_block *sb)
+/* Depending on the mount option passed, perform one of the following:
+ * Put OCFS2 into a readonly state and return -EROFS (default)
+ * Return -EIO so that only the calling process errs (errors=continue)
+ * Fix the error as if fsck.ocfs2 -y (not implemented in this function)
+ * panic (errors=panic)
+ */
+static int ocfs2_handle_error(struct super_block *sb)
{
struct ocfs2_super *osb = OCFS2_SB(sb);
-
- if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC)
- panic("OCFS2: (device %s): panic forced after error\n",
- sb->s_id);
+ int rv = 0;
ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS);
+ pr_crit("On-disk corruption discovered. "
+ "Please run fsck.ocfs2 once the filesystem is unmounted.\n");
- if (sb->s_flags & MS_RDONLY &&
- (ocfs2_is_soft_readonly(osb) ||
- ocfs2_is_hard_readonly(osb)))
- return;
-
- printk(KERN_CRIT "File system is now read-only due to the potential "
- "of on-disk corruption. Please run fsck.ocfs2 once the file "
- "system is unmounted.\n");
- sb->s_flags |= MS_RDONLY;
- ocfs2_set_ro_flag(osb, 0);
+ if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) {
+ panic("OCFS2: (device %s): panic forced after error\n",
+ sb->s_id);
+ } else if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_CONT) {
+ pr_crit("OCFS2: Returning error to the calling process.\n");
+ rv = -EIO;
+ } else { /* default option */
+ rv = -EROFS;
+ if (sb->s_flags & MS_RDONLY &&
+ (ocfs2_is_soft_readonly(osb) ||
+ ocfs2_is_hard_readonly(osb)))
+ return rv;
+
+ pr_crit("OCFS2: File system is now read-only.\n");
+ sb->s_flags |= MS_RDONLY;
+ ocfs2_set_ro_flag(osb, 0);
+ }
+
+ return rv;
}
-void __ocfs2_error(struct super_block *sb, const char *function,
+int __ocfs2_error(struct super_block *sb, const char *function,
const char *fmt, ...)
{
struct va_format vaf;
@@ -2577,12 +2605,12 @@ void __ocfs2_error(struct super_block *sb, const char *function,
/* Not using mlog here because we want to show the actual
* function the error came from. */
- printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV\n",
+ printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV",
sb->s_id, function, &vaf);
va_end(args);
- ocfs2_handle_error(sb);
+ return ocfs2_handle_error(sb);
}
/* Handle critical errors. This is intentionally more drastic than
@@ -2599,7 +2627,7 @@ void __ocfs2_abort(struct super_block *sb, const char *function,
vaf.fmt = fmt;
vaf.va = &args;
- printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV\n",
+ printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV",
sb->s_id, function, &vaf);
va_end(args);
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 74ff74cf78fe..b477d0b1c7b6 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -32,16 +32,18 @@ int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
int node_num);
__printf(3, 4)
-void __ocfs2_error(struct super_block *sb, const char *function,
+int __ocfs2_error(struct super_block *sb, const char *function,
const char *fmt, ...);
-#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args)
+#define ocfs2_error(sb, fmt, ...) \
+ __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__)
__printf(3, 4)
void __ocfs2_abort(struct super_block *sb, const char *function,
const char *fmt, ...);
-#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
+#define ocfs2_abort(sb, fmt, ...) \
+ __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__)
/*
* Void signal blockers, because in-kernel sigprocmask() only fails
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 889f3796a0d7..ebfdea78659b 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -499,30 +499,24 @@ static int ocfs2_validate_xattr_block(struct super_block *sb,
*/
if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
- ocfs2_error(sb,
- "Extended attribute block #%llu has bad "
- "signature %.*s",
- (unsigned long long)bh->b_blocknr, 7,
- xb->xb_signature);
- return -EINVAL;
+ return ocfs2_error(sb,
+ "Extended attribute block #%llu has bad signature %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ xb->xb_signature);
}
if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) {
- ocfs2_error(sb,
- "Extended attribute block #%llu has an "
- "invalid xb_blkno of %llu",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)le64_to_cpu(xb->xb_blkno));
- return -EINVAL;
+ return ocfs2_error(sb,
+ "Extended attribute block #%llu has an invalid xb_blkno of %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(xb->xb_blkno));
}
if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) {
- ocfs2_error(sb,
- "Extended attribute block #%llu has an invalid "
- "xb_fs_generation of #%u",
- (unsigned long long)bh->b_blocknr,
- le32_to_cpu(xb->xb_fs_generation));
- return -EINVAL;
+ return ocfs2_error(sb,
+ "Extended attribute block #%llu has an invalid xb_fs_generation of #%u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(xb->xb_fs_generation));
}
return 0;
@@ -3694,11 +3688,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode,
el = &eb->h_list;
if (el->l_tree_depth) {
- ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "xattr tree block %llu\n", inode->i_ino,
- (unsigned long long)eb_bh->b_blocknr);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %lu has non zero tree depth in xattr tree block %llu\n",
+ inode->i_ino,
+ (unsigned long long)eb_bh->b_blocknr);
goto out;
}
}
@@ -3713,11 +3706,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode,
}
if (!e_blkno) {
- ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
- "record (%u, %u, 0) in xattr", inode->i_ino,
- le32_to_cpu(rec->e_cpos),
- ocfs2_rec_clusters(el, rec));
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb, "Inode %lu has bad extent record (%u, %u, 0) in xattr\n",
+ inode->i_ino,
+ le32_to_cpu(rec->e_cpos),
+ ocfs2_rec_clusters(el, rec));
goto out;
}
@@ -7334,6 +7326,9 @@ static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list,
const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
+ if (!capable(CAP_SYS_ADMIN))
+ return 0;
+
if (list && total_len <= list_size) {
memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
memcpy(list + prefix_len, name, name_len);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index ce065cf3104f..f60f0121e331 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -308,7 +308,8 @@ static void render_cap_t(struct seq_file *m, const char *header,
static inline void task_cap(struct seq_file *m, struct task_struct *p)
{
const struct cred *cred;
- kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset;
+ kernel_cap_t cap_inheritable, cap_permitted, cap_effective,
+ cap_bset, cap_ambient;
rcu_read_lock();
cred = __task_cred(p);
@@ -316,12 +317,14 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p)
cap_permitted = cred->cap_permitted;
cap_effective = cred->cap_effective;
cap_bset = cred->cap_bset;
+ cap_ambient = cred->cap_ambient;
rcu_read_unlock();
render_cap_t(m, "CapInh:\t", &cap_inheritable);
render_cap_t(m, "CapPrm:\t", &cap_permitted);
render_cap_t(m, "CapEff:\t", &cap_effective);
render_cap_t(m, "CapBnd:\t", &cap_bset);
+ render_cap_t(m, "CapAmb:\t", &cap_ambient);
}
static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index aa50d1ac28fc..60c71b10eaee 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1836,8 +1836,6 @@ end_instantiate:
return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
}
-#ifdef CONFIG_CHECKPOINT_RESTORE
-
/*
* dname_to_vma_addr - maps a dentry name into two unsigned longs
* which represent vma start and end addresses.
@@ -1864,11 +1862,6 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
if (flags & LOOKUP_RCU)
return -ECHILD;
- if (!capable(CAP_SYS_ADMIN)) {
- status = -EPERM;
- goto out_notask;
- }
-
inode = d_inode(dentry);
task = get_proc_task(inode);
if (!task)
@@ -1957,6 +1950,29 @@ struct map_files_info {
unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
};
+/*
+ * Only allow CAP_SYS_ADMIN to follow the links, due to concerns about how the
+ * symlinks may be used to bypass permissions on ancestor directories in the
+ * path to the file in question.
+ */
+static const char *
+proc_map_files_follow_link(struct dentry *dentry, void **cookie)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ return proc_pid_follow_link(dentry, NULL);
+}
+
+/*
+ * Identical to proc_pid_link_inode_operations except for follow_link()
+ */
+static const struct inode_operations proc_map_files_link_inode_operations = {
+ .readlink = proc_pid_readlink,
+ .follow_link = proc_map_files_follow_link,
+ .setattr = proc_setattr,
+};
+
static int
proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
struct task_struct *task, const void *ptr)
@@ -1972,7 +1988,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
ei = PROC_I(inode);
ei->op.proc_get_link = proc_map_files_get_link;
- inode->i_op = &proc_pid_link_inode_operations;
+ inode->i_op = &proc_map_files_link_inode_operations;
inode->i_size = 64;
inode->i_mode = S_IFLNK;
@@ -1996,10 +2012,6 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
int result;
struct mm_struct *mm;
- result = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
- goto out;
-
result = -ENOENT;
task = get_proc_task(dir);
if (!task)
@@ -2053,10 +2065,6 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
struct map_files_info *p;
int ret;
- ret = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
- goto out;
-
ret = -ENOENT;
task = get_proc_task(file_inode(file));
if (!task)
@@ -2245,7 +2253,6 @@ static const struct file_operations proc_timers_operations = {
.llseek = seq_lseek,
.release = seq_release_private,
};
-#endif /* CONFIG_CHECKPOINT_RESTORE */
static int proc_pident_instantiate(struct inode *dir,
struct dentry *dentry, struct task_struct *task, const void *ptr)
@@ -2744,9 +2751,7 @@ static const struct inode_operations proc_task_inode_operations;
static const struct pid_entry tgid_base_stuff[] = {
DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
-#ifdef CONFIG_CHECKPOINT_RESTORE
DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
-#endif
DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 7eee2d8b97d9..9daa6e92450f 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -5,16 +5,20 @@
#include <linux/ksm.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
+#include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
#include <linux/huge_mm.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/hugetlb.h>
+#include <linux/memcontrol.h>
#include <linux/kernel-page-flags.h>
#include <asm/uaccess.h>
#include "internal.h"
#define KPMSIZE sizeof(u64)
#define KPMMASK (KPMSIZE - 1)
+#define KPMBITS (KPMSIZE * BITS_PER_BYTE)
/* /proc/kpagecount - an array exposing page counts
*
@@ -54,6 +58,8 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
pfn++;
out++;
count -= KPMSIZE;
+
+ cond_resched();
}
*ppos += (char __user *)out - buf;
@@ -146,6 +152,9 @@ u64 stable_page_flags(struct page *page)
if (PageBalloon(page))
u |= 1 << KPF_BALLOON;
+ if (page_is_idle(page))
+ u |= 1 << KPF_IDLE;
+
u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
@@ -212,6 +221,8 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
pfn++;
out++;
count -= KPMSIZE;
+
+ cond_resched();
}
*ppos += (char __user *)out - buf;
@@ -225,10 +236,285 @@ static const struct file_operations proc_kpageflags_operations = {
.read = kpageflags_read,
};
+#ifdef CONFIG_MEMCG
+static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ u64 __user *out = (u64 __user *)buf;
+ struct page *ppage;
+ unsigned long src = *ppos;
+ unsigned long pfn;
+ ssize_t ret = 0;
+ u64 ino;
+
+ pfn = src / KPMSIZE;
+ count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
+ if (src & KPMMASK || count & KPMMASK)
+ return -EINVAL;
+
+ while (count > 0) {
+ if (pfn_valid(pfn))
+ ppage = pfn_to_page(pfn);
+ else
+ ppage = NULL;
+
+ if (ppage)
+ ino = page_cgroup_ino(ppage);
+ else
+ ino = 0;
+
+ if (put_user(ino, out)) {
+ ret = -EFAULT;
+ break;
+ }
+
+ pfn++;
+ out++;
+ count -= KPMSIZE;
+
+ cond_resched();
+ }
+
+ *ppos += (char __user *)out - buf;
+ if (!ret)
+ ret = (char __user *)out - buf;
+ return ret;
+}
+
+static const struct file_operations proc_kpagecgroup_operations = {
+ .llseek = mem_lseek,
+ .read = kpagecgroup_read,
+};
+#endif /* CONFIG_MEMCG */
+
+#ifdef CONFIG_IDLE_PAGE_TRACKING
+/*
+ * Idle page tracking only considers user memory pages, for other types of
+ * pages the idle flag is always unset and an attempt to set it is silently
+ * ignored.
+ *
+ * We treat a page as a user memory page if it is on an LRU list, because it is
+ * always safe to pass such a page to rmap_walk(), which is essential for idle
+ * page tracking. With such an indicator of user pages we can skip isolated
+ * pages, but since there are not usually many of them, it will hardly affect
+ * the overall result.
+ *
+ * This function tries to get a user memory page by pfn as described above.
+ */
+static struct page *kpageidle_get_page(unsigned long pfn)
+{
+ struct page *page;
+ struct zone *zone;
+
+ if (!pfn_valid(pfn))
+ return NULL;
+
+ page = pfn_to_page(pfn);
+ if (!page || !PageLRU(page) ||
+ !get_page_unless_zero(page))
+ return NULL;
+
+ zone = page_zone(page);
+ spin_lock_irq(&zone->lru_lock);
+ if (unlikely(!PageLRU(page))) {
+ put_page(page);
+ page = NULL;
+ }
+ spin_unlock_irq(&zone->lru_lock);
+ return page;
+}
+
+static int kpageidle_clear_pte_refs_one(struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long addr, void *arg)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ spinlock_t *ptl;
+ pmd_t *pmd;
+ pte_t *pte;
+ bool referenced = false;
+
+ if (unlikely(PageTransHuge(page))) {
+ pmd = page_check_address_pmd(page, mm, addr,
+ PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
+ if (pmd) {
+ referenced = pmdp_clear_young_notify(vma, addr, pmd);
+ spin_unlock(ptl);
+ }
+ } else {
+ pte = page_check_address(page, mm, addr, &ptl, 0);
+ if (pte) {
+ referenced = ptep_clear_young_notify(vma, addr, pte);
+ pte_unmap_unlock(pte, ptl);
+ }
+ }
+ if (referenced) {
+ clear_page_idle(page);
+ /*
+ * We cleared the referenced bit in a mapping to this page. To
+ * avoid interference with page reclaim, mark it young so that
+ * page_referenced() will return > 0.
+ */
+ set_page_young(page);
+ }
+ return SWAP_AGAIN;
+}
+
+static void kpageidle_clear_pte_refs(struct page *page)
+{
+ /*
+ * Since rwc.arg is unused, rwc is effectively immutable, so we
+ * can make it static const to save some cycles and stack.
+ */
+ static const struct rmap_walk_control rwc = {
+ .rmap_one = kpageidle_clear_pte_refs_one,
+ .anon_lock = page_lock_anon_vma_read,
+ };
+ bool need_lock;
+
+ if (!page_mapped(page) ||
+ !page_rmapping(page))
+ return;
+
+ need_lock = !PageAnon(page) || PageKsm(page);
+ if (need_lock && !trylock_page(page))
+ return;
+
+ rmap_walk(page, (struct rmap_walk_control *)&rwc);
+
+ if (need_lock)
+ unlock_page(page);
+}
+
+static ssize_t kpageidle_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ u64 __user *out = (u64 __user *)buf;
+ struct page *page;
+ unsigned long pfn, end_pfn;
+ ssize_t ret = 0;
+ u64 idle_bitmap = 0;
+ int bit;
+
+ if (*ppos & KPMMASK || count & KPMMASK)
+ return -EINVAL;
+
+ pfn = *ppos * BITS_PER_BYTE;
+ if (pfn >= max_pfn)
+ return 0;
+
+ end_pfn = pfn + count * BITS_PER_BYTE;
+ if (end_pfn > max_pfn)
+ end_pfn = ALIGN(max_pfn, KPMBITS);
+
+ for (; pfn < end_pfn; pfn++) {
+ bit = pfn % KPMBITS;
+ page = kpageidle_get_page(pfn);
+ if (page) {
+ if (page_is_idle(page)) {
+ /*
+ * The page might have been referenced via a
+ * pte, in which case it is not idle. Clear
+ * refs and recheck.
+ */
+ kpageidle_clear_pte_refs(page);
+ if (page_is_idle(page))
+ idle_bitmap |= 1ULL << bit;
+ }
+ put_page(page);
+ }
+ if (bit == KPMBITS - 1) {
+ if (put_user(idle_bitmap, out)) {
+ ret = -EFAULT;
+ break;
+ }
+ idle_bitmap = 0;
+ out++;
+ }
+ cond_resched();
+ }
+
+ *ppos += (char __user *)out - buf;
+ if (!ret)
+ ret = (char __user *)out - buf;
+ return ret;
+}
+
+static ssize_t kpageidle_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ const u64 __user *in = (const u64 __user *)buf;
+ struct page *page;
+ unsigned long pfn, end_pfn;
+ ssize_t ret = 0;
+ u64 idle_bitmap = 0;
+ int bit;
+
+ if (*ppos & KPMMASK || count & KPMMASK)
+ return -EINVAL;
+
+ pfn = *ppos * BITS_PER_BYTE;
+ if (pfn >= max_pfn)
+ return -ENXIO;
+
+ end_pfn = pfn + count * BITS_PER_BYTE;
+ if (end_pfn > max_pfn)
+ end_pfn = ALIGN(max_pfn, KPMBITS);
+
+ for (; pfn < end_pfn; pfn++) {
+ bit = pfn % KPMBITS;
+ if (bit == 0) {
+ if (get_user(idle_bitmap, in)) {
+ ret = -EFAULT;
+ break;
+ }
+ in++;
+ }
+ if ((idle_bitmap >> bit) & 1) {
+ page = kpageidle_get_page(pfn);
+ if (page) {
+ kpageidle_clear_pte_refs(page);
+ set_page_idle(page);
+ put_page(page);
+ }
+ }
+ cond_resched();
+ }
+
+ *ppos += (const char __user *)in - buf;
+ if (!ret)
+ ret = (const char __user *)in - buf;
+ return ret;
+}
+
+static const struct file_operations proc_kpageidle_operations = {
+ .llseek = mem_lseek,
+ .read = kpageidle_read,
+ .write = kpageidle_write,
+};
+
+#ifndef CONFIG_64BIT
+static bool need_page_idle(void)
+{
+ return true;
+}
+struct page_ext_operations page_idle_ops = {
+ .need = need_page_idle,
+};
+#endif
+#endif /* CONFIG_IDLE_PAGE_TRACKING */
+
static int __init proc_page_init(void)
{
proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations);
proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations);
+#ifdef CONFIG_MEMCG
+ proc_create("kpagecgroup", S_IRUSR, NULL, &proc_kpagecgroup_operations);
+#endif
+#ifdef CONFIG_IDLE_PAGE_TRACKING
+ proc_create("kpageidle", S_IRUSR | S_IWUSR, NULL,
+ &proc_kpageidle_operations);
+#endif
return 0;
}
fs_initcall(proc_page_init);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ca1e091881d4..385e6e0fd145 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -446,6 +446,7 @@ struct mem_size_stats {
unsigned long anonymous_thp;
unsigned long swap;
u64 pss;
+ u64 swap_pss;
};
static void smaps_account(struct mem_size_stats *mss, struct page *page,
@@ -458,7 +459,7 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
mss->resident += size;
/* Accumulate the size in pages that have been accessed. */
- if (young || PageReferenced(page))
+ if (young || page_is_young(page) || PageReferenced(page))
mss->referenced += size;
mapcount = page_mapcount(page);
if (mapcount >= 2) {
@@ -492,9 +493,20 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
} else if (is_swap_pte(*pte)) {
swp_entry_t swpent = pte_to_swp_entry(*pte);
- if (!non_swap_entry(swpent))
+ if (!non_swap_entry(swpent)) {
+ int mapcount;
+
mss->swap += PAGE_SIZE;
- else if (is_migration_entry(swpent))
+ mapcount = swp_swapcount(swpent);
+ if (mapcount >= 2) {
+ u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;
+
+ do_div(pss_delta, mapcount);
+ mss->swap_pss += pss_delta;
+ } else {
+ mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
+ }
+ } else if (is_migration_entry(swpent))
page = migration_entry_to_page(swpent);
}
@@ -579,6 +591,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
#ifdef CONFIG_X86_INTEL_MPX
[ilog2(VM_MPX)] = "mp",
#endif
+ [ilog2(VM_LOCKONFAULT)] = "lf",
[ilog2(VM_LOCKED)] = "lo",
[ilog2(VM_IO)] = "io",
[ilog2(VM_SEQ_READ)] = "sr",
@@ -597,6 +610,8 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_HUGEPAGE)] = "hg",
[ilog2(VM_NOHUGEPAGE)] = "nh",
[ilog2(VM_MERGEABLE)] = "mg",
+ [ilog2(VM_UFFD_MISSING)]= "um",
+ [ilog2(VM_UFFD_WP)] = "uw",
};
size_t i;
@@ -638,6 +653,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
"Anonymous: %8lu kB\n"
"AnonHugePages: %8lu kB\n"
"Swap: %8lu kB\n"
+ "SwapPss: %8lu kB\n"
"KernelPageSize: %8lu kB\n"
"MMUPageSize: %8lu kB\n"
"Locked: %8lu kB\n",
@@ -652,9 +668,10 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
mss.anonymous >> 10,
mss.anonymous_thp >> 10,
mss.swap >> 10,
+ (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)),
vma_kernel_pagesize(vma) >> 10,
vma_mmu_pagesize(vma) >> 10,
- (vma->vm_flags & VM_LOCKED) ?
+ (vma->vm_flags & (VM_LOCKED | VM_LOCKONFAULT)) ?
(unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
show_smap_vma_flags(m, vma);
@@ -710,23 +727,6 @@ const struct file_operations proc_tid_smaps_operations = {
.release = proc_map_release,
};
-/*
- * We do not want to have constant page-shift bits sitting in
- * pagemap entries and are about to reuse them some time soon.
- *
- * Here's the "migration strategy":
- * 1. when the system boots these bits remain what they are,
- * but a warning about future change is printed in log;
- * 2. once anyone clears soft-dirty bits via clear_refs file,
- * these flag is set to denote, that user is aware of the
- * new API and those page-shift bits change their meaning.
- * The respective warning is printed in dmesg;
- * 3. In a couple of releases we will remove all the mentions
- * of page-shift in pagemap entries.
- */
-
-static bool soft_dirty_cleared __read_mostly;
-
enum clear_refs_types {
CLEAR_REFS_ALL = 1,
CLEAR_REFS_ANON,
@@ -808,6 +808,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
/* Clear accessed and referenced bits. */
pmdp_test_and_clear_young(vma, addr, pmd);
+ test_and_clear_page_young(page);
ClearPageReferenced(page);
out:
spin_unlock(ptl);
@@ -835,6 +836,7 @@ out:
/* Clear accessed and referenced bits. */
ptep_test_and_clear_young(vma, addr, pte);
+ test_and_clear_page_young(page);
ClearPageReferenced(page);
}
pte_unmap_unlock(pte - 1, ptl);
@@ -887,13 +889,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
return -EINVAL;
- if (type == CLEAR_REFS_SOFT_DIRTY) {
- soft_dirty_cleared = true;
- pr_warn_once("The pagemap bits 55-60 has changed their meaning!"
- " See the linux/Documentation/vm/pagemap.txt for "
- "details.\n");
- }
-
task = get_proc_task(file_inode(file));
if (!task)
return -ESRCH;
@@ -961,36 +956,26 @@ typedef struct {
struct pagemapread {
int pos, len; /* units: PM_ENTRY_BYTES, not bytes */
pagemap_entry_t *buffer;
- bool v2;
+ bool show_pfn;
};
#define PAGEMAP_WALK_SIZE (PMD_SIZE)
#define PAGEMAP_WALK_MASK (PMD_MASK)
-#define PM_ENTRY_BYTES sizeof(pagemap_entry_t)
-#define PM_STATUS_BITS 3
-#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS)
-#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
-#define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
-#define PM_PSHIFT_BITS 6
-#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
-#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
-#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
-#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1)
-#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
-/* in "new" pagemap pshift bits are occupied with more status bits */
-#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT))
-
-#define __PM_SOFT_DIRTY (1LL)
-#define PM_PRESENT PM_STATUS(4LL)
-#define PM_SWAP PM_STATUS(2LL)
-#define PM_FILE PM_STATUS(1LL)
-#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0)
+#define PM_ENTRY_BYTES sizeof(pagemap_entry_t)
+#define PM_PFRAME_BITS 55
+#define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0)
+#define PM_SOFT_DIRTY BIT_ULL(55)
+#define PM_MMAP_EXCLUSIVE BIT_ULL(56)
+#define PM_FILE BIT_ULL(61)
+#define PM_SWAP BIT_ULL(62)
+#define PM_PRESENT BIT_ULL(63)
+
#define PM_END_OF_BUFFER 1
-static inline pagemap_entry_t make_pme(u64 val)
+static inline pagemap_entry_t make_pme(u64 frame, u64 flags)
{
- return (pagemap_entry_t) { .pme = val };
+ return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags };
}
static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
@@ -1011,7 +996,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
while (addr < end) {
struct vm_area_struct *vma = find_vma(walk->mm, addr);
- pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
+ pagemap_entry_t pme = make_pme(0, 0);
/* End of address space hole, which we mark as non-present. */
unsigned long hole_end;
@@ -1031,7 +1016,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
/* Addresses in the VMA. */
if (vma->vm_flags & VM_SOFTDIRTY)
- pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY);
+ pme = make_pme(0, PM_SOFT_DIRTY);
for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
err = add_to_pagemap(addr, &pme, pm);
if (err)
@@ -1042,67 +1027,42 @@ out:
return err;
}
-static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
+static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
struct vm_area_struct *vma, unsigned long addr, pte_t pte)
{
- u64 frame, flags;
+ u64 frame = 0, flags = 0;
struct page *page = NULL;
- int flags2 = 0;
if (pte_present(pte)) {
- frame = pte_pfn(pte);
- flags = PM_PRESENT;
+ if (pm->show_pfn)
+ frame = pte_pfn(pte);
+ flags |= PM_PRESENT;
page = vm_normal_page(vma, addr, pte);
if (pte_soft_dirty(pte))
- flags2 |= __PM_SOFT_DIRTY;
+ flags |= PM_SOFT_DIRTY;
} else if (is_swap_pte(pte)) {
swp_entry_t entry;
if (pte_swp_soft_dirty(pte))
- flags2 |= __PM_SOFT_DIRTY;
+ flags |= PM_SOFT_DIRTY;
entry = pte_to_swp_entry(pte);
frame = swp_type(entry) |
(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
- flags = PM_SWAP;
+ flags |= PM_SWAP;
if (is_migration_entry(entry))
page = migration_entry_to_page(entry);
- } else {
- if (vma->vm_flags & VM_SOFTDIRTY)
- flags2 |= __PM_SOFT_DIRTY;
- *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
- return;
}
if (page && !PageAnon(page))
flags |= PM_FILE;
- if ((vma->vm_flags & VM_SOFTDIRTY))
- flags2 |= __PM_SOFT_DIRTY;
-
- *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
-}
+ if (page && page_mapcount(page) == 1)
+ flags |= PM_MMAP_EXCLUSIVE;
+ if (vma->vm_flags & VM_SOFTDIRTY)
+ flags |= PM_SOFT_DIRTY;
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
- pmd_t pmd, int offset, int pmd_flags2)
-{
- /*
- * Currently pmd for thp is always present because thp can not be
- * swapped-out, migrated, or HWPOISONed (split in such cases instead.)
- * This if-check is just to prepare for future implementation.
- */
- if (pmd_present(pmd))
- *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
- | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
- else
- *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2));
+ return make_pme(frame, flags);
}
-#else
-static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
- pmd_t pmd, int offset, int pmd_flags2)
-{
-}
-#endif
-static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->vma;
@@ -1111,41 +1071,58 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t *pte, *orig_pte;
int err = 0;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
- int pmd_flags2;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) {
+ u64 flags = 0, frame = 0;
+ pmd_t pmd = *pmdp;
- if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
- pmd_flags2 = __PM_SOFT_DIRTY;
- else
- pmd_flags2 = 0;
+ if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd))
+ flags |= PM_SOFT_DIRTY;
+
+ /*
+ * Currently pmd for thp is always present because thp
+ * can not be swapped-out, migrated, or HWPOISONed
+ * (split in such cases instead.)
+ * This if-check is just to prepare for future implementation.
+ */
+ if (pmd_present(pmd)) {
+ struct page *page = pmd_page(pmd);
+
+ if (page_mapcount(page) == 1)
+ flags |= PM_MMAP_EXCLUSIVE;
+
+ flags |= PM_PRESENT;
+ if (pm->show_pfn)
+ frame = pmd_pfn(pmd) +
+ ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ }
for (; addr != end; addr += PAGE_SIZE) {
- unsigned long offset;
- pagemap_entry_t pme;
+ pagemap_entry_t pme = make_pme(frame, flags);
- offset = (addr & ~PAGEMAP_WALK_MASK) >>
- PAGE_SHIFT;
- thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2);
err = add_to_pagemap(addr, &pme, pm);
if (err)
break;
+ if (pm->show_pfn && (flags & PM_PRESENT))
+ frame++;
}
spin_unlock(ptl);
return err;
}
- if (pmd_trans_unstable(pmd))
+ if (pmd_trans_unstable(pmdp))
return 0;
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
* We can assume that @vma always points to a valid one and @end never
* goes beyond vma->vm_end.
*/
- orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl);
for (; addr < end; pte++, addr += PAGE_SIZE) {
pagemap_entry_t pme;
- pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
+ pme = pte_to_pagemap_entry(pm, vma, addr, *pte);
err = add_to_pagemap(addr, &pme, pm);
if (err)
break;
@@ -1158,40 +1135,44 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
}
#ifdef CONFIG_HUGETLB_PAGE
-static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
- pte_t pte, int offset, int flags2)
-{
- if (pte_present(pte))
- *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) |
- PM_STATUS2(pm->v2, flags2) |
- PM_PRESENT);
- else
- *pme = make_pme(PM_NOT_PRESENT(pm->v2) |
- PM_STATUS2(pm->v2, flags2));
-}
-
/* This function walks within one hugetlb entry in the single call */
-static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
+static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
struct pagemapread *pm = walk->private;
struct vm_area_struct *vma = walk->vma;
+ u64 flags = 0, frame = 0;
int err = 0;
- int flags2;
- pagemap_entry_t pme;
+ pte_t pte;
if (vma->vm_flags & VM_SOFTDIRTY)
- flags2 = __PM_SOFT_DIRTY;
- else
- flags2 = 0;
+ flags |= PM_SOFT_DIRTY;
+
+ pte = huge_ptep_get(ptep);
+ if (pte_present(pte)) {
+ struct page *page = pte_page(pte);
+
+ if (!PageAnon(page))
+ flags |= PM_FILE;
+
+ if (page_mapcount(page) == 1)
+ flags |= PM_MMAP_EXCLUSIVE;
+
+ flags |= PM_PRESENT;
+ if (pm->show_pfn)
+ frame = pte_pfn(pte) +
+ ((addr & ~hmask) >> PAGE_SHIFT);
+ }
for (; addr != end; addr += PAGE_SIZE) {
- int offset = (addr & ~hmask) >> PAGE_SHIFT;
- huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2);
+ pagemap_entry_t pme = make_pme(frame, flags);
+
err = add_to_pagemap(addr, &pme, pm);
if (err)
return err;
+ if (pm->show_pfn && (flags & PM_PRESENT))
+ frame++;
}
cond_resched();
@@ -1209,7 +1190,9 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
* Bits 0-54 page frame number (PFN) if present
* Bits 0-4 swap type if swapped
* Bits 5-54 swap offset if swapped
- * Bits 55-60 page shift (page size = 1<<page shift)
+ * Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt)
+ * Bit 56 page exclusively mapped
+ * Bits 57-60 zero
* Bit 61 page is file-page or shared-anon
* Bit 62 page swapped
* Bit 63 page present
@@ -1227,42 +1210,37 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
static ssize_t pagemap_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
- struct task_struct *task = get_proc_task(file_inode(file));
- struct mm_struct *mm;
+ struct mm_struct *mm = file->private_data;
struct pagemapread pm;
- int ret = -ESRCH;
struct mm_walk pagemap_walk = {};
unsigned long src;
unsigned long svpfn;
unsigned long start_vaddr;
unsigned long end_vaddr;
- int copied = 0;
+ int ret = 0, copied = 0;
- if (!task)
+ if (!mm || !atomic_inc_not_zero(&mm->mm_users))
goto out;
ret = -EINVAL;
/* file position must be aligned */
if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
- goto out_task;
+ goto out_mm;
ret = 0;
if (!count)
- goto out_task;
+ goto out_mm;
+
+ /* do not disclose physical addresses: attack vector */
+ pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN);
- pm.v2 = soft_dirty_cleared;
pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY);
ret = -ENOMEM;
if (!pm.buffer)
- goto out_task;
-
- mm = mm_access(task, PTRACE_MODE_READ);
- ret = PTR_ERR(mm);
- if (!mm || IS_ERR(mm))
- goto out_free;
+ goto out_mm;
- pagemap_walk.pmd_entry = pagemap_pte_range;
+ pagemap_walk.pmd_entry = pagemap_pmd_range;
pagemap_walk.pte_hole = pagemap_pte_hole;
#ifdef CONFIG_HUGETLB_PAGE
pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
@@ -1273,10 +1251,10 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
src = *ppos;
svpfn = src / PM_ENTRY_BYTES;
start_vaddr = svpfn << PAGE_SHIFT;
- end_vaddr = TASK_SIZE_OF(task);
+ end_vaddr = mm->task_size;
/* watch out for wraparound */
- if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT)
+ if (svpfn > mm->task_size >> PAGE_SHIFT)
start_vaddr = end_vaddr;
/*
@@ -1303,7 +1281,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
len = min(count, PM_ENTRY_BYTES * pm.pos);
if (copy_to_user(buf, pm.buffer, len)) {
ret = -EFAULT;
- goto out_mm;
+ goto out_free;
}
copied += len;
buf += len;
@@ -1313,24 +1291,31 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
if (!ret || ret == PM_END_OF_BUFFER)
ret = copied;
-out_mm:
- mmput(mm);
out_free:
kfree(pm.buffer);
-out_task:
- put_task_struct(task);
+out_mm:
+ mmput(mm);
out:
return ret;
}
static int pagemap_open(struct inode *inode, struct file *file)
{
- /* do not disclose physical addresses: attack vector */
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
- "to stop being page-shift some time soon. See the "
- "linux/Documentation/vm/pagemap.txt for details.\n");
+ struct mm_struct *mm;
+
+ mm = proc_mem_open(inode, PTRACE_MODE_READ);
+ if (IS_ERR(mm))
+ return PTR_ERR(mm);
+ file->private_data = mm;
+ return 0;
+}
+
+static int pagemap_release(struct inode *inode, struct file *file)
+{
+ struct mm_struct *mm = file->private_data;
+
+ if (mm)
+ mmdrop(mm);
return 0;
}
@@ -1338,6 +1323,7 @@ const struct file_operations proc_pagemap_operations = {
.llseek = mem_lseek, /* borrow this */
.read = pagemap_read,
.open = pagemap_open,
+ .release = pagemap_release,
};
#endif /* CONFIG_PROC_PAGE_MONITOR */
diff --git a/fs/seq_file.c b/fs/seq_file.c
index ce9e39fd5daf..263b125dbcf4 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -12,6 +12,7 @@
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/mm.h>
+#include <linux/printk.h>
#include <asm/uaccess.h>
#include <asm/page.h>
@@ -773,6 +774,47 @@ void seq_pad(struct seq_file *m, char c)
}
EXPORT_SYMBOL(seq_pad);
+/* A complete analogue of print_hex_dump() */
+void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
+ int rowsize, int groupsize, const void *buf, size_t len,
+ bool ascii)
+{
+ const u8 *ptr = buf;
+ int i, linelen, remaining = len;
+ int ret;
+
+ if (rowsize != 16 && rowsize != 32)
+ rowsize = 16;
+
+ for (i = 0; i < len && !seq_has_overflowed(m); i += rowsize) {
+ linelen = min(remaining, rowsize);
+ remaining -= rowsize;
+
+ switch (prefix_type) {
+ case DUMP_PREFIX_ADDRESS:
+ seq_printf(m, "%s%p: ", prefix_str, ptr + i);
+ break;
+ case DUMP_PREFIX_OFFSET:
+ seq_printf(m, "%s%.8x: ", prefix_str, i);
+ break;
+ default:
+ seq_printf(m, "%s", prefix_str);
+ break;
+ }
+
+ ret = hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize,
+ m->buf + m->count, m->size - m->count,
+ ascii);
+ if (ret >= m->size - m->count) {
+ seq_set_overflow(m);
+ } else {
+ m->count += ret;
+ seq_putc(m, '\n');
+ }
+ }
+}
+EXPORT_SYMBOL(seq_hex_dump);
+
struct list_head *seq_list_start(struct list_head *head, loff_t pos)
{
struct list_head *lh;
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 7e412ad74836..270221fcef42 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -121,8 +121,9 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
* Other callers might not initialize the si_lsb field,
* so check explicitly for the right codes here.
*/
- if (kinfo->si_code == BUS_MCEERR_AR ||
- kinfo->si_code == BUS_MCEERR_AO)
+ if (kinfo->si_signo == SIGBUS &&
+ (kinfo->si_code == BUS_MCEERR_AR ||
+ kinfo->si_code == BUS_MCEERR_AO))
err |= __put_user((short) kinfo->si_addr_lsb,
&uinfo->ssi_addr_lsb);
#endif
diff --git a/fs/super.c b/fs/super.c
index b61372354f2b..c917817c8d40 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -49,8 +49,8 @@ static char *sb_writers_name[SB_FREEZE_LEVELS] = {
* One thing we have to be careful of with a per-sb shrinker is that we don't
* drop the last active reference to the superblock from within the shrinker.
* If that happens we could trigger unregistering the shrinker from within the
- * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we
- * take a passive reference to the superblock to avoid this from occurring.
+ * shrinker path. Hence we take a passive reference to the superblock to avoid
+ * this from occurring.
*/
static unsigned long super_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
@@ -121,8 +121,8 @@ static unsigned long super_cache_count(struct shrinker *shrink,
* Don't call trylock_super as it is a potential
* scalability bottleneck. The counts could get updated
* between super_cache_count and super_cache_scan anyway.
- * Call to super_cache_count with shrinker_rwsem held
- * ensures the safety of call to list_lru_shrink_count() and
+ * SRCU guarantees object validity across this call -- thus
+ * it is safe to call list_lru_shrink_count() and
* s_op->nr_cached_objects().
*/
if (sb->s_op && sb->s_op->nr_cached_objects)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
new file mode 100644
index 000000000000..634e676072cb
--- /dev/null
+++ b/fs/userfaultfd.c
@@ -0,0 +1,1330 @@
+/*
+ * fs/userfaultfd.c
+ *
+ * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
+ * Copyright (C) 2008-2009 Red Hat, Inc.
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Some part derived from fs/eventfd.c (anon inode setup) and
+ * mm/ksm.c (mm hashing).
+ */
+
+#include <linux/hashtable.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/file.h>
+#include <linux/bug.h>
+#include <linux/anon_inodes.h>
+#include <linux/syscalls.h>
+#include <linux/userfaultfd_k.h>
+#include <linux/mempolicy.h>
+#include <linux/ioctl.h>
+#include <linux/security.h>
+
+static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
+
+enum userfaultfd_state {
+ UFFD_STATE_WAIT_API,
+ UFFD_STATE_RUNNING,
+};
+
+/*
+ * Start with fault_pending_wqh and fault_wqh so they're more likely
+ * to be in the same cacheline.
+ */
+struct userfaultfd_ctx {
+ /* waitqueue head for the pending (i.e. not read) userfaults */
+ wait_queue_head_t fault_pending_wqh;
+ /* waitqueue head for the userfaults */
+ wait_queue_head_t fault_wqh;
+ /* waitqueue head for the pseudo fd to wakeup poll/read */
+ wait_queue_head_t fd_wqh;
+ /* a refile sequence protected by fault_pending_wqh lock */
+ struct seqcount refile_seq;
+ /* pseudo fd refcounting */
+ atomic_t refcount;
+ /* userfaultfd syscall flags */
+ unsigned int flags;
+ /* state machine */
+ enum userfaultfd_state state;
+ /* released */
+ bool released;
+ /* mm with one or more vmas attached to this userfaultfd_ctx */
+ struct mm_struct *mm;
+};
+
+struct userfaultfd_wait_queue {
+ struct uffd_msg msg;
+ wait_queue_t wq;
+ struct userfaultfd_ctx *ctx;
+};
+
+struct userfaultfd_wake_range {
+ unsigned long start;
+ unsigned long len;
+};
+
+static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode,
+ int wake_flags, void *key)
+{
+ struct userfaultfd_wake_range *range = key;
+ int ret;
+ struct userfaultfd_wait_queue *uwq;
+ unsigned long start, len;
+
+ uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
+ ret = 0;
+ /* len == 0 means wake all */
+ start = range->start;
+ len = range->len;
+ if (len && (start > uwq->msg.arg.pagefault.address ||
+ start + len <= uwq->msg.arg.pagefault.address))
+ goto out;
+ ret = wake_up_state(wq->private, mode);
+ if (ret)
+ /*
+ * Wake only once, autoremove behavior.
+ *
+ * After the effect of list_del_init is visible to the
+ * other CPUs, the waitqueue may disappear from under
+ * us, see the !list_empty_careful() in
+ * handle_userfault(). try_to_wake_up() has an
+ * implicit smp_mb__before_spinlock, and the
+ * wq->private is read before calling the extern
+ * function "wake_up_state" (which in turns calls
+ * try_to_wake_up). While the spin_lock;spin_unlock;
+ * wouldn't be enough, the smp_mb__before_spinlock is
+ * enough to avoid an explicit smp_mb() here.
+ */
+ list_del_init(&wq->task_list);
+out:
+ return ret;
+}
+
+/**
+ * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
+ * context.
+ * @ctx: [in] Pointer to the userfaultfd context.
+ *
+ * Returns: nothing; BUG()s if the refcount has already dropped to zero.
+ */
+static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
+{
+ if (!atomic_inc_not_zero(&ctx->refcount))
+ BUG();
+}
+
+/**
+ * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
+ * context.
+ * @ctx: [in] Pointer to userfaultfd context.
+ *
+ * The userfaultfd context reference must have been previously acquired either
+ * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget().
+ */
+static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
+{
+ if (atomic_dec_and_test(&ctx->refcount)) {
+ VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
+ VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
+ VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
+ VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
+ VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
+ VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
+ mmput(ctx->mm);
+ kmem_cache_free(userfaultfd_ctx_cachep, ctx);
+ }
+}
+
+static inline void msg_init(struct uffd_msg *msg)
+{
+ BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
+ /*
+ * Must use memset to zero out the paddings or kernel data is
+ * leaked to userland.
+ */
+ memset(msg, 0, sizeof(struct uffd_msg));
+}
+
+static inline struct uffd_msg userfault_msg(unsigned long address,
+ unsigned int flags,
+ unsigned long reason)
+{
+ struct uffd_msg msg;
+ msg_init(&msg);
+ msg.event = UFFD_EVENT_PAGEFAULT;
+ msg.arg.pagefault.address = address;
+ if (flags & FAULT_FLAG_WRITE)
+ /*
+ * If UFFD_FEATURE_PAGEFAULT_FLAG_WRITE was set in the
+ * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE
+ * was not set in a UFFD_EVENT_PAGEFAULT, it means it
+ * was a read fault, otherwise if set it means it's
+ * a write fault.
+ */
+ msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
+ if (reason & VM_UFFD_WP)
+ /*
+ * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
+ * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was
+ * not set in a UFFD_EVENT_PAGEFAULT, it means it was
+ * a missing fault, otherwise if set it means it's a
+ * write protect fault.
+ */
+ msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
+ return msg;
+}
+
+/*
+ * Verify the pagetables are still not ok after having registered into
+ * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
+ * userfault that has already been resolved, if userfaultfd_read and
+ * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
+ * threads.
+ */
+static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
+ unsigned long address,
+ unsigned long flags,
+ unsigned long reason)
+{
+ struct mm_struct *mm = ctx->mm;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd, _pmd;
+ pte_t *pte;
+ bool ret = true;
+
+ VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ goto out;
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ goto out;
+ pmd = pmd_offset(pud, address);
+ /*
+ * READ_ONCE must function as a barrier with narrower scope
+ * and it must be equivalent to:
+ * _pmd = *pmd; barrier();
+ *
+ * This is to deal with the instability (as in
+ * pmd_trans_unstable) of the pmd.
+ */
+ _pmd = READ_ONCE(*pmd);
+ if (!pmd_present(_pmd))
+ goto out;
+
+ ret = false;
+ if (pmd_trans_huge(_pmd))
+ goto out;
+
+ /*
+ * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it
+ * and use the standard pte_offset_map() instead of parsing _pmd.
+ */
+ pte = pte_offset_map(pmd, address);
+ /*
+ * Lockless access: we're in a wait_event so it's ok if it
+ * changes under us.
+ */
+ if (pte_none(*pte))
+ ret = true;
+ pte_unmap(pte);
+
+out:
+ return ret;
+}
+
+/*
+ * The locking rules involved in returning VM_FAULT_RETRY depending on
+ * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
+ * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
+ * recommendation in __lock_page_or_retry is not an understatement.
+ *
+ * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_sem must be released
+ * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
+ * not set.
+ *
+ * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
+ * set, VM_FAULT_RETRY can still be returned if and only if there are
+ * fatal_signal_pending()s, and the mmap_sem must be released before
+ * returning it.
+ */
+int handle_userfault(struct vm_area_struct *vma, unsigned long address,
+ unsigned int flags, unsigned long reason)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct userfaultfd_ctx *ctx;
+ struct userfaultfd_wait_queue uwq;
+ int ret;
+ bool must_wait, return_to_userland;
+
+ BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
+
+ ret = VM_FAULT_SIGBUS;
+ ctx = vma->vm_userfaultfd_ctx.ctx;
+ if (!ctx)
+ goto out;
+
+ BUG_ON(ctx->mm != mm);
+
+ VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
+ VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));
+
+ /*
+ * If it's already released don't get it. This avoids looping
+ * in __get_user_pages if userfaultfd_release waits on the
+ * caller of handle_userfault to release the mmap_sem.
+ */
+ if (unlikely(ACCESS_ONCE(ctx->released)))
+ goto out;
+
+ /*
+ * Check that we can return VM_FAULT_RETRY.
+ *
+ * NOTE: it should become possible to return VM_FAULT_RETRY
+ * even if FAULT_FLAG_TRIED is set without leading to gup()
+ * -EBUSY failures, if the userfaultfd is to be extended for
+ * VM_UFFD_WP tracking and we intend to arm the userfault
+ * without first stopping userland access to the memory. For
+ * VM_UFFD_MISSING userfaults this is enough for now.
+ */
+ if (unlikely(!(flags & FAULT_FLAG_ALLOW_RETRY))) {
+ /*
+ * Validate the invariant that nowait must allow retry
+ * to be sure not to return SIGBUS erroneously on
+ * nowait invocations.
+ */
+ BUG_ON(flags & FAULT_FLAG_RETRY_NOWAIT);
+#ifdef CONFIG_DEBUG_VM
+ if (printk_ratelimit()) {
+ printk(KERN_WARNING
+ "FAULT_FLAG_ALLOW_RETRY missing %x\n", flags);
+ dump_stack();
+ }
+#endif
+ goto out;
+ }
+
+ /*
+ * Handle nowait, not much to do other than tell it to retry
+ * and wait.
+ */
+ ret = VM_FAULT_RETRY;
+ if (flags & FAULT_FLAG_RETRY_NOWAIT)
+ goto out;
+
+ /* take the reference before dropping the mmap_sem */
+ userfaultfd_ctx_get(ctx);
+
+ init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
+ uwq.wq.private = current;
+ uwq.msg = userfault_msg(address, flags, reason);
+ uwq.ctx = ctx;
+
+ return_to_userland = (flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
+ (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE);
+
+ spin_lock(&ctx->fault_pending_wqh.lock);
+ /*
+ * After the __add_wait_queue the uwq is visible to userland
+ * through poll/read().
+ */
+ __add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
+ /*
+ * The smp_mb() in set_current_state() prevents the reads
+ * following the spin_unlock to happen before the list_add in
+ * __add_wait_queue.
+ */
+ set_current_state(return_to_userland ? TASK_INTERRUPTIBLE :
+ TASK_KILLABLE);
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+
+ must_wait = userfaultfd_must_wait(ctx, address, flags, reason);
+ up_read(&mm->mmap_sem);
+
+ if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
+ (return_to_userland ? !signal_pending(current) :
+ !fatal_signal_pending(current)))) {
+ wake_up_poll(&ctx->fd_wqh, POLLIN);
+ schedule();
+ ret |= VM_FAULT_MAJOR;
+ }
+
+ __set_current_state(TASK_RUNNING);
+
+ if (return_to_userland) {
+ if (signal_pending(current) &&
+ !fatal_signal_pending(current)) {
+ /*
+ * If we got a SIGSTOP or SIGCONT and this is
+ * a normal userland page fault, just let
+ * userland return so the signal will be
+ * handled and gdb debugging works. The page
+ * fault code immediately after we return from
+ * this function is going to release the
+ * mmap_sem and it's not depending on it
+ * (unlike gup would if we were not to return
+ * VM_FAULT_RETRY).
+ *
+ * If a fatal signal is pending we still take
+ * the streamlined VM_FAULT_RETRY failure path
+ * and there's no need to retake the mmap_sem
+ * in such case.
+ */
+ down_read(&mm->mmap_sem);
+ ret = 0;
+ }
+ }
+
+ /*
+ * Here we race with the list_del; list_add in
+ * userfaultfd_ctx_read(), however because we don't ever run
+ * list_del_init() to refile across the two lists, the prev
+ * and next pointers will never point to self. list_add also
+ * would never let any of the two pointers to point to
+ * self. So list_empty_careful won't risk to see both pointers
+ * pointing to self at any time during the list refile. The
+ * only case where list_del_init() is called is the full
+ * removal in the wake function and there we don't re-list_add
+ * and it's fine not to block on the spinlock. The uwq on this
+ * kernel stack can be released after the list_del_init.
+ */
+ if (!list_empty_careful(&uwq.wq.task_list)) {
+ spin_lock(&ctx->fault_pending_wqh.lock);
+ /*
+ * No need of list_del_init(), the uwq on the stack
+ * will be freed shortly anyway.
+ */
+ list_del(&uwq.wq.task_list);
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+ }
+
+ /*
+ * ctx may go away after this if the userfault pseudo fd is
+ * already released.
+ */
+ userfaultfd_ctx_put(ctx);
+
+out:
+ return ret;
+}
+
+static int userfaultfd_release(struct inode *inode, struct file *file)
+{
+ struct userfaultfd_ctx *ctx = file->private_data;
+ struct mm_struct *mm = ctx->mm;
+ struct vm_area_struct *vma, *prev;
+ /* len == 0 means wake all */
+ struct userfaultfd_wake_range range = { .len = 0, };
+ unsigned long new_flags;
+
+ ACCESS_ONCE(ctx->released) = true;
+
+ /*
+ * Flush page faults out of all CPUs. NOTE: all page faults
+ * must be retried without returning VM_FAULT_SIGBUS if
+ * userfaultfd_ctx_get() succeeds but vma->vm_userfaultfd_ctx
+ * changes while handle_userfault released the mmap_sem. So
+ * it's critical that released is set to true (above), before
+ * taking the mmap_sem for writing.
+ */
+ down_write(&mm->mmap_sem);
+ prev = NULL;
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ cond_resched();
+ BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
+ !!(vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
+ if (vma->vm_userfaultfd_ctx.ctx != ctx) {
+ prev = vma;
+ continue;
+ }
+ new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
+ prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
+ new_flags, vma->anon_vma,
+ vma->vm_file, vma->vm_pgoff,
+ vma_policy(vma),
+ NULL_VM_UFFD_CTX);
+ if (prev)
+ vma = prev;
+ else
+ prev = vma;
+ vma->vm_flags = new_flags;
+ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+ }
+ up_write(&mm->mmap_sem);
+
+ /*
+ * After no new page faults can wait on this fault_*wqh, flush
+ * the last page faults that may have been already waiting on
+ * the fault_*wqh.
+ */
+ spin_lock(&ctx->fault_pending_wqh.lock);
+ __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0, &range);
+ __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, &range);
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+
+ wake_up_poll(&ctx->fd_wqh, POLLHUP);
+ userfaultfd_ctx_put(ctx);
+ return 0;
+}
+
+/* fault_pending_wqh.lock must be held by the caller */
+static inline struct userfaultfd_wait_queue *find_userfault(
+ struct userfaultfd_ctx *ctx)
+{
+ wait_queue_t *wq;
+ struct userfaultfd_wait_queue *uwq;
+
+ VM_BUG_ON(!spin_is_locked(&ctx->fault_pending_wqh.lock));
+
+ uwq = NULL;
+ if (!waitqueue_active(&ctx->fault_pending_wqh))
+ goto out;
+ /* walk in reverse to provide FIFO behavior to read userfaults */
+ wq = list_last_entry(&ctx->fault_pending_wqh.task_list,
+ typeof(*wq), task_list);
+ uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
+out:
+ return uwq;
+}
+
+static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
+{
+ struct userfaultfd_ctx *ctx = file->private_data;
+ unsigned int ret;
+
+ poll_wait(file, &ctx->fd_wqh, wait);
+
+ switch (ctx->state) {
+ case UFFD_STATE_WAIT_API:
+ return POLLERR;
+ case UFFD_STATE_RUNNING:
+ /*
+ * poll() never guarantees that read won't block.
+ * userfaults can be woken before they're read().
+ */
+ if (unlikely(!(file->f_flags & O_NONBLOCK)))
+ return POLLERR;
+ /*
+ * lockless access to see if there are pending faults
+ * __pollwait last action is the add_wait_queue but
+ * the spin_unlock would allow the waitqueue_active to
+ * pass above the actual list_add inside
+ * add_wait_queue critical section. So use a full
+ * memory barrier to serialize the list_add write of
+ * add_wait_queue() with the waitqueue_active read
+ * below.
+ */
+ ret = 0;
+ smp_mb();
+ if (waitqueue_active(&ctx->fault_pending_wqh))
+ ret = POLLIN;
+ return ret;
+ default:
+ BUG();
+ }
+}
+
+static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
+ struct uffd_msg *msg)
+{
+ ssize_t ret;
+ DECLARE_WAITQUEUE(wait, current);
+ struct userfaultfd_wait_queue *uwq;
+
+ /* always take the fd_wqh lock before the fault_pending_wqh lock */
+ spin_lock(&ctx->fd_wqh.lock);
+ __add_wait_queue(&ctx->fd_wqh, &wait);
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ spin_lock(&ctx->fault_pending_wqh.lock);
+ uwq = find_userfault(ctx);
+ if (uwq) {
+ /*
+ * Use a seqcount to repeat the lockless check
+ * in wake_userfault() to avoid missing
+ * wakeups because during the refile both
+ * waitqueue could become empty if this is the
+ * only userfault.
+ */
+ write_seqcount_begin(&ctx->refile_seq);
+
+ /*
+ * The fault_pending_wqh.lock prevents the uwq
+ * from disappearing from under us.
+ *
+ * Refile this userfault from
+ * fault_pending_wqh to fault_wqh, it's not
+ * pending anymore after we read it.
+ *
+ * Use list_del() by hand (as
+ * userfaultfd_wake_function also uses
+ * list_del_init() by hand) to be sure nobody
+ * changes __remove_wait_queue() to use
+ * list_del_init() in turn breaking the
+ * !list_empty_careful() check in
+ * handle_userfault(). The uwq->wq.task_list
+ * must never be empty at any time during the
+ * refile, or the waitqueue could disappear
+ * from under us. The "wait_queue_head_t"
+ * parameter of __remove_wait_queue() is unused
+ * anyway.
+ */
+ list_del(&uwq->wq.task_list);
+ __add_wait_queue(&ctx->fault_wqh, &uwq->wq);
+
+ write_seqcount_end(&ctx->refile_seq);
+
+ /* careful to always initialize msg if ret == 0 */
+ *msg = uwq->msg;
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+ ret = 0;
+ break;
+ }
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+ if (no_wait) {
+ ret = -EAGAIN;
+ break;
+ }
+ spin_unlock(&ctx->fd_wqh.lock);
+ schedule();
+ spin_lock(&ctx->fd_wqh.lock);
+ }
+ __remove_wait_queue(&ctx->fd_wqh, &wait);
+ __set_current_state(TASK_RUNNING);
+ spin_unlock(&ctx->fd_wqh.lock);
+
+ return ret;
+}
+
+static ssize_t userfaultfd_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct userfaultfd_ctx *ctx = file->private_data;
+ ssize_t _ret, ret = 0;
+ struct uffd_msg msg;
+ int no_wait = file->f_flags & O_NONBLOCK;
+
+ if (ctx->state == UFFD_STATE_WAIT_API)
+ return -EINVAL;
+
+ for (;;) {
+ if (count < sizeof(msg))
+ return ret ? ret : -EINVAL;
+ _ret = userfaultfd_ctx_read(ctx, no_wait, &msg);
+ if (_ret < 0)
+ return ret ? ret : _ret;
+ if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
+ return ret ? ret : -EFAULT;
+ ret += sizeof(msg);
+ buf += sizeof(msg);
+ count -= sizeof(msg);
+ /*
+ * Allow reading more than one fault at a time but only
+ * block if waiting for the very first one.
+ */
+ no_wait = O_NONBLOCK;
+ }
+}
+
+/*
+ * Wake the userfaults queued on both waitqueues of @ctx, passing @range
+ * as the wakeup key.  fault_pending_wqh.lock is held across both
+ * wakeups, so here it also covers the fault_wqh queue.
+ */
+static void __wake_userfault(struct userfaultfd_ctx *ctx,
+			     struct userfaultfd_wake_range *range)
+{
+	spin_lock(&ctx->fault_pending_wqh.lock);
+	/* wake all in the range and autoremove */
+	if (waitqueue_active(&ctx->fault_pending_wqh))
+		__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0,
+				     range);
+	if (waitqueue_active(&ctx->fault_wqh))
+		__wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, range);
+	spin_unlock(&ctx->fault_pending_wqh.lock);
+}
+
+/*
+ * Lockless fast path in front of __wake_userfault(): the spinlock is
+ * only taken if at least one of the two fault waitqueues of @ctx looks
+ * populated.  @range is passed down unchanged as the wakeup key.
+ */
+static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
+ struct userfaultfd_wake_range *range)
+{
+ unsigned seq;
+ bool need_wakeup;
+
+ /*
+ * To be sure waitqueue_active() is not reordered by the CPU
+ * before the pagetable update, use an explicit SMP memory
+ * barrier here. PT lock release or up_read(mmap_sem) still
+ * have release semantics that can allow the
+ * waitqueue_active() to be reordered before the pte update.
+ */
+ smp_mb();
+
+ /*
+ * Use waitqueue_active because it's very frequent to
+ * change the address space atomically even if there are no
+ * userfaults yet. So we take the spinlock only when we're
+ * sure we've userfaults to wake.
+ *
+ * Both queues are sampled inside a refile_seq read section and
+ * the sampling is repeated if the sequence changed meanwhile.
+ * NOTE(review): presumably this is what keeps a userfault that
+ * is being refiled from fault_pending_wqh to fault_wqh from
+ * being missed in both queues -- confirm against the writer.
+ */
+ do {
+ seq = read_seqcount_begin(&ctx->refile_seq);
+ need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) ||
+ waitqueue_active(&ctx->fault_wqh);
+ cond_resched();
+ } while (read_seqcount_retry(&ctx->refile_seq, seq));
+ if (need_wakeup)
+ __wake_userfault(ctx, range);
+}
+
+/*
+ * Sanity check the [start, start + len) range received from userland
+ * against @mm.
+ *
+ * Returns 0 if the range is page aligned, not empty, starts at or above
+ * mmap_min_addr and lies entirely below mm->task_size, -EINVAL otherwise.
+ */
+static __always_inline int validate_range(struct mm_struct *mm,
+					  __u64 start, __u64 len)
+{
+	const __u64 limit = mm->task_size;
+
+	/* both the start address and the length must be page aligned */
+	if ((start & ~PAGE_MASK) || (len & ~PAGE_MASK))
+		return -EINVAL;
+
+	/* refuse empty ranges and ranges starting below mmap_min_addr */
+	if (!len || start < mmap_min_addr)
+		return -EINVAL;
+
+	/*
+	 * start < limit is verified first, so "limit - start" cannot
+	 * wrap around in the length comparison.
+	 */
+	if (start >= limit || len > limit - start)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * UFFDIO_REGISTER handler: register the vmas found in the range passed
+ * by userland (struct uffdio_register at @arg) into this userfaultfd.
+ *
+ * The work is done in two passes under mmap_sem held for writing: the
+ * first pass only validates every vma intersecting the range, the
+ * second one merges/splits the vmas as needed and sets the tracking
+ * flags and the owning context on them.
+ *
+ * Returns 0 on success, after storing UFFD_API_RANGE_IOCTLS into the
+ * "ioctls" field of the userland structure. Errors: -EFAULT if @arg
+ * cannot be read or written, -EINVAL for an invalid mode or range, for
+ * a range intersecting no vma or for a vma with vm_ops set, -ENOMEM if
+ * there is no vma at or above the start address, -EBUSY if a vma
+ * already belongs to another userfaultfd, or the error of split_vma().
+ */
+static int userfaultfd_register(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ struct mm_struct *mm = ctx->mm;
+ struct vm_area_struct *vma, *prev, *cur;
+ int ret;
+ struct uffdio_register uffdio_register;
+ struct uffdio_register __user *user_uffdio_register;
+ unsigned long vm_flags, new_flags;
+ bool found;
+ unsigned long start, end, vma_end;
+
+ user_uffdio_register = (struct uffdio_register __user *) arg;
+
+ ret = -EFAULT;
+ /*
+ * The trailing __u64 of the structure is not read: presumably
+ * that is the output-only "ioctls" field written back below.
+ */
+ if (copy_from_user(&uffdio_register, user_uffdio_register,
+ sizeof(uffdio_register)-sizeof(__u64)))
+ goto out;
+
+ ret = -EINVAL;
+ if (!uffdio_register.mode)
+ goto out;
+ if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING|
+ UFFDIO_REGISTER_MODE_WP))
+ goto out;
+ vm_flags = 0;
+ if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
+ vm_flags |= VM_UFFD_MISSING;
+ if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
+ vm_flags |= VM_UFFD_WP;
+ /*
+ * FIXME: remove the below error constraint by
+ * implementing the wprotect tracking mode.
+ */
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = validate_range(mm, uffdio_register.range.start,
+ uffdio_register.range.len);
+ if (ret)
+ goto out;
+
+ start = uffdio_register.range.start;
+ end = start + uffdio_register.range.len;
+
+ down_write(&mm->mmap_sem);
+ vma = find_vma_prev(mm, start, &prev);
+
+ ret = -ENOMEM;
+ if (!vma)
+ goto out_unlock;
+
+ /* check that there's at least one vma in the range */
+ ret = -EINVAL;
+ if (vma->vm_start >= end)
+ goto out_unlock;
+
+ /*
+ * Search for not compatible vmas.
+ *
+ * FIXME: this shall be relaxed later so that it doesn't fail
+ * on tmpfs backed vmas (in addition to the current allowance
+ * on anonymous vmas).
+ *
+ * This first pass is read-only: nothing is modified until all
+ * the vmas in the range have been validated.
+ */
+ found = false;
+ for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
+ cond_resched();
+
+ /* a vma has a userfaultfd ctx if and only if it has a VM_UFFD_* flag */
+ BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
+ !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
+
+ /* check not compatible vmas */
+ ret = -EINVAL;
+ if (cur->vm_ops)
+ goto out_unlock;
+
+ /*
+ * Check that this vma isn't already owned by a
+ * different userfaultfd. We can't allow more than one
+ * userfaultfd to own a single vma simultaneously or we
+ * wouldn't know which one to deliver the userfaults to.
+ */
+ ret = -EBUSY;
+ if (cur->vm_userfaultfd_ctx.ctx &&
+ cur->vm_userfaultfd_ctx.ctx != ctx)
+ goto out_unlock;
+
+ found = true;
+ }
+ BUG_ON(!found);
+
+ if (vma->vm_start < start)
+ prev = vma;
+
+ /* second pass: apply the registration to every vma in the range */
+ ret = 0;
+ do {
+ cond_resched();
+
+ /* both conditions were already verified by the first pass */
+ BUG_ON(vma->vm_ops);
+ BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
+ vma->vm_userfaultfd_ctx.ctx != ctx);
+
+ /*
+ * Nothing to do: this vma is already registered into this
+ * userfaultfd and with the right tracking mode too.
+ */
+ if (vma->vm_userfaultfd_ctx.ctx == ctx &&
+ (vma->vm_flags & vm_flags) == vm_flags)
+ goto skip;
+
+ /* clamp the span being changed to [start, end) within this vma */
+ if (vma->vm_start > start)
+ start = vma->vm_start;
+ vma_end = min(end, vma->vm_end);
+
+ new_flags = (vma->vm_flags & ~vm_flags) | vm_flags;
+ prev = vma_merge(mm, prev, start, vma_end, new_flags,
+ vma->anon_vma, vma->vm_file, vma->vm_pgoff,
+ vma_policy(vma),
+ ((struct vm_userfaultfd_ctx){ ctx }));
+ if (prev) {
+ vma = prev;
+ goto next;
+ }
+ /* no merge happened: split off the parts outside [start, end) */
+ if (vma->vm_start < start) {
+ ret = split_vma(mm, vma, start, 1);
+ if (ret)
+ break;
+ }
+ if (vma->vm_end > end) {
+ ret = split_vma(mm, vma, end, 0);
+ if (ret)
+ break;
+ }
+ next:
+ /*
+ * In the vma_merge() successful mprotect-like case 8:
+ * the next vma was merged into the current one and
+ * the current one has not been updated yet.
+ */
+ vma->vm_flags = new_flags;
+ vma->vm_userfaultfd_ctx.ctx = ctx;
+
+ skip:
+ prev = vma;
+ start = vma->vm_end;
+ vma = vma->vm_next;
+ } while (vma && vma->vm_start < end);
+out_unlock:
+ up_write(&mm->mmap_sem);
+ if (!ret) {
+ /*
+ * Now that we scanned all vmas we can already tell
+ * userland which ioctls methods are guaranteed to
+ * succeed on this range.
+ */
+ if (put_user(UFFD_API_RANGE_IOCTLS,
+ &user_uffdio_register->ioctls))
+ ret = -EFAULT;
+ }
+out:
+ return ret;
+}
+
+/*
+ * UFFDIO_UNREGISTER handler: clear the VM_UFFD_MISSING/VM_UFFD_WP flags
+ * and the userfaultfd context from the vmas found in the range passed by
+ * userland (struct uffdio_range at @arg).
+ *
+ * As in userfaultfd_register(), a first read-only pass validates the
+ * vmas and a second pass merges/splits and updates them, all under
+ * mmap_sem held for writing.
+ *
+ * Returns 0 on success. Errors: -EFAULT if @arg cannot be read, -EINVAL
+ * for an invalid range, for a range intersecting no vma or for a vma
+ * with vm_ops set, -ENOMEM if there is no vma at or above the start
+ * address, or the error of split_vma().
+ *
+ * NOTE(review): unlike userfaultfd_register(), the vmas are not checked
+ * to be owned by @ctx before being unregistered -- confirm intended.
+ */
+static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ struct mm_struct *mm = ctx->mm;
+ struct vm_area_struct *vma, *prev, *cur;
+ int ret;
+ struct uffdio_range uffdio_unregister;
+ unsigned long new_flags;
+ bool found;
+ unsigned long start, end, vma_end;
+ const void __user *buf = (void __user *)arg;
+
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
+ goto out;
+
+ ret = validate_range(mm, uffdio_unregister.start,
+ uffdio_unregister.len);
+ if (ret)
+ goto out;
+
+ start = uffdio_unregister.start;
+ end = start + uffdio_unregister.len;
+
+ down_write(&mm->mmap_sem);
+ vma = find_vma_prev(mm, start, &prev);
+
+ ret = -ENOMEM;
+ if (!vma)
+ goto out_unlock;
+
+ /* check that there's at least one vma in the range */
+ ret = -EINVAL;
+ if (vma->vm_start >= end)
+ goto out_unlock;
+
+ /*
+ * Search for not compatible vmas.
+ *
+ * FIXME: this shall be relaxed later so that it doesn't fail
+ * on tmpfs backed vmas (in addition to the current allowance
+ * on anonymous vmas).
+ */
+ found = false;
+ ret = -EINVAL;
+ for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
+ cond_resched();
+
+ /* a vma has a userfaultfd ctx if and only if it has a VM_UFFD_* flag */
+ BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
+ !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
+
+ /*
+ * Check not compatible vmas, not strictly required
+ * here as not compatible vmas cannot have an
+ * userfaultfd_ctx registered on them, but this
+ * provides for more strict behavior to notice
+ * unregistration errors.
+ */
+ if (cur->vm_ops)
+ goto out_unlock;
+
+ found = true;
+ }
+ BUG_ON(!found);
+
+ if (vma->vm_start < start)
+ prev = vma;
+
+ /* second pass: drop the registration from every vma in the range */
+ ret = 0;
+ do {
+ cond_resched();
+
+ BUG_ON(vma->vm_ops);
+
+ /*
+ * Nothing to do: this vma is not registered into any
+ * userfaultfd.
+ */
+ if (!vma->vm_userfaultfd_ctx.ctx)
+ goto skip;
+
+ /* clamp the span being changed to [start, end) within this vma */
+ if (vma->vm_start > start)
+ start = vma->vm_start;
+ vma_end = min(end, vma->vm_end);
+
+ new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
+ prev = vma_merge(mm, prev, start, vma_end, new_flags,
+ vma->anon_vma, vma->vm_file, vma->vm_pgoff,
+ vma_policy(vma),
+ NULL_VM_UFFD_CTX);
+ if (prev) {
+ vma = prev;
+ goto next;
+ }
+ /* no merge happened: split off the parts outside [start, end) */
+ if (vma->vm_start < start) {
+ ret = split_vma(mm, vma, start, 1);
+ if (ret)
+ break;
+ }
+ if (vma->vm_end > end) {
+ ret = split_vma(mm, vma, end, 0);
+ if (ret)
+ break;
+ }
+ next:
+ /*
+ * In the vma_merge() successful mprotect-like case 8:
+ * the next vma was merged into the current one and
+ * the current one has not been updated yet.
+ */
+ vma->vm_flags = new_flags;
+ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+
+ skip:
+ prev = vma;
+ start = vma->vm_end;
+ vma = vma->vm_next;
+ } while (vma && vma->vm_start < end);
+out_unlock:
+ up_write(&mm->mmap_sem);
+out:
+ return ret;
+}
+
+/*
+ * UFFDIO_WAKE handler: wake the userfaults waiting inside the range
+ * passed by userland (struct uffdio_range at @arg).  It may be used in
+ * combination with the UFFDIO_*_MODE_DONTWAKE flags to wake up
+ * userfaults in batches.
+ *
+ * Returns 0 on success, -EFAULT if @arg cannot be read, or the error
+ * reported by validate_range().
+ */
+static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
+			    unsigned long arg)
+{
+	const void __user *uarg = (void __user *)arg;
+	struct userfaultfd_wake_range range;
+	struct uffdio_range uffdio_wake;
+	int err;
+
+	if (copy_from_user(&uffdio_wake, uarg, sizeof(uffdio_wake)))
+		return -EFAULT;
+
+	err = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
+	if (err)
+		return err;
+
+	range.start = uffdio_wake.start;
+	range.len = uffdio_wake.len;
+
+	/*
+	 * A zero length means "wake all", which is not wanted here:
+	 * validate_range() refuses it already, this only double checks.
+	 */
+	VM_BUG_ON(!range.len);
+
+	wake_userfault(ctx, &range);
+	return 0;
+}
+
+/*
+ * UFFDIO_COPY handler: run mcopy_atomic() with the dst/src/len passed by
+ * userland (struct uffdio_copy at @arg) and, unless
+ * UFFDIO_COPY_MODE_DONTWAKE is set, wake the userfaults waiting in the
+ * part of the destination range that was processed.
+ *
+ * The value returned by mcopy_atomic() is stored into the "copy" field
+ * of the userland structure.  Returns 0 if it equals the requested
+ * length, -EAGAIN if it is positive but shorter, the negative value
+ * itself if mcopy_atomic() failed, -EFAULT if @arg cannot be read or
+ * written, -EINVAL for an invalid range, source wraparound or unknown
+ * mode bits.
+ */
+static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
+			    unsigned long arg)
+{
+	struct uffdio_copy __user *user_uffdio_copy =
+		(struct uffdio_copy __user *) arg;
+	struct userfaultfd_wake_range range;
+	struct uffdio_copy uffdio_copy;
+	__s64 copied;
+	int err;
+
+	/* don't copy "copy" last field */
+	if (copy_from_user(&uffdio_copy, user_uffdio_copy,
+			   sizeof(uffdio_copy)-sizeof(__s64)))
+		return -EFAULT;
+
+	err = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
+	if (err)
+		return err;
+
+	/*
+	 * double check for wraparound just in case. copy_from_user()
+	 * will later check uffdio_copy.src + uffdio_copy.len to fit
+	 * in the userland range.
+	 */
+	if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
+		return -EINVAL;
+	if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE)
+		return -EINVAL;
+
+	copied = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
+			      uffdio_copy.len);
+	/* the result is reported to userland even when it is an error */
+	if (unlikely(put_user(copied, &user_uffdio_copy->copy)))
+		return -EFAULT;
+	if (copied < 0)
+		return copied;
+	BUG_ON(!copied);
+
+	/* len == 0 would wake all */
+	range.len = copied;
+	if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
+		range.start = uffdio_copy.dst;
+		wake_userfault(ctx, &range);
+	}
+
+	return range.len == uffdio_copy.len ? 0 : -EAGAIN;
+}
+
+/*
+ * UFFDIO_ZEROPAGE handler: run mfill_zeropage() on the range passed by
+ * userland (struct uffdio_zeropage at @arg) and, unless
+ * UFFDIO_ZEROPAGE_MODE_DONTWAKE is set, wake the userfaults waiting in
+ * the part of the range that was processed.
+ *
+ * The value returned by mfill_zeropage() is stored into the "zeropage"
+ * field of the userland structure.  Returns 0 if it equals the requested
+ * length, -EAGAIN if it is positive but shorter, the negative value
+ * itself if mfill_zeropage() failed, -EFAULT if @arg cannot be read or
+ * written, -EINVAL for an invalid range or unknown mode bits.
+ */
+static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
+				unsigned long arg)
+{
+	struct uffdio_zeropage __user *user_uffdio_zeropage =
+		(struct uffdio_zeropage __user *) arg;
+	struct userfaultfd_wake_range range;
+	struct uffdio_zeropage uffdio_zeropage;
+	__s64 filled;
+	int err;
+
+	/* don't copy "zeropage" last field */
+	if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
+			   sizeof(uffdio_zeropage)-sizeof(__s64)))
+		return -EFAULT;
+
+	err = validate_range(ctx->mm, uffdio_zeropage.range.start,
+			     uffdio_zeropage.range.len);
+	if (err)
+		return err;
+	if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
+		return -EINVAL;
+
+	filled = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
+				uffdio_zeropage.range.len);
+	/* the result is reported to userland even when it is an error */
+	if (unlikely(put_user(filled, &user_uffdio_zeropage->zeropage)))
+		return -EFAULT;
+	if (filled < 0)
+		return filled;
+
+	/* len == 0 would wake all */
+	BUG_ON(!filled);
+	range.len = filled;
+	if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
+		range.start = uffdio_zeropage.range.start;
+		wake_userfault(ctx, &range);
+	}
+
+	return range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
+}
+
+/*
+ * UFFDIO_API handler.  Userland asks for a certain API version and gets
+ * back which feature bits and ioctl commands this kernel implements for
+ * that version.  On success the context moves from UFFD_STATE_WAIT_API
+ * to UFFD_STATE_RUNNING.
+ *
+ * Returns 0 on success, -EINVAL if the context is not waiting for the
+ * API handshake or if the requested API/features are unknown (a zeroed
+ * structure is handed back in the latter case), -EFAULT if @arg cannot
+ * be read or written.
+ */
+static int userfaultfd_api(struct userfaultfd_ctx *ctx,
+			   unsigned long arg)
+{
+	void __user *uarg = (void __user *)arg;
+	struct uffdio_api uffdio_api;
+
+	if (ctx->state != UFFD_STATE_WAIT_API)
+		return -EINVAL;
+	if (copy_from_user(&uffdio_api, uarg, sizeof(uffdio_api)))
+		return -EFAULT;
+
+	if (uffdio_api.api != UFFD_API || uffdio_api.features) {
+		/* unknown request: hand an all-zero structure back */
+		memset(&uffdio_api, 0, sizeof(uffdio_api));
+		if (copy_to_user(uarg, &uffdio_api, sizeof(uffdio_api)))
+			return -EFAULT;
+		return -EINVAL;
+	}
+
+	uffdio_api.features = UFFD_API_FEATURES;
+	uffdio_api.ioctls = UFFD_API_IOCTLS;
+	if (copy_to_user(uarg, &uffdio_api, sizeof(uffdio_api)))
+		return -EFAULT;
+
+	/* the state only changes once userland has received the answer */
+	ctx->state = UFFD_STATE_RUNNING;
+	return 0;
+}
+
+/*
+ * ioctl entry point: dispatch @cmd to the matching UFFDIO_* handler.
+ *
+ * Until the UFFDIO_API handshake has been done (UFFD_STATE_WAIT_API)
+ * every other command fails with -EINVAL, and so do unknown commands.
+ */
+static long userfaultfd_ioctl(struct file *file, unsigned cmd,
+			      unsigned long arg)
+{
+	struct userfaultfd_ctx *ctx = file->private_data;
+
+	if (cmd != UFFDIO_API && ctx->state == UFFD_STATE_WAIT_API)
+		return -EINVAL;
+
+	switch (cmd) {
+	case UFFDIO_API:
+		return userfaultfd_api(ctx, arg);
+	case UFFDIO_REGISTER:
+		return userfaultfd_register(ctx, arg);
+	case UFFDIO_UNREGISTER:
+		return userfaultfd_unregister(ctx, arg);
+	case UFFDIO_WAKE:
+		return userfaultfd_wake(ctx, arg);
+	case UFFDIO_COPY:
+		return userfaultfd_copy(ctx, arg);
+	case UFFDIO_ZEROPAGE:
+		return userfaultfd_zeropage(ctx, arg);
+	default:
+		return -EINVAL;
+	}
+}
+
+#ifdef CONFIG_PROC_FS
+/*
+ * fdinfo handler: print the number of userfaults still queued in
+ * fault_pending_wqh ("pending"), the number queued in either fault
+ * waitqueue ("total") and the API version/features/ioctls bitmasks.
+ */
+static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
+{
+	struct userfaultfd_ctx *ctx = f->private_data;
+	wait_queue_t *wq;
+	unsigned long pending = 0, total = 0;
+
+	/* both lists are walked under fault_pending_wqh.lock */
+	spin_lock(&ctx->fault_pending_wqh.lock);
+	list_for_each_entry(wq, &ctx->fault_pending_wqh.task_list, task_list) {
+		pending++;
+		total++;
+	}
+	list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) {
+		total++;
+	}
+	spin_unlock(&ctx->fault_pending_wqh.lock);
+
+	/*
+	 * If more protocols are added, they will all be shown
+	 * separated by a space. Like this:
+	 *	protocols: aa:... bb:...
+	 */
+	seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
+		   pending, total, UFFD_API, UFFD_API_FEATURES,
+		   UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
+}
+#endif
+
+/*
+ * File operations of the userfaultfd file. The same handler serves both
+ * .unlocked_ioctl and .compat_ioctl; .show_fdinfo is only provided when
+ * CONFIG_PROC_FS is enabled.
+ */
+static const struct file_operations userfaultfd_fops = {
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = userfaultfd_show_fdinfo,
+#endif
+ .release = userfaultfd_release,
+ .poll = userfaultfd_poll,
+ .read = userfaultfd_read,
+ .unlocked_ioctl = userfaultfd_ioctl,
+ .compat_ioctl = userfaultfd_ioctl,
+ .llseek = noop_llseek,
+};
+
+/*
+ * Constructor handed to kmem_cache_create() for the userfaultfd_ctx
+ * cache: sets up the three waitqueue heads and the refile seqcount of
+ * the object at @mem.
+ */
+static void init_once_userfaultfd_ctx(void *mem)
+{
+	struct userfaultfd_ctx *ctx = mem;
+
+	init_waitqueue_head(&ctx->fault_pending_wqh);
+	init_waitqueue_head(&ctx->fault_wqh);
+	init_waitqueue_head(&ctx->fd_wqh);
+	seqcount_init(&ctx->refile_seq);
+}
+
+/**
+ * userfaultfd_file_create - Creates an userfaultfd file pointer.
+ * @flags: Flags for the userfaultfd file.
+ *
+ * This function creates an userfaultfd file pointer, w/out installing
+ * it into the fd table. This is useful when the userfaultfd file is
+ * used during the initialization of data structures that require
+ * extra setup after the userfaultfd creation. So the userfaultfd
+ * creation is split into the file pointer creation phase, and the
+ * file descriptor installation phase.  In this way races with
+ * userspace closing the newly installed file descriptor can be
+ * avoided.
+ *
+ * The new context takes a reference on current->mm (mm_users); that
+ * reference is dropped again here if the file cannot be created.
+ *
+ * Returns an userfaultfd file pointer, or a proper error pointer:
+ * -EINVAL for flags outside UFFD_SHARED_FCNTL_FLAGS, -ENOMEM if the
+ * context cannot be allocated, or the error of anon_inode_getfile().
+ */
+static struct file *userfaultfd_file_create(int flags)
+{
+	struct file *file;
+	struct userfaultfd_ctx *ctx;
+
+	BUG_ON(!current->mm);
+
+	/* Check the UFFD_* constants for consistency.  */
+	BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
+	BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);
+
+	file = ERR_PTR(-EINVAL);
+	if (flags & ~UFFD_SHARED_FCNTL_FLAGS)
+		goto out;
+
+	file = ERR_PTR(-ENOMEM);
+	ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
+	if (!ctx)
+		goto out;
+
+	atomic_set(&ctx->refcount, 1);
+	ctx->flags = flags;
+	ctx->state = UFFD_STATE_WAIT_API;
+	ctx->released = false;
+	ctx->mm = current->mm;
+	/* prevent the mm struct to be freed */
+	atomic_inc(&ctx->mm->mm_users);
+
+	file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
+				  O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
+	if (IS_ERR(file)) {
+		/*
+		 * Nobody else can see ctx yet: drop the mm_users
+		 * reference taken above before freeing it, or the mm
+		 * would be leaked.
+		 */
+		mmput(ctx->mm);
+		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
+	}
+out:
+	return file;
+}
+
+/*
+ * userfaultfd() syscall: reserve a file descriptor, create the
+ * userfaultfd file and install it.  Returns the new descriptor, or the
+ * negative error of get_unused_fd_flags() or userfaultfd_file_create().
+ */
+SYSCALL_DEFINE1(userfaultfd, int, flags)
+{
+	struct file *file;
+	int fd, error;
+
+	fd = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS);
+	if (fd < 0)
+		return fd;
+
+	file = userfaultfd_file_create(flags);
+	if (IS_ERR(file)) {
+		/* give the reserved descriptor back before bailing out */
+		error = PTR_ERR(file);
+		put_unused_fd(fd);
+		return error;
+	}
+
+	fd_install(fd, file);
+	return fd;
+}
+
+/*
+ * Boot time initialization: create the slab cache the userfaultfd_ctx
+ * objects are allocated from, with init_once_userfaultfd_ctx() as its
+ * constructor. The result is not checked here; the cache is created
+ * with SLAB_PANIC (NOTE(review): which is expected to panic on failure
+ * rather than return NULL -- confirm against the slab API).
+ */
+static int __init userfaultfd_init(void)
+{
+ userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
+ sizeof(struct userfaultfd_ctx),
+ 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+ init_once_userfaultfd_ctx);
+ return 0;
+}
+__initcall(userfaultfd_init);
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 331c1ccf8264..c79b717d9b88 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -23,6 +23,7 @@
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/fs.h>
+#include <linux/dax.h>
#include <linux/buffer_head.h>
#include <linux/uio.h>
#include <linux/list_lru.h>
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index f0e8249722d4..72f10f373a17 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1530,8 +1530,36 @@ xfs_filemap_fault(
return ret;
}
+/*
+ * PMD fault handler.  Inodes that are not DAX return VM_FAULT_FALLBACK
+ * right away; for DAX inodes the fault is passed to __dax_pmd_fault()
+ * with XFS_MMAPLOCK_SHARED held, inside an sb_start_pagefault() /
+ * sb_end_pagefault() section.
+ */
+STATIC int
+xfs_filemap_pmd_fault(
+	struct vm_area_struct	*vma,
+	unsigned long		addr,
+	pmd_t			*pmd,
+	unsigned int		flags)
+{
+	struct inode		*inode = file_inode(vma->vm_file);
+	struct xfs_inode	*ip = XFS_I(inode);
+	int			error;
+
+	if (!IS_DAX(inode))
+		return VM_FAULT_FALLBACK;
+
+	trace_xfs_filemap_pmd_fault(ip);
+
+	sb_start_pagefault(inode->i_sb);
+	file_update_time(vma->vm_file);
+
+	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+	error = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_direct,
+				xfs_end_io_dax_write);
+	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+
+	sb_end_pagefault(inode->i_sb);
+
+	return error;
+}
+
static const struct vm_operations_struct xfs_file_vm_ops = {
.fault = xfs_filemap_fault,
+ .pmd_fault = xfs_filemap_pmd_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = xfs_filemap_page_mkwrite,
};
@@ -1544,7 +1572,7 @@ xfs_file_mmap(
file_accessed(filp);
vma->vm_ops = &xfs_file_vm_ops;
if (IS_DAX(file_inode(filp)))
- vma->vm_flags |= VM_MIXEDMAP;
+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
return 0;
}
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 8d916d33d93d..8229caedfaaa 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -687,6 +687,7 @@ DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag);
DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
DEFINE_INODE_EVENT(xfs_filemap_fault);
+DEFINE_INODE_EVENT(xfs_filemap_pmd_fault);
DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite);
DECLARE_EVENT_CLASS(xfs_iref_class,
diff --git a/include/linux/crc64_ecma.h b/include/linux/crc64_ecma.h
new file mode 100644
index 000000000000..bba7a4d692b3
--- /dev/null
+++ b/include/linux/crc64_ecma.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __CRC64_ECMA_H_
+#define __CRC64_ECMA_H_
+
+#include <linux/types.h>
+
+
+#define CRC64_DEFAULT_INITVAL 0xFFFFFFFFFFFFFFFFULL
+
+
+/*
+ * crc64_ecma_seed - Initializes the CRC64 ECMA seed.
+ */
+u64 crc64_ecma_seed(void);
+
+/*
+ * crc64_ecma - Computes the 64 bit ECMA CRC.
+ *
+ * @pdata: pointer to the data to compute checksum for.
+ * @nbytes: number of bytes in data buffer.
+ * @seed: CRC seed.
+ */
+u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed);
+
+#endif /* __CRC64_ECMA_H_ */
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 8b6c083e68a7..8d70e1361ecd 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -137,6 +137,7 @@ struct cred {
kernel_cap_t cap_permitted; /* caps we're permitted */
kernel_cap_t cap_effective; /* caps we can actually use */
kernel_cap_t cap_bset; /* capability bounding set */
+ kernel_cap_t cap_ambient; /* Ambient capability set */
#ifdef CONFIG_KEYS
unsigned char jit_keyring; /* default keyring to attach requested
* keys to */
@@ -212,6 +213,13 @@ static inline void validate_process_creds(void)
}
#endif
+/*
+ * Check the invariant of the ambient capability set of @cred:
+ * cap_ambient must be a subset of the intersection of cap_permitted
+ * and cap_inheritable.
+ */
+static inline bool cap_ambient_invariant_ok(const struct cred *cred)
+{
+	const kernel_cap_t allowed = cap_intersect(cred->cap_permitted,
+						   cred->cap_inheritable);
+
+	return cap_issubset(cred->cap_ambient, allowed);
+}
+
/**
* get_new_cred - Get a reference on a new set of credentials
* @cred: The new credentials to reference
diff --git a/include/linux/dax.h b/include/linux/dax.h
new file mode 100644
index 000000000000..b415e521528d
--- /dev/null
+++ b/include/linux/dax.h
@@ -0,0 +1,39 @@
+#ifndef _LINUX_DAX_H
+#define _LINUX_DAX_H
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <asm/pgtable.h>
+
+ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
+ get_block_t, dio_iodone_t, int flags);
+int dax_clear_blocks(struct inode *, sector_t block, long size);
+int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
+int dax_truncate_page(struct inode *, loff_t from, get_block_t);
+int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
+ dax_iodone_t);
+int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
+ dax_iodone_t);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
+ unsigned int flags, get_block_t, dax_iodone_t);
+int __dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
+ unsigned int flags, get_block_t, dax_iodone_t);
+#else
+/*
+ * Stub used when CONFIG_TRANSPARENT_HUGEPAGE is not set: ignores all its
+ * arguments and always returns VM_FAULT_FALLBACK.
+ */
+static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, unsigned int flags, get_block_t gb,
+ dax_iodone_t di)
+{
+ return VM_FAULT_FALLBACK;
+}
+#define __dax_pmd_fault dax_pmd_fault
+#endif
+int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
+#define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod)
+#define __dax_mkwrite(vma, vmf, gb, iod) __dax_fault(vma, vmf, gb, iod)
+
+/*
+ * Returns true if @vma maps a file and IS_DAX() holds for the inode
+ * hosting that file's mapping (vm_file->f_mapping->host).
+ */
+static inline bool vma_is_dax(struct vm_area_struct *vma)
+{
+	if (!vma->vm_file)
+		return false;
+
+	return IS_DAX(vma->vm_file->f_mapping->host);
+}
+#endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bf3246512a26..b6361e2e2a26 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -51,11 +51,11 @@ struct swap_info_struct;
struct seq_file;
struct workqueue_struct;
struct iov_iter;
-struct vm_fault;
extern void __init inode_init(void);
extern void __init inode_init_early(void);
-extern void __init files_init(unsigned long);
+extern void __init files_init(void);
+extern void __init files_maxfiles_init(void);
extern struct files_stat_struct files_stat;
extern unsigned long get_max_files(void);
@@ -1611,7 +1611,6 @@ struct file_operations {
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *);
- int (*mremap)(struct file *, struct vm_area_struct *);
int (*open) (struct inode *, struct file *);
int (*flush) (struct file *, fl_owner_t id);
int (*release) (struct inode *, struct file *);
@@ -2246,7 +2245,7 @@ extern int ioctl_preallocate(struct file *filp, void __user *argp);
/* fs/dcache.c */
extern void __init vfs_caches_init_early(void);
-extern void __init vfs_caches_init(unsigned long);
+extern void __init vfs_caches_init(void);
extern struct kmem_cache *names_cachep;
@@ -2667,19 +2666,6 @@ extern loff_t fixed_size_llseek(struct file *file, loff_t offset,
extern int generic_file_open(struct inode * inode, struct file * filp);
extern int nonseekable_open(struct inode * inode, struct file * filp);
-ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
- get_block_t, dio_iodone_t, int flags);
-int dax_clear_blocks(struct inode *, sector_t block, long size);
-int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
-int dax_truncate_page(struct inode *, loff_t from, get_block_t);
-int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
- dax_iodone_t);
-int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
- dax_iodone_t);
-int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
-#define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod)
-#define __dax_mkwrite(vma, vmf, gb, iod) __dax_fault(vma, vmf, gb, iod)
-
#ifdef CONFIG_BLOCK
typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,
loff_t file_offset);
diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h
index 5383bb1394a1..7ff168d06967 100644
--- a/include/linux/genalloc.h
+++ b/include/linux/genalloc.h
@@ -59,6 +59,8 @@ struct gen_pool {
genpool_algo_t algo; /* allocation function */
void *data;
+
+ const char *name;
};
/*
@@ -118,8 +120,8 @@ extern unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size,
unsigned long start, unsigned int nr, void *data);
extern struct gen_pool *devm_gen_pool_create(struct device *dev,
- int min_alloc_order, int nid);
-extern struct gen_pool *gen_pool_get(struct device *dev);
+ int min_alloc_order, int nid, const char *name);
+extern struct gen_pool *gen_pool_get(struct device *dev, const char *name);
bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start,
size_t size);
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index ad35f300b9a4..3bd64b115999 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -63,7 +63,10 @@ struct vm_area_struct;
* but it is definitely preferable to use the flag rather than opencode endless
* loop around allocator.
*
- * __GFP_NORETRY: The VM implementation must not retry indefinitely.
+ * __GFP_NORETRY: The VM implementation must not retry indefinitely and will
+ * return NULL when direct reclaim and memory compaction have failed to allow
+ * the allocation to succeed. The OOM killer is not called with the current
+ * implementation.
*
* __GFP_MOVABLE: Flag that this page will be movable by the page migration
* mechanism or reclaimed
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index f10b20f05159..cff4e4bc7fab 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -19,6 +19,9 @@ extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
unsigned long addr,
pmd_t *pmd,
unsigned int flags);
+extern int madvise_free_huge_pmd(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ pmd_t *pmd, unsigned long addr);
extern int zap_huge_pmd(struct mmu_gather *tlb,
struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr);
@@ -33,6 +36,8 @@ extern int move_huge_pmd(struct vm_area_struct *vma,
extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, pgprot_t newprot,
int prot_numa);
+int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *,
+ unsigned long pfn, bool write);
enum transparent_hugepage_flag {
TRANSPARENT_HUGEPAGE_FLAG,
@@ -56,6 +61,7 @@ extern pmd_t *page_check_address_pmd(struct page *page,
unsigned long address,
enum page_check_address_pmd_flag flag,
spinlock_t **ptl);
+extern int pmd_freeable(pmd_t pmd);
#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
@@ -122,7 +128,7 @@ extern void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
#endif
extern int hugepage_madvise(struct vm_area_struct *vma,
unsigned long *vm_flags, int advice);
-extern void __vma_adjust_trans_huge(struct vm_area_struct *vma,
+extern void vma_adjust_trans_huge(struct vm_area_struct *vma,
unsigned long start,
unsigned long end,
long adjust_next);
@@ -138,15 +144,6 @@ static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
else
return 0;
}
-static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
- unsigned long start,
- unsigned long end,
- long adjust_next)
-{
- if (!vma->anon_vma || vma->vm_ops)
- return;
- __vma_adjust_trans_huge(vma, start, end, adjust_next);
-}
static inline int hpage_nr_pages(struct page *page)
{
if (unlikely(PageTransHuge(page)))
@@ -164,6 +161,16 @@ static inline bool is_huge_zero_page(struct page *page)
return ACCESS_ONCE(huge_zero_page) == page;
}
+/* Returns true if the page @pmd points to (pmd_page()) is the huge zero page. */
+static inline bool is_huge_zero_pmd(pmd_t pmd)
+{
+ return is_huge_zero_page(pmd_page(pmd));
+}
+
+struct page *get_huge_zero_page(void);
+bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long haddr,
+ pmd_t *pmd, struct page *zero_page);
+
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index d891f949466a..5e35379f58a5 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -35,6 +35,9 @@ struct resv_map {
struct kref refs;
spinlock_t lock;
struct list_head regions;
+ long adds_in_progress;
+ struct list_head region_cache;
+ long region_cache_count;
};
extern struct resv_map *resv_map_alloc(void);
void resv_map_release(struct kref *ref);
@@ -80,11 +83,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
int hugetlb_reserve_pages(struct inode *inode, long from, long to,
struct vm_area_struct *vma,
vm_flags_t vm_flags);
-void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
+long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
+ long freed);
int dequeue_hwpoisoned_huge_page(struct page *page);
bool isolate_huge_page(struct page *page, struct list_head *list);
void putback_active_hugepage(struct page *page);
void free_huge_page(struct page *page);
+void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve);
+extern struct mutex *hugetlb_fault_mutex_table;
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ struct address_space *mapping,
+ pgoff_t idx, unsigned long address);
#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
@@ -320,9 +330,13 @@ struct huge_bootmem_page {
#endif
};
+struct page *alloc_huge_page(struct vm_area_struct *vma,
+ unsigned long addr, int avoid_reserve);
struct page *alloc_huge_page_node(struct hstate *h, int nid);
struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve);
+int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
+ pgoff_t idx);
/* arch callback */
int __init alloc_bootmem_huge_page(struct hstate *h);
@@ -471,6 +485,7 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
#else /* CONFIG_HUGETLB_PAGE */
struct hstate {};
+#define alloc_huge_page(v, a, r) NULL
#define alloc_huge_page_node(h, nid) NULL
#define alloc_huge_page_noerr(v, a, r) NULL
#define alloc_bootmem_huge_page(h) NULL
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 5582410727cb..45a9fdcc0844 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -10,6 +10,7 @@
#include <linux/bitops.h>
#include <linux/log2.h>
#include <linux/typecheck.h>
+#include <linux/parse-integer.h>
#include <linux/printk.h>
#include <linux/dynamic_debug.h>
#include <asm/byteorder.h>
@@ -264,132 +265,10 @@ void do_exit(long error_code)
void complete_and_exit(struct completion *, long)
__noreturn;
-/* Internal, do not use. */
-int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res);
-int __must_check _kstrtol(const char *s, unsigned int base, long *res);
-
-int __must_check kstrtoull(const char *s, unsigned int base, unsigned long long *res);
-int __must_check kstrtoll(const char *s, unsigned int base, long long *res);
-
-/**
- * kstrtoul - convert a string to an unsigned long
- * @s: The start of the string. The string must be null-terminated, and may also
- * include a single newline before its terminating null. The first character
- * may also be a plus sign, but not a minus sign.
- * @base: The number base to use. The maximum supported base is 16. If base is
- * given as 0, then the base of the string is automatically detected with the
- * conventional semantics - If it begins with 0x the number will be parsed as a
- * hexadecimal (case insensitive), if it otherwise begins with 0, it will be
- * parsed as an octal number. Otherwise it will be parsed as a decimal.
- * @res: Where to write the result of the conversion on success.
- *
- * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
- * Used as a replacement for the obsolete simple_strtoull. Return code must
- * be checked.
-*/
-static inline int __must_check kstrtoul(const char *s, unsigned int base, unsigned long *res)
-{
- /*
- * We want to shortcut function call, but
- * __builtin_types_compatible_p(unsigned long, unsigned long long) = 0.
- */
- if (sizeof(unsigned long) == sizeof(unsigned long long) &&
- __alignof__(unsigned long) == __alignof__(unsigned long long))
- return kstrtoull(s, base, (unsigned long long *)res);
- else
- return _kstrtoul(s, base, res);
-}
-
-/**
- * kstrtol - convert a string to a long
- * @s: The start of the string. The string must be null-terminated, and may also
- * include a single newline before its terminating null. The first character
- * may also be a plus sign or a minus sign.
- * @base: The number base to use. The maximum supported base is 16. If base is
- * given as 0, then the base of the string is automatically detected with the
- * conventional semantics - If it begins with 0x the number will be parsed as a
- * hexadecimal (case insensitive), if it otherwise begins with 0, it will be
- * parsed as an octal number. Otherwise it will be parsed as a decimal.
- * @res: Where to write the result of the conversion on success.
- *
- * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
- * Used as a replacement for the obsolete simple_strtoull. Return code must
- * be checked.
+/*
+ * Obsolete, do not use.
+ * Use parse_integer(), kstrto*(), kstrto*_from_user(), sscanf().
*/
-static inline int __must_check kstrtol(const char *s, unsigned int base, long *res)
-{
- /*
- * We want to shortcut function call, but
- * __builtin_types_compatible_p(long, long long) = 0.
- */
- if (sizeof(long) == sizeof(long long) &&
- __alignof__(long) == __alignof__(long long))
- return kstrtoll(s, base, (long long *)res);
- else
- return _kstrtol(s, base, res);
-}
-
-int __must_check kstrtouint(const char *s, unsigned int base, unsigned int *res);
-int __must_check kstrtoint(const char *s, unsigned int base, int *res);
-
-static inline int __must_check kstrtou64(const char *s, unsigned int base, u64 *res)
-{
- return kstrtoull(s, base, res);
-}
-
-static inline int __must_check kstrtos64(const char *s, unsigned int base, s64 *res)
-{
- return kstrtoll(s, base, res);
-}
-
-static inline int __must_check kstrtou32(const char *s, unsigned int base, u32 *res)
-{
- return kstrtouint(s, base, res);
-}
-
-static inline int __must_check kstrtos32(const char *s, unsigned int base, s32 *res)
-{
- return kstrtoint(s, base, res);
-}
-
-int __must_check kstrtou16(const char *s, unsigned int base, u16 *res);
-int __must_check kstrtos16(const char *s, unsigned int base, s16 *res);
-int __must_check kstrtou8(const char *s, unsigned int base, u8 *res);
-int __must_check kstrtos8(const char *s, unsigned int base, s8 *res);
-
-int __must_check kstrtoull_from_user(const char __user *s, size_t count, unsigned int base, unsigned long long *res);
-int __must_check kstrtoll_from_user(const char __user *s, size_t count, unsigned int base, long long *res);
-int __must_check kstrtoul_from_user(const char __user *s, size_t count, unsigned int base, unsigned long *res);
-int __must_check kstrtol_from_user(const char __user *s, size_t count, unsigned int base, long *res);
-int __must_check kstrtouint_from_user(const char __user *s, size_t count, unsigned int base, unsigned int *res);
-int __must_check kstrtoint_from_user(const char __user *s, size_t count, unsigned int base, int *res);
-int __must_check kstrtou16_from_user(const char __user *s, size_t count, unsigned int base, u16 *res);
-int __must_check kstrtos16_from_user(const char __user *s, size_t count, unsigned int base, s16 *res);
-int __must_check kstrtou8_from_user(const char __user *s, size_t count, unsigned int base, u8 *res);
-int __must_check kstrtos8_from_user(const char __user *s, size_t count, unsigned int base, s8 *res);
-
-static inline int __must_check kstrtou64_from_user(const char __user *s, size_t count, unsigned int base, u64 *res)
-{
- return kstrtoull_from_user(s, count, base, res);
-}
-
-static inline int __must_check kstrtos64_from_user(const char __user *s, size_t count, unsigned int base, s64 *res)
-{
- return kstrtoll_from_user(s, count, base, res);
-}
-
-static inline int __must_check kstrtou32_from_user(const char __user *s, size_t count, unsigned int base, u32 *res)
-{
- return kstrtouint_from_user(s, count, base, res);
-}
-
-static inline int __must_check kstrtos32_from_user(const char __user *s, size_t count, unsigned int base, s32 *res)
-{
- return kstrtoint_from_user(s, count, base, res);
-}
-
-/* Obsolete, do not use. Use kstrto<foo> instead */
-
extern unsigned long simple_strtoul(const char *,char **,unsigned int);
extern long simple_strtol(const char *,char **,unsigned int);
extern unsigned long long simple_strtoull(const char *,char **,unsigned int);
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index e804306ef5e8..5f193d80a6fb 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -16,7 +16,7 @@
#include <uapi/linux/kexec.h>
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
#include <linux/list.h>
#include <linux/linkage.h>
#include <linux/compat.h>
@@ -269,6 +269,8 @@ unsigned long paddr_vmcoreinfo_note(void);
vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name)
#define VMCOREINFO_CONFIG(name) \
vmcoreinfo_append_str("CONFIG_%s=y\n", #name)
+#define VMCOREINFO_PHYS_BASE(value) /* append "PHYS_BASE=<hex>" line */ \
+ vmcoreinfo_append_str("PHYS_BASE=%lx\n", (unsigned long)(value))
extern struct kimage *kexec_image;
extern struct kimage *kexec_crash_image;
@@ -318,12 +320,12 @@ int crash_shrink_memory(unsigned long new_size);
size_t crash_get_memory_size(void);
void crash_free_reserved_phys_range(unsigned long begin, unsigned long end);
-#else /* !CONFIG_KEXEC */
+#else /* !CONFIG_KEXEC_CORE */
struct pt_regs;
struct task_struct;
static inline void crash_kexec(struct pt_regs *regs) { }
static inline int kexec_should_crash(struct task_struct *p) { return 0; }
-#endif /* CONFIG_KEXEC */
+#endif /* CONFIG_KEXEC_CORE */
#endif /* !defined(__ASSEBMLY__) */
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 13d55206ccf6..3e6773e0c849 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -11,7 +11,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
const char namefmt[], ...);
#define kthread_create(threadfn, data, namefmt, arg...) \
- kthread_create_on_node(threadfn, data, -1, namefmt, ##arg)
+ kthread_create_on_node(threadfn, data, NUMA_NO_NODE, namefmt, ##arg)
struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index cc4b01972060..c518eb589260 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -77,6 +77,8 @@ int memblock_remove(phys_addr_t base, phys_addr_t size);
int memblock_free(phys_addr_t base, phys_addr_t size);
int memblock_reserve(phys_addr_t base, phys_addr_t size);
void memblock_trim_memory(phys_addr_t align);
+bool memblock_overlaps_region(struct memblock_type *type,
+ phys_addr_t base, phys_addr_t size);
int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size);
int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
@@ -323,7 +325,7 @@ void memblock_enforce_memory_limit(phys_addr_t memory_limit);
int memblock_is_memory(phys_addr_t addr);
int memblock_is_region_memory(phys_addr_t base, phys_addr_t size);
int memblock_is_reserved(phys_addr_t addr);
-int memblock_is_region_reserved(phys_addr_t base, phys_addr_t size);
+bool memblock_is_region_reserved(phys_addr_t base, phys_addr_t size);
extern void __memblock_dump_all(void);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 73b02b0a8f60..ad800e62cb7a 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -23,6 +23,11 @@
#include <linux/vm_event_item.h>
#include <linux/hardirq.h>
#include <linux/jump_label.h>
+#include <linux/page_counter.h>
+#include <linux/vmpressure.h>
+#include <linux/eventfd.h>
+#include <linux/mmzone.h>
+#include <linux/writeback.h>
struct mem_cgroup;
struct page;
@@ -67,12 +72,221 @@ enum mem_cgroup_events_index {
MEMCG_NR_EVENTS,
};
+/*
+ * Per memcg event counter is incremented at every pagein/pageout. With THP,
+ * it will be incremented by the number of pages. This counter is used to
+ * trigger some periodic events. This is straightforward and better
+ * than using jiffies etc. to handle periodic memcg event.
+ */
+enum mem_cgroup_events_target {
+ MEM_CGROUP_TARGET_THRESH,
+ MEM_CGROUP_TARGET_SOFTLIMIT,
+ MEM_CGROUP_TARGET_NUMAINFO,
+ MEM_CGROUP_NTARGETS,
+};
+
+/*
+ * Bits in struct cg_proto.flags
+ */
+enum cg_proto_flags {
+ /* Currently active and new sockets should be assigned to cgroups */
+ MEMCG_SOCK_ACTIVE,
+ /* It was ever activated; we must disarm static keys on destruction */
+ MEMCG_SOCK_ACTIVATED,
+};
+
+struct cg_proto {
+ struct page_counter memory_allocated; /* Current allocated memory. */
+ struct percpu_counter sockets_allocated; /* Current number of sockets. */
+ int memory_pressure;
+ long sysctl_mem[3];
+ unsigned long flags;
+ /*
+ * memcg field is used to find which memcg we belong to directly.
+ * Each memcg struct can hold more than one cg_proto, so container_of
+ * won't really cut it.
+ *
+ * The elegant solution would be having an inverse function to
+ * proto_cgroup in struct proto, but that means polluting the structure
+ * for everybody, instead of just for memcg users.
+ */
+ struct mem_cgroup *memcg;
+};
+
#ifdef CONFIG_MEMCG
+struct mem_cgroup_stat_cpu {
+ long count[MEM_CGROUP_STAT_NSTATS];
+ unsigned long events[MEMCG_NR_EVENTS];
+ unsigned long nr_page_events;
+ unsigned long targets[MEM_CGROUP_NTARGETS];
+};
+
+struct mem_cgroup_reclaim_iter {
+ struct mem_cgroup *position;
+ /* scan generation, increased every round-trip */
+ unsigned int generation;
+};
+
+/*
+ * per-zone information in memory controller.
+ */
+struct mem_cgroup_per_zone {
+ struct lruvec lruvec;
+ unsigned long lru_size[NR_LRU_LISTS];
+
+ struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1];
+
+ struct rb_node tree_node; /* RB tree node */
+ unsigned long usage_in_excess;/* Set to the value by which */
+ /* the soft limit is exceeded*/
+ bool on_tree;
+ struct mem_cgroup *memcg; /* Back pointer, we cannot */
+ /* use container_of */
+};
+
+struct mem_cgroup_per_node {
+ struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
+};
+
+struct mem_cgroup_threshold {
+ struct eventfd_ctx *eventfd;
+ unsigned long threshold;
+};
+
+/* For threshold */
+struct mem_cgroup_threshold_ary {
+ /* An array index points to threshold just below or equal to usage. */
+ int current_threshold;
+ /* Size of entries[] */
+ unsigned int size;
+ /* Array of thresholds */
+ struct mem_cgroup_threshold entries[0];
+};
+
+struct mem_cgroup_thresholds {
+ /* Primary thresholds array */
+ struct mem_cgroup_threshold_ary *primary;
+ /*
+ * Spare threshold array.
+ * This is needed to make mem_cgroup_unregister_event() "never fail".
+ * It must be able to store at least primary->size - 1 entries.
+ */
+ struct mem_cgroup_threshold_ary *spare;
+};
+
+/*
+ * The memory controller data structure. The memory controller controls both
+ * page cache and RSS per cgroup. We would eventually like to provide
+ * statistics based on the statistics developed by Rik Van Riel for clock-pro,
+ * to help the administrator determine what knobs to tune.
+ */
+struct mem_cgroup {
+ struct cgroup_subsys_state css;
+
+ /* Accounted resources */
+ struct page_counter memory;
+ struct page_counter memsw;
+ struct page_counter kmem;
+
+ /* Normal memory consumption range */
+ unsigned long low;
+ unsigned long high;
+
+ unsigned long soft_limit;
+
+ /* vmpressure notifications */
+ struct vmpressure vmpressure;
+
+ /* css_online() has been completed */
+ int initialized;
+
+ /*
+ * Should the accounting and control be hierarchical, per subtree?
+ */
+ bool use_hierarchy;
+
+ /* protected by memcg_oom_lock */
+ bool oom_lock;
+ int under_oom;
+
+ int swappiness;
+ /* OOM-Killer disable */
+ int oom_kill_disable;
+
+ /* protect arrays of thresholds */
+ struct mutex thresholds_lock;
+
+ /* thresholds for memory usage. RCU-protected */
+ struct mem_cgroup_thresholds thresholds;
+
+ /* thresholds for mem+swap usage. RCU-protected */
+ struct mem_cgroup_thresholds memsw_thresholds;
+
+ /* For oom notifier event fd */
+ struct list_head oom_notify;
+
+ /*
+ * Should we move charges of a task when a task is moved into this
+ * mem_cgroup ? And what type of charges should we move ?
+ */
+ unsigned long move_charge_at_immigrate;
+ /*
+ * set > 0 if pages under this cgroup are moving to other cgroup.
+ */
+ atomic_t moving_account;
+ /* taken only while moving_account > 0 */
+ spinlock_t move_lock;
+ struct task_struct *move_lock_task;
+ unsigned long move_lock_flags;
+ /*
+ * percpu counter.
+ */
+ struct mem_cgroup_stat_cpu __percpu *stat;
+ spinlock_t pcp_counter_lock;
+
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
+ struct cg_proto tcp_mem;
+#endif
+#if defined(CONFIG_MEMCG_KMEM)
+ /* Index in the kmem_cache->memcg_params.memcg_caches array */
+ int kmemcg_id;
+ bool kmem_acct_activated;
+ bool kmem_acct_active;
+#endif
+
+ int last_scanned_node;
+#if MAX_NUMNODES > 1
+ nodemask_t scan_nodes;
+ atomic_t numainfo_events;
+ atomic_t numainfo_updating;
+#endif
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+ struct list_head cgwb_list;
+ struct wb_domain cgwb_domain;
+#endif
+
+ /* List of events which userspace wants to receive */
+ struct list_head event_list;
+ spinlock_t event_list_lock;
+
+ struct mem_cgroup_per_node *nodeinfo[0];
+ /* WARNING: nodeinfo must be the last member here */
+};
extern struct cgroup_subsys_state *mem_cgroup_root_css;
-void mem_cgroup_events(struct mem_cgroup *memcg,
+/**
+ * mem_cgroup_events - count memory events against a cgroup
+ * @memcg: the memory cgroup
+ * @idx: the event index
+ * @nr: the number of events to account for
+ */
+static inline void mem_cgroup_events(struct mem_cgroup *memcg,
enum mem_cgroup_events_index idx,
- unsigned int nr);
+ unsigned int nr)
+{
+ this_cpu_add(memcg->stat->events[idx], nr);
+}
bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
@@ -90,15 +304,29 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
-bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
- struct mem_cgroup *root);
bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
+struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
+struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
-extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
-extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
+static inline
+struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
+ return css ? container_of(css, struct mem_cgroup, css) : NULL;
+}
-extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
-extern struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css);
+struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
+ struct mem_cgroup *,
+ struct mem_cgroup_reclaim_cookie *);
+void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
+
+static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
+ struct mem_cgroup *root)
+{
+ if (root == memcg)
+ return true;
+ if (!root->use_hierarchy)
+ return false;
+ return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
+}
static inline bool mm_match_cgroup(struct mm_struct *mm,
struct mem_cgroup *memcg)
@@ -114,24 +342,68 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
return match;
}
-extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
-extern struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
+struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
+ino_t page_cgroup_ino(struct page *page);
-struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
- struct mem_cgroup *,
- struct mem_cgroup_reclaim_cookie *);
-void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
+static inline bool mem_cgroup_disabled(void)
+{
+ if (memory_cgrp_subsys.disabled)
+ return true;
+ return false;
+}
/*
* For memory reclaim.
*/
-int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec);
-bool mem_cgroup_lruvec_online(struct lruvec *lruvec);
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
-unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list);
-void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
-extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
- struct task_struct *p);
+
+void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
+ int nr_pages);
+
+static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
+{
+ struct mem_cgroup_per_zone *mz;
+ struct mem_cgroup *memcg;
+
+ if (mem_cgroup_disabled())
+ return true;
+
+ mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+ memcg = mz->memcg;
+
+ return !!(memcg->css.flags & CSS_ONLINE);
+}
+
+static inline
+unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
+{
+ struct mem_cgroup_per_zone *mz;
+
+ mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+ return mz->lru_size[lru];
+}
+
+static inline int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
+{
+ unsigned long inactive_ratio;
+ unsigned long inactive;
+ unsigned long active;
+ unsigned long gb;
+
+ inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
+ active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
+
+ gb = (inactive + active) >> (30 - PAGE_SHIFT);
+ if (gb)
+ inactive_ratio = int_sqrt(10 * gb);
+ else
+ inactive_ratio = 1;
+
+ return inactive * inactive_ratio < active;
+}
+
+void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
+ struct task_struct *p);
static inline void mem_cgroup_oom_enable(void)
{
@@ -156,18 +428,26 @@ bool mem_cgroup_oom_synchronize(bool wait);
extern int do_swap_account;
#endif
-static inline bool mem_cgroup_disabled(void)
-{
- if (memory_cgrp_subsys.disabled)
- return true;
- return false;
-}
-
struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page);
-void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
- enum mem_cgroup_stat_index idx, int val);
void mem_cgroup_end_page_stat(struct mem_cgroup *memcg);
+/**
+ * mem_cgroup_update_page_stat - update page state statistics
+ * @memcg: memcg to account against
+ * @idx: page state item to account
+ * @val: number of pages (positive or negative)
+ *
+ * See mem_cgroup_begin_page_stat() for locking requirements.
+ */
+static inline void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
+ enum mem_cgroup_stat_index idx, int val)
+{
+ VM_BUG_ON(!rcu_read_lock_held());
+
+ if (memcg)
+ this_cpu_add(memcg->stat->count[idx], val);
+}
+
static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg,
enum mem_cgroup_stat_index idx)
{
@@ -184,13 +464,31 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
gfp_t gfp_mask,
unsigned long *total_scanned);
-void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
enum vm_event_item idx)
{
+ struct mem_cgroup *memcg;
+
if (mem_cgroup_disabled())
return;
- __mem_cgroup_count_vm_event(mm, idx);
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+ if (unlikely(!memcg))
+ goto out;
+
+ switch (idx) {
+ case PGFAULT:
+ this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
+ break;
+ case PGMAJFAULT:
+ this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
+ break;
+ default:
+ BUG();
+ }
+out:
+ rcu_read_unlock();
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
void mem_cgroup_split_huge_fixup(struct page *head);
@@ -199,8 +497,6 @@ void mem_cgroup_split_huge_fixup(struct page *head);
#else /* CONFIG_MEMCG */
struct mem_cgroup;
-#define mem_cgroup_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
-
static inline void mem_cgroup_events(struct mem_cgroup *memcg,
enum mem_cgroup_events_index idx,
unsigned int nr)
@@ -258,11 +554,6 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
return &zone->lruvec;
}
-static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
-{
- return NULL;
-}
-
static inline bool mm_match_cgroup(struct mm_struct *mm,
struct mem_cgroup *memcg)
{
@@ -275,12 +566,6 @@ static inline bool task_in_mem_cgroup(struct task_struct *task,
return true;
}
-static inline struct cgroup_subsys_state
- *mem_cgroup_css(struct mem_cgroup *memcg)
-{
- return NULL;
-}
-
static inline struct mem_cgroup *
mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
@@ -428,8 +713,8 @@ static inline void sock_release_memcg(struct sock *sk)
extern struct static_key memcg_kmem_enabled_key;
extern int memcg_nr_cache_ids;
-extern void memcg_get_cache_ids(void);
-extern void memcg_put_cache_ids(void);
+void memcg_get_cache_ids(void);
+void memcg_put_cache_ids(void);
/*
* Helper macro to loop through all memcg-specific caches. Callers must still
@@ -444,7 +729,10 @@ static inline bool memcg_kmem_enabled(void)
return static_key_false(&memcg_kmem_enabled_key);
}
-bool memcg_kmem_is_active(struct mem_cgroup *memcg);
+static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg)
+{
+ return memcg->kmem_acct_active;
+}
/*
* In general, we'll do everything in our power to not incur in any overhead
@@ -463,7 +751,15 @@ void __memcg_kmem_commit_charge(struct page *page,
struct mem_cgroup *memcg, int order);
void __memcg_kmem_uncharge_pages(struct page *page, int order);
-int memcg_cache_id(struct mem_cgroup *memcg);
+/*
+ * Helper for accessing a memcg's index. It will be used as an index in the
+ * child cache array in kmem_cache, and also to derive its name. This function
+ * will return -1 when this is not a kmem-limited memcg.
+ */
+static inline int memcg_cache_id(struct mem_cgroup *memcg)
+{
+ return memcg ? memcg->kmemcg_id : -1;
+}
struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep);
void __memcg_kmem_put_cache(struct kmem_cache *cachep);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2e872f92dbac..0806a3134b85 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -124,9 +124,12 @@ extern unsigned int kobjsize(const void *objp);
#define VM_MAYSHARE 0x00000080
#define VM_GROWSDOWN 0x00000100 /* general info on the segment */
+#define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */
#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */
#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */
+#define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */
+#define VM_LOCKONFAULT 0x00080000 /* Lock the pages covered when they are faulted in; own bit, must not alias VM_UFFD_WP */
#define VM_LOCKED 0x00002000
#define VM_IO 0x00004000 /* Memory mapped I/O or similar */
@@ -245,7 +248,10 @@ struct vm_fault {
struct vm_operations_struct {
void (*open)(struct vm_area_struct * area);
void (*close)(struct vm_area_struct * area);
+ int (*mremap)(struct vm_area_struct * area);
int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
+ int (*pmd_fault)(struct vm_area_struct *, unsigned long address,
+ pmd_t *, unsigned int flags);
void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf);
/* notification that a previously read-only page is about to become
@@ -304,18 +310,6 @@ struct inode;
#define page_private(page) ((page)->private)
#define set_page_private(page, v) ((page)->private = (v))
-/* It's valid only if the page is free path or free_list */
-static inline void set_freepage_migratetype(struct page *page, int migratetype)
-{
- page->index = migratetype;
-}
-
-/* It's valid only if the page is free path or free_list */
-static inline int get_freepage_migratetype(struct page *page)
-{
- return page->index;
-}
-
/*
* FIXME: take this include out, include page-flags.h in
* files which need it (119 of them)
@@ -356,18 +350,6 @@ static inline int get_page_unless_zero(struct page *page)
return atomic_inc_not_zero(&page->_count);
}
-/*
- * Try to drop a ref unless the page has a refcount of one, return false if
- * that is the case.
- * This is to make sure that the refcount won't become zero after this drop.
- * This can be called when MMU is off so it must not access
- * any of the virtual mappings.
- */
-static inline int put_page_unless_one(struct page *page)
-{
- return atomic_add_unless(&page->_count, -1, 1);
-}
-
extern int page_is_ram(unsigned long pfn);
extern int region_is_ram(resource_size_t phys_addr, unsigned long size);
@@ -437,46 +419,6 @@ static inline void compound_unlock_irqrestore(struct page *page,
#endif
}
-static inline struct page *compound_head_by_tail(struct page *tail)
-{
- struct page *head = tail->first_page;
-
- /*
- * page->first_page may be a dangling pointer to an old
- * compound page, so recheck that it is still a tail
- * page before returning.
- */
- smp_rmb();
- if (likely(PageTail(tail)))
- return head;
- return tail;
-}
-
-/*
- * Since either compound page could be dismantled asynchronously in THP
- * or we access asynchronously arbitrary positioned struct page, there
- * would be tail flag race. To handle this race, we should call
- * smp_rmb() before checking tail flag. compound_head_by_tail() did it.
- */
-static inline struct page *compound_head(struct page *page)
-{
- if (unlikely(PageTail(page)))
- return compound_head_by_tail(page);
- return page;
-}
-
-/*
- * If we access compound page synchronously such as access to
- * allocated page, there is no need to handle tail flag race, so we can
- * check tail flag directly without any synchronization primitive.
- */
-static inline struct page *compound_head_fast(struct page *page)
-{
- if (unlikely(PageTail(page)))
- return page->first_page;
- return page;
-}
-
/*
* The atomic page->_mapcount, starts from -1: so that transitions
* both from it and to it can be tracked, using atomic_inc_and_test
@@ -1229,6 +1171,11 @@ static inline int vma_growsdown(struct vm_area_struct *vma, unsigned long addr)
return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN);
}
+static inline bool vma_is_anonymous(struct vm_area_struct *vma)
+{
+ return !vma->vm_ops;
+}
+
static inline int stack_guard_page_start(struct vm_area_struct *vma,
unsigned long addr)
{
@@ -1805,7 +1752,7 @@ extern int vma_adjust(struct vm_area_struct *vma, unsigned long start,
extern struct vm_area_struct *vma_merge(struct mm_struct *,
struct vm_area_struct *prev, unsigned long addr, unsigned long end,
unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
- struct mempolicy *);
+ struct mempolicy *, struct vm_userfaultfd_ctx);
extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
extern int split_vma(struct mm_struct *,
struct vm_area_struct *, unsigned long addr, int new_below);
@@ -1865,6 +1812,7 @@ static inline void mm_populate(unsigned long addr, unsigned long len)
/* Ignore errors */
(void) __mm_populate(addr, len, 1);
}
+extern int mm_lock_present(unsigned long addr, unsigned long start);
#else
static inline void mm_populate(unsigned long addr, unsigned long len) {}
#endif
@@ -2243,5 +2191,103 @@ void __init setup_nr_node_ids(void);
static inline void setup_nr_node_ids(void) {}
#endif
+#ifdef CONFIG_IDLE_PAGE_TRACKING
+#ifdef CONFIG_64BIT
+static inline bool page_is_young(struct page *page)
+{
+ return PageYoung(page);
+}
+
+static inline void set_page_young(struct page *page)
+{
+ SetPageYoung(page);
+}
+
+static inline bool test_and_clear_page_young(struct page *page)
+{
+ return TestClearPageYoung(page);
+}
+
+static inline bool page_is_idle(struct page *page)
+{
+ return PageIdle(page);
+}
+
+static inline void set_page_idle(struct page *page)
+{
+ SetPageIdle(page);
+}
+
+static inline void clear_page_idle(struct page *page)
+{
+ ClearPageIdle(page);
+}
+#else /* !CONFIG_64BIT */
+/*
+ * If there is not enough space to store Idle and Young bits in page flags, use
+ * page ext flags instead.
+ */
+extern struct page_ext_operations page_idle_ops;
+
+static inline bool page_is_young(struct page *page)
+{
+ return test_bit(PAGE_EXT_YOUNG, &lookup_page_ext(page)->flags);
+}
+
+static inline void set_page_young(struct page *page)
+{
+ set_bit(PAGE_EXT_YOUNG, &lookup_page_ext(page)->flags);
+}
+
+static inline bool test_and_clear_page_young(struct page *page)
+{
+ return test_and_clear_bit(PAGE_EXT_YOUNG,
+ &lookup_page_ext(page)->flags);
+}
+
+static inline bool page_is_idle(struct page *page)
+{
+ return test_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags);
+}
+
+static inline void set_page_idle(struct page *page)
+{
+ set_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags);
+}
+
+static inline void clear_page_idle(struct page *page)
+{
+ clear_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags);
+}
+#endif /* CONFIG_64BIT */
+#else /* !CONFIG_IDLE_PAGE_TRACKING */
+static inline bool page_is_young(struct page *page)
+{
+ return false;
+}
+
+static inline void set_page_young(struct page *page)
+{
+}
+
+static inline bool test_and_clear_page_young(struct page *page)
+{
+ return false;
+}
+
+static inline bool page_is_idle(struct page *page)
+{
+ return false;
+}
+
+static inline void set_page_idle(struct page *page)
+{
+}
+
+static inline void clear_page_idle(struct page *page)
+{
+}
+#endif /* CONFIG_IDLE_PAGE_TRACKING */
+
#endif /* __KERNEL__ */
#endif /* _LINUX_MM_H */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0038ac7466fd..4957bd3e03e8 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -265,6 +265,16 @@ struct vm_region {
* this region */
};
+#ifdef CONFIG_USERFAULTFD
+struct vm_userfaultfd_ctx { /* per-VMA userfaultfd state */
+ struct userfaultfd_ctx *ctx;
+};
+#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) { .ctx = NULL })
+#else /* !CONFIG_USERFAULTFD */
+struct vm_userfaultfd_ctx {}; /* empty placeholder, embedded in the VMA */
+#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) {})
+#endif /* CONFIG_USERFAULTFD */
+
/*
* This struct defines a memory VMM memory area. There is one of these
* per VM-area/task. A VM area is any part of the process virtual memory
@@ -331,6 +341,7 @@ struct vm_area_struct {
#ifdef CONFIG_NUMA
struct mempolicy *vm_policy; /* NUMA policy for the VMA */
#endif
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
};
struct core_thread {
@@ -552,6 +563,7 @@ enum tlb_flush_reason {
TLB_REMOTE_SHOOTDOWN,
TLB_LOCAL_SHOOTDOWN,
TLB_LOCAL_MM_SHOOTDOWN,
+ TLB_REMOTE_SEND_IPI,
NR_TLB_FLUSH_REASONS,
};
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 16373c8f5f57..437264bda878 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -86,7 +86,8 @@ calc_vm_flag_bits(unsigned long flags)
{
return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) |
_calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) |
- _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED );
+ _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) |
+ _calc_vm_trans(flags, MAP_LOCKONFAULT,VM_LOCKONFAULT);
}
unsigned long vm_commit_limit(void);
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 61cd67f4d788..a5b17137c683 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -66,6 +66,16 @@ struct mmu_notifier_ops {
unsigned long end);
/*
+ * clear_young is a lightweight version of clear_flush_young. Like the
+ * latter, it is supposed to test-and-clear the young/accessed bitflag
+ * in the secondary pte, but it may omit flushing the secondary tlb.
+ */
+ int (*clear_young)(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end);
+
+ /*
* test_young is called to check the young/accessed bitflag in
* the secondary pte. This is used to know if the page is
* frequently used without actually clearing the flag or tearing
@@ -203,6 +213,9 @@ extern void __mmu_notifier_release(struct mm_struct *mm);
extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
unsigned long start,
unsigned long end);
+extern int __mmu_notifier_clear_young(struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end);
extern int __mmu_notifier_test_young(struct mm_struct *mm,
unsigned long address);
extern void __mmu_notifier_change_pte(struct mm_struct *mm,
@@ -231,6 +244,15 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
return 0;
}
+static inline int mmu_notifier_clear_young(struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ /* Nothing to clear unless @mm has mmu notifiers registered. */
+ return mm_has_notifiers(mm) ?
+ __mmu_notifier_clear_young(mm, start, end) : 0;
+}
+
static inline int mmu_notifier_test_young(struct mm_struct *mm,
unsigned long address)
{
@@ -311,6 +333,28 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
__young; \
})
+#define ptep_clear_young_notify(__vma, __address, __ptep) \
+({ \
+ struct vm_area_struct *___vma = __vma; \
+ unsigned long ___address = __address; \
+ /* clear young in this pte, then in secondary ptes via the notifier */ \
+ int __young = ptep_test_and_clear_young(___vma, ___address, __ptep);\
+ __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \
+ ___address + PAGE_SIZE); \
+ __young; \
+})
+
+#define pmdp_clear_young_notify(__vma, __address, __pmdp) \
+({ \
+ struct vm_area_struct *___vma = __vma; \
+ unsigned long ___address = __address; \
+ /* clear young in this pmd, then notify for the PMD_SIZE range */ \
+ int __young = pmdp_test_and_clear_young(___vma, ___address, __pmdp);\
+ __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \
+ ___address + PMD_SIZE); \
+ __young; \
+})
+
#define ptep_clear_flush_notify(__vma, __address, __ptep) \
({ \
unsigned long ___addr = __address & PAGE_MASK; \
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 754c25966a0a..ac00e2050943 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -690,14 +690,6 @@ struct zonelist {
#endif
};
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-struct node_active_region {
- unsigned long start_pfn;
- unsigned long end_pfn;
- int nid;
-};
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-
#ifndef CONFIG_DISCONTIGMEM
/* The array of struct pages - for discontigmem use pgdat->lmem_map */
extern struct page *mem_map;
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 7deecb7bca5e..03e6257321f0 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -13,6 +13,27 @@ struct mem_cgroup;
struct task_struct;
/*
+ * Details of the page allocation that triggered the oom killer that are used to
+ * determine what should be killed.
+ */
+struct oom_control {
+ /* Used to determine cpuset */
+ struct zonelist *zonelist;
+
+ /* Used to determine mempolicy */
+ nodemask_t *nodemask;
+
+ /* Used to determine cpuset and node locality requirement */
+ const gfp_t gfp_mask;
+
+ /*
+ * order == -1 means the oom kill is required by sysrq, otherwise only
+ * for display purposes.
+ */
+ const int order;
+};
+
+/*
* Types of limitations to the nodes from which allocations may occur
*/
enum oom_constraint {
@@ -57,21 +78,18 @@ extern unsigned long oom_badness(struct task_struct *p,
extern int oom_kills_count(void);
extern void note_oom_kill(void);
-extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
+extern void oom_kill_process(struct oom_control *oc, struct task_struct *p,
unsigned int points, unsigned long totalpages,
- struct mem_cgroup *memcg, nodemask_t *nodemask,
- const char *message);
+ struct mem_cgroup *memcg, const char *message);
-extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
- int order, const nodemask_t *nodemask,
+extern void check_panic_on_oom(struct oom_control *oc,
+ enum oom_constraint constraint,
struct mem_cgroup *memcg);
-extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
- unsigned long totalpages, const nodemask_t *nodemask,
- bool force_kill);
+extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
+ struct task_struct *task, unsigned long totalpages);
-extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
- int order, nodemask_t *mask, bool force_kill);
+extern bool out_of_memory(struct oom_control *oc);
extern void exit_oom_victim(void);
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index f34e040b34e9..f7bb2d41ad93 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -109,6 +109,10 @@ enum pageflags {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
PG_compound_lock,
#endif
+#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
+ PG_young,
+ PG_idle,
+#endif
__NR_PAGEFLAGS,
/* Filesystems */
@@ -134,49 +138,68 @@ enum pageflags {
#ifndef __GENERATING_BOUNDS_H
+/* Compound-page policies: map (page, enforce) to the page whose flags are used */
+#define PF_ANY(page, enforce) (page)
+#define PF_HEAD(page, enforce) compound_head(page)
+#define PF_NO_TAIL(page, enforce) ({ \
+ struct page *__pf_page = (page); /* evaluate once, never write back */ \
+ if (enforce) \
+ VM_BUG_ON_PAGE(PageTail(__pf_page), __pf_page); \
+ (enforce) ? __pf_page : compound_head(__pf_page);})
+#define PF_NO_COMPOUND(page, enforce) ({ \
+ struct page *__pf_page = (page); /* evaluate once */ \
+ if (enforce) \
+ VM_BUG_ON_PAGE(PageCompound(__pf_page), __pf_page); \
+ __pf_page;})
+
/*
* Macros to create function definitions for page flags
*/
-#define TESTPAGEFLAG(uname, lname) \
-static inline int Page##uname(const struct page *page) \
- { return test_bit(PG_##lname, &page->flags); }
+#define TESTPAGEFLAG(uname, lname, policy) \
+static inline int Page##uname(struct page *page) \
+ { return test_bit(PG_##lname, &policy(page, 0)->flags); }
-#define SETPAGEFLAG(uname, lname) \
+#define SETPAGEFLAG(uname, lname, policy) \
static inline void SetPage##uname(struct page *page) \
- { set_bit(PG_##lname, &page->flags); }
+ { set_bit(PG_##lname, &policy(page, 1)->flags); }
-#define CLEARPAGEFLAG(uname, lname) \
+#define CLEARPAGEFLAG(uname, lname, policy) \
static inline void ClearPage##uname(struct page *page) \
- { clear_bit(PG_##lname, &page->flags); }
+ { clear_bit(PG_##lname, &policy(page, 1)->flags); }
-#define __SETPAGEFLAG(uname, lname) \
+#define __SETPAGEFLAG(uname, lname, policy) \
static inline void __SetPage##uname(struct page *page) \
- { __set_bit(PG_##lname, &page->flags); }
+ { __set_bit(PG_##lname, &policy(page, 1)->flags); }
-#define __CLEARPAGEFLAG(uname, lname) \
+#define __CLEARPAGEFLAG(uname, lname, policy) \
static inline void __ClearPage##uname(struct page *page) \
- { __clear_bit(PG_##lname, &page->flags); }
+ { __clear_bit(PG_##lname, &policy(page, 1)->flags); }
-#define TESTSETFLAG(uname, lname) \
+#define TESTSETFLAG(uname, lname, policy) \
static inline int TestSetPage##uname(struct page *page) \
- { return test_and_set_bit(PG_##lname, &page->flags); }
+ { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); }
-#define TESTCLEARFLAG(uname, lname) \
+#define TESTCLEARFLAG(uname, lname, policy) \
static inline int TestClearPage##uname(struct page *page) \
- { return test_and_clear_bit(PG_##lname, &page->flags); }
+ { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
-#define __TESTCLEARFLAG(uname, lname) \
+#define __TESTCLEARFLAG(uname, lname, policy) \
static inline int __TestClearPage##uname(struct page *page) \
- { return __test_and_clear_bit(PG_##lname, &page->flags); }
+ { return __test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
-#define PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \
- SETPAGEFLAG(uname, lname) CLEARPAGEFLAG(uname, lname)
+#define PAGEFLAG(uname, lname, policy) \
+ TESTPAGEFLAG(uname, lname, policy) \
+ SETPAGEFLAG(uname, lname, policy) \
+ CLEARPAGEFLAG(uname, lname, policy)
-#define __PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \
- __SETPAGEFLAG(uname, lname) __CLEARPAGEFLAG(uname, lname)
+#define __PAGEFLAG(uname, lname, policy) \
+ TESTPAGEFLAG(uname, lname, policy) \
+ __SETPAGEFLAG(uname, lname, policy) \
+ __CLEARPAGEFLAG(uname, lname, policy)
-#define TESTSCFLAG(uname, lname) \
- TESTSETFLAG(uname, lname) TESTCLEARFLAG(uname, lname)
+#define TESTSCFLAG(uname, lname, policy) \
+ TESTSETFLAG(uname, lname, policy) \
+ TESTCLEARFLAG(uname, lname, policy)
#define TESTPAGEFLAG_FALSE(uname) \
static inline int Page##uname(const struct page *page) { return 0; }
@@ -205,47 +228,100 @@ static inline int __TestClearPage##uname(struct page *page) { return 0; }
#define TESTSCFLAG_FALSE(uname) \
TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname)
-struct page; /* forward declaration */
-
-TESTPAGEFLAG(Locked, locked)
-PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error)
-PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced)
- __SETPAGEFLAG(Referenced, referenced)
-PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty)
-PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru)
-PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active)
- TESTCLEARFLAG(Active, active)
-__PAGEFLAG(Slab, slab)
-PAGEFLAG(Checked, checked) /* Used by some filesystems */
-PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */
-PAGEFLAG(SavePinned, savepinned); /* Xen */
-PAGEFLAG(Foreign, foreign); /* Xen */
-PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
-PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked)
- __SETPAGEFLAG(SwapBacked, swapbacked)
-
-__PAGEFLAG(SlobFree, slob_free)
+/* Forward declarations */
+struct page;
+static inline int PageCompound(struct page *page);
+static inline int PageTail(struct page *page);
+
+/* Map @tail to its head page, or to @tail itself if it is no longer a tail. */
+static inline struct page *compound_head_by_tail(struct page *tail)
+{
+ struct page *first = tail->first_page;
+
+ /*
+ * The first_page value read above may be stale if the compound page
+ * was torn down meanwhile: order that read before re-testing the
+ * tail flag, and trust it only if @tail is still a tail page.
+ */
+ smp_rmb();
+
+ return likely(PageTail(tail)) ? first : tail;
+}
+
+/*
+ * A compound page may be dismantled asynchronously (THP), and callers may
+ * look at arbitrary struct pages, so the tail flag can change under us.
+ * compound_head_by_tail() copes with that race by issuing smp_rmb() and
+ * re-checking the tail flag before trusting first_page.
+ */
+static inline struct page *compound_head(struct page *page)
+{
+ if (likely(!PageTail(page)))
+ return page;
+ return compound_head_by_tail(page);
+}
+
+/*
+ * For callers that access the compound page synchronously (e.g. a page
+ * they have allocated), the tail flag cannot race, so it is tested
+ * directly and first_page is used without any memory barrier.
+ */
+static inline struct page *compound_head_fast(struct page *page)
+{
+ if (likely(!PageTail(page)))
+ return page;
+ return page->first_page;
+}
+
+__PAGEFLAG(Locked, locked, PF_NO_TAIL)
+PAGEFLAG(Error, error, PF_NO_COMPOUND) TESTCLEARFLAG(Error, error, PF_NO_COMPOUND)
+PAGEFLAG(Referenced, referenced, PF_HEAD)
+ TESTCLEARFLAG(Referenced, referenced, PF_HEAD)
+ __SETPAGEFLAG(Referenced, referenced, PF_HEAD)
+PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)
+ __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD)
+PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)
+PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
+ TESTCLEARFLAG(Active, active, PF_HEAD)
+__PAGEFLAG(Slab, slab, PF_NO_TAIL)
+__PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL)
+PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */
+
+/* Xen */
+PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND) TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND)
+PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND)
+PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND)
+
+PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
+ __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
+PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
+ __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
+ __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
/*
* Private page markings that may be used by the filesystem that owns the page
* for its own purposes.
* - PG_private and PG_private_2 cause releasepage() and co to be invoked
*/
-PAGEFLAG(Private, private) __SETPAGEFLAG(Private, private)
- __CLEARPAGEFLAG(Private, private)
-PAGEFLAG(Private2, private_2) TESTSCFLAG(Private2, private_2)
-PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1)
+PAGEFLAG(Private, private, PF_ANY) __SETPAGEFLAG(Private, private, PF_ANY)
+ __CLEARPAGEFLAG(Private, private, PF_ANY)
+PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY)
+PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
+ TESTCLEARFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
/*
* Only test-and-set exist for PG_writeback. The unconditional operators are
* risky: they bypass page accounting.
*/
-TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback)
-PAGEFLAG(MappedToDisk, mappedtodisk)
+TESTPAGEFLAG(Writeback, writeback, PF_NO_COMPOUND)
+ TESTSCFLAG(Writeback, writeback, PF_NO_COMPOUND)
+PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_COMPOUND)
/* PG_readahead is only used for reads; PG_reclaim is only for writes */
-PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim)
-PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim)
+PAGEFLAG(Reclaim, reclaim, PF_NO_COMPOUND)
+ TESTCLEARFLAG(Reclaim, reclaim, PF_NO_COMPOUND)
+PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND)
+ TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND)
#ifdef CONFIG_HIGHMEM
/*
@@ -258,37 +334,46 @@ PAGEFLAG_FALSE(HighMem)
#endif
#ifdef CONFIG_SWAP
-PAGEFLAG(SwapCache, swapcache)
+PAGEFLAG(SwapCache, swapcache, PF_NO_COMPOUND)
#else
PAGEFLAG_FALSE(SwapCache)
#endif
-PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable)
- TESTCLEARFLAG(Unevictable, unevictable)
+PAGEFLAG(Unevictable, unevictable, PF_HEAD)
+ __CLEARPAGEFLAG(Unevictable, unevictable, PF_HEAD)
+ TESTCLEARFLAG(Unevictable, unevictable, PF_HEAD)
#ifdef CONFIG_MMU
-PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked)
- TESTSCFLAG(Mlocked, mlocked) __TESTCLEARFLAG(Mlocked, mlocked)
+PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) __CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
+ TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL)
+ __TESTCLEARFLAG(Mlocked, mlocked, PF_NO_TAIL)
#else
PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked)
TESTSCFLAG_FALSE(Mlocked) __TESTCLEARFLAG_FALSE(Mlocked)
#endif
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
-PAGEFLAG(Uncached, uncached)
+PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND)
#else
PAGEFLAG_FALSE(Uncached)
#endif
#ifdef CONFIG_MEMORY_FAILURE
-PAGEFLAG(HWPoison, hwpoison)
-TESTSCFLAG(HWPoison, hwpoison)
+PAGEFLAG(HWPoison, hwpoison, PF_ANY)
+TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
#define __PG_HWPOISON (1UL << PG_hwpoison)
#else
PAGEFLAG_FALSE(HWPoison)
#define __PG_HWPOISON 0
#endif
+#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
+TESTPAGEFLAG(Young, young, PF_ANY)
+SETPAGEFLAG(Young, young, PF_ANY)
+TESTCLEARFLAG(Young, young, PF_ANY)
+PAGEFLAG(Idle, idle, PF_ANY)
+#endif
+
/*
* On an anonymous page mapped into a user virtual memory area,
* page->mapping points to its anon_vma, not to a struct address_space;
@@ -311,6 +396,7 @@ PAGEFLAG_FALSE(HWPoison)
static inline int PageAnon(struct page *page)
{
+ page = compound_head(page);
return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
}
@@ -323,6 +409,7 @@ static inline int PageAnon(struct page *page)
*/
static inline int PageKsm(struct page *page)
{
+ page = compound_head(page);
return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
}
@@ -334,8 +421,9 @@ u64 stable_page_flags(struct page *page);
static inline int PageUptodate(struct page *page)
{
- int ret = test_bit(PG_uptodate, &(page)->flags);
-
+ int ret;
+ page = compound_head(page);
+ ret = test_bit(PG_uptodate, &(page)->flags);
/*
* Must ensure that the data we read out of the page is loaded
* _after_ we've loaded page->flags to check for PageUptodate.
@@ -352,22 +440,24 @@ static inline int PageUptodate(struct page *page)
static inline void __SetPageUptodate(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
smp_wmb();
- __set_bit(PG_uptodate, &(page)->flags);
+ __set_bit(PG_uptodate, &page->flags);
}
static inline void SetPageUptodate(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
/*
* Memory barrier must be issued before setting the PG_uptodate bit,
* so that all previous stores issued in order to bring the page
* uptodate are actually visible before PageUptodate becomes true.
*/
smp_wmb();
- set_bit(PG_uptodate, &(page)->flags);
+ set_bit(PG_uptodate, &page->flags);
}
-CLEARPAGEFLAG(Uptodate, uptodate)
+CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL)
int test_clear_page_writeback(struct page *page);
int __test_set_page_writeback(struct page *page, bool keep_write);
@@ -396,8 +486,8 @@ static inline void set_page_writeback_keepwrite(struct page *page)
* and arch/powerpc/kvm/book3s_64_vio_hv.c which use it to detect huge pages
* and avoid handling those in real mode.
*/
-__PAGEFLAG(Head, head) CLEARPAGEFLAG(Head, head)
-__PAGEFLAG(Tail, tail)
+__PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY)
+__PAGEFLAG(Tail, tail, PF_ANY)
static inline int PageCompound(struct page *page)
{
@@ -421,8 +511,8 @@ static inline void ClearPageCompound(struct page *page)
* because PageCompound is always set for compound pages and not for
* pages on the LRU and/or pagecache.
*/
-TESTPAGEFLAG(Compound, compound)
-__SETPAGEFLAG(Head, compound) __CLEARPAGEFLAG(Head, compound)
+TESTPAGEFLAG(Compound, compound, PF_ANY)
+__SETPAGEFLAG(Head, compound, PF_ANY) __CLEARPAGEFLAG(Head, compound, PF_ANY)
/*
* PG_reclaim is used in combination with PG_compound to mark the
@@ -518,21 +608,9 @@ static inline int PageTransTail(struct page *page)
}
#else
-
-static inline int PageTransHuge(struct page *page)
-{
- return 0;
-}
-
-static inline int PageTransCompound(struct page *page)
-{
- return 0;
-}
-
-static inline int PageTransTail(struct page *page)
-{
- return 0;
-}
+TESTPAGEFLAG_FALSE(TransHuge)
+TESTPAGEFLAG_FALSE(TransCompound)
+TESTPAGEFLAG_FALSE(TransTail)
#endif
/*
@@ -631,15 +709,19 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
1 << PG_private | 1 << PG_private_2 | \
1 << PG_writeback | 1 << PG_reserved | \
1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \
- 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON | \
+ 1 << PG_unevictable | __PG_MLOCKED | \
__PG_COMPOUND_LOCK)
/*
* Flags checked when a page is prepped for return by the page allocator.
- * Pages being prepped should not have any flags set. It they are set,
+ * Pages being prepped should not have these flags set. If they are set,
* there has been a kernel bug or struct page corruption.
+ *
+ * __PG_HWPOISON is exceptional because it needs to be kept beyond the page's
+ * alloc-free cycle to prevent the page from being reused.
*/
-#define PAGE_FLAGS_CHECK_AT_PREP ((1 << NR_PAGEFLAGS) - 1)
+#define PAGE_FLAGS_CHECK_AT_PREP \
+ (((1 << NR_PAGEFLAGS) - 1) & ~__PG_HWPOISON)
#define PAGE_FLAGS_PRIVATE \
(1 << PG_private | 1 << PG_private_2)
@@ -655,6 +737,10 @@ static inline int page_has_private(struct page *page)
return !!(page->flags & PAGE_FLAGS_PRIVATE);
}
+#undef PF_ANY
+#undef PF_HEAD
+#undef PF_NO_TAIL
+#undef PF_NO_COMPOUND
#endif /* !__GENERATING_BOUNDS_H */
#endif /* PAGE_FLAGS_H */
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index c42981cd99aa..17f118a82854 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -26,6 +26,10 @@ enum page_ext_flags {
PAGE_EXT_DEBUG_POISON, /* Page is poisoned */
PAGE_EXT_DEBUG_GUARD,
PAGE_EXT_OWNER,
+#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
+ PAGE_EXT_YOUNG,
+ PAGE_EXT_IDLE,
+#endif
};
/*
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index a6c78e00ea96..3e95fb6a77af 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -426,18 +426,9 @@ extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
unsigned int flags);
extern void unlock_page(struct page *page);
-static inline void __set_page_locked(struct page *page)
-{
- __set_bit(PG_locked, &page->flags);
-}
-
-static inline void __clear_page_locked(struct page *page)
-{
- __clear_bit(PG_locked, &page->flags);
-}
-
static inline int trylock_page(struct page *page)
{
+ page = compound_head(page);
return (likely(!test_and_set_bit_lock(PG_locked, &page->flags)));
}
@@ -490,9 +481,9 @@ extern int wait_on_page_bit_killable_timeout(struct page *page,
static inline int wait_on_page_locked_killable(struct page *page)
{
- if (PageLocked(page))
- return wait_on_page_bit_killable(page, PG_locked);
- return 0;
+ if (!PageLocked(page))
+ return 0;
+ return wait_on_page_bit_killable(compound_head(page), PG_locked);
}
extern wait_queue_head_t *page_waitqueue(struct page *page);
@@ -511,7 +502,7 @@ static inline void wake_up_page(struct page *page, int bit)
static inline void wait_on_page_locked(struct page *page)
{
if (PageLocked(page))
- wait_on_page_bit(page, PG_locked);
+ wait_on_page_bit(compound_head(page), PG_locked);
}
/*
@@ -657,17 +648,17 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
/*
* Like add_to_page_cache_locked, but used to add newly allocated pages:
- * the page is new, so we can just run __set_page_locked() against it.
+ * the page is new, so we can just run __SetPageLocked() against it.
*/
static inline int add_to_page_cache(struct page *page,
struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
{
int error;
- __set_page_locked(page);
+ __SetPageLocked(page);
error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
if (unlikely(error))
- __clear_page_locked(page);
+ __ClearPageLocked(page);
return error;
}
diff --git a/include/linux/parse-integer.h b/include/linux/parse-integer.h
new file mode 100644
index 000000000000..ba620cdf3df6
--- /dev/null
+++ b/include/linux/parse-integer.h
@@ -0,0 +1,188 @@
+#ifndef _PARSE_INTEGER_H
+#define _PARSE_INTEGER_H
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+/*
+ * int parse_integer(const char *s, unsigned int base, T *val);
+ *
+ * Convert the string representation of an integer to an integer of type T.
+ * The range of accepted values equals that of type T.
+ *
+ * Conversion to an unsigned type accepts a leading "+" sign.
+ * Conversion to a signed type accepts a leading "+" or "-" sign.
+ *
+ * Radix 0 means autodetection: leading "0x" implies radix 16,
+ * leading "0" implies radix 8, otherwise radix is 10.
+ * The radix prefix is recognized after an optional sign, but not before it.
+ *
+ * Returns the number of characters parsed, or a negative error code (-E).
+ *
+ * The "T=char" case is not supported because -f{un,}signed-char can silently
+ * change the range of accepted values.
+ */
+#define parse_integer(s, base, val) \
+({ \
+ const char *_s = (s); \
+ unsigned int _base = (base); \
+ typeof(&(val)[0]) _val = (val); \
+ \
+ __builtin_choose_expr( \
+ __builtin_types_compatible_p(typeof(_val), signed char *), \
+ _parse_integer_sc(_s, _base, (void *)_val), \
+ __builtin_choose_expr( \
+ __builtin_types_compatible_p(typeof(_val), unsigned char *), \
+ _parse_integer_uc(_s, _base, (void *)_val), \
+ __builtin_choose_expr( \
+ __builtin_types_compatible_p(typeof(_val), short *), \
+ _parse_integer_s(_s, _base, (void *)_val), \
+ __builtin_choose_expr( \
+ __builtin_types_compatible_p(typeof(_val), unsigned short *), \
+ _parse_integer_us(_s, _base, (void *)_val), \
+ __builtin_choose_expr( \
+ __builtin_types_compatible_p(typeof(_val), int *), \
+ _parse_integer_i(_s, _base, (void *)_val), \
+ __builtin_choose_expr( \
+ __builtin_types_compatible_p(typeof(_val), unsigned int *), \
+ _parse_integer_u(_s, _base, (void *)_val), \
+ __builtin_choose_expr( \
+ __builtin_types_compatible_p(typeof(_val), long *) && sizeof(long) == 4,\
+ _parse_integer_i(_s, _base, (void *)_val), \
+ __builtin_choose_expr( \
+ __builtin_types_compatible_p(typeof(_val), long *) && sizeof(long) == 8,\
+ _parse_integer_ll(_s, _base, (void *)_val), \
+ __builtin_choose_expr( \
+ __builtin_types_compatible_p(typeof(_val), unsigned long *) && sizeof(unsigned long) == 4,\
+ _parse_integer_u(_s, _base, (void *)_val), \
+ __builtin_choose_expr( \
+ __builtin_types_compatible_p(typeof(_val), unsigned long *) && sizeof(unsigned long) == 8,\
+ _parse_integer_ull(_s, _base, (void *)_val), \
+ __builtin_choose_expr( \
+ __builtin_types_compatible_p(typeof(_val), long long *), \
+ _parse_integer_ll(_s, _base, (void *)_val), \
+ __builtin_choose_expr( \
+ __builtin_types_compatible_p(typeof(_val), unsigned long long *),\
+ _parse_integer_ull(_s, _base, (void *)_val), \
+ _parse_integer_link_time_error())))))))))))); \
+})
+/* internal, do not use */
+int _parse_integer_sc(const char *s, unsigned int base, signed char *val);
+int _parse_integer_uc(const char *s, unsigned int base, unsigned char *val);
+int _parse_integer_s(const char *s, unsigned int base, short *val);
+int _parse_integer_us(const char *s, unsigned int base, unsigned short *val);
+int _parse_integer_i(const char *s, unsigned int base, int *val);
+int _parse_integer_u(const char *s, unsigned int base, unsigned int *val);
+int _parse_integer_ll(const char *s, unsigned int base, long long *val);
+int _parse_integer_ull(const char *s, unsigned int base, unsigned long long *val);
+void _parse_integer_link_time_error(void);
+const char *_parse_integer_fixup_radix(const char *s, unsigned int *base);
+#define PARSE_INTEGER_NEWLINE 0x80000000u
+
+/*
+ * Convert an integer string terminated by "\n\0" or "\0" to an integer.
+ *
+ * Returns 0 on success or a negative error code (-E).
+ *
+ * See parse_integer().
+ */
+static inline int __must_check kstrtoull(const char *s, unsigned int base, unsigned long long *res)
+{
+ return parse_integer(s, base | PARSE_INTEGER_NEWLINE, res);
+}
+
+static inline int __must_check kstrtoll(const char *s, unsigned int base, long long *res)
+{
+ return parse_integer(s, base | PARSE_INTEGER_NEWLINE, res);
+}
+
+static inline int __must_check kstrtoul(const char *s, unsigned int base, unsigned long *res)
+{
+ return parse_integer(s, base | PARSE_INTEGER_NEWLINE, res);
+}
+
+static inline int __must_check kstrtol(const char *s, unsigned int base, long *res)
+{
+ return parse_integer(s, base | PARSE_INTEGER_NEWLINE, res);
+}
+
+static inline int __must_check kstrtouint(const char *s, unsigned int base, unsigned int *res)
+{
+ return parse_integer(s, base | PARSE_INTEGER_NEWLINE, res);
+}
+
+static inline int __must_check kstrtoint(const char *s, unsigned int base, int *res)
+{
+ return parse_integer(s, base | PARSE_INTEGER_NEWLINE, res);
+}
+
+static inline int __must_check kstrtou64(const char *s, unsigned int base, u64 *res)
+{
+ return kstrtoull(s, base, res);
+}
+
+static inline int __must_check kstrtos64(const char *s, unsigned int base, s64 *res)
+{
+ return kstrtoll(s, base, res);
+}
+
+static inline int __must_check kstrtou32(const char *s, unsigned int base, u32 *res)
+{
+ return kstrtouint(s, base, res);
+}
+
+static inline int __must_check kstrtos32(const char *s, unsigned int base, s32 *res)
+{
+ return kstrtoint(s, base, res);
+}
+
+static inline int __must_check kstrtou16(const char *s, unsigned int base, u16 *res)
+{
+ return parse_integer(s, base | PARSE_INTEGER_NEWLINE, res);
+}
+
+static inline int __must_check kstrtos16(const char *s, unsigned int base, s16 *res)
+{
+ return parse_integer(s, base | PARSE_INTEGER_NEWLINE, res);
+}
+
+static inline int __must_check kstrtou8(const char *s, unsigned int base, u8 *res)
+{
+ return parse_integer(s, base | PARSE_INTEGER_NEWLINE, res);
+}
+
+static inline int __must_check kstrtos8(const char *s, unsigned int base, s8 *res)
+{
+ return parse_integer(s, base | PARSE_INTEGER_NEWLINE, res);
+}
+
+int __must_check kstrtoull_from_user(const char __user *s, size_t count, unsigned int base, unsigned long long *res);
+int __must_check kstrtoll_from_user(const char __user *s, size_t count, unsigned int base, long long *res);
+int __must_check kstrtoul_from_user(const char __user *s, size_t count, unsigned int base, unsigned long *res);
+int __must_check kstrtol_from_user(const char __user *s, size_t count, unsigned int base, long *res);
+int __must_check kstrtouint_from_user(const char __user *s, size_t count, unsigned int base, unsigned int *res);
+int __must_check kstrtoint_from_user(const char __user *s, size_t count, unsigned int base, int *res);
+int __must_check kstrtou16_from_user(const char __user *s, size_t count, unsigned int base, u16 *res);
+int __must_check kstrtos16_from_user(const char __user *s, size_t count, unsigned int base, s16 *res);
+int __must_check kstrtou8_from_user(const char __user *s, size_t count, unsigned int base, u8 *res);
+int __must_check kstrtos8_from_user(const char __user *s, size_t count, unsigned int base, s8 *res);
+
+static inline int __must_check kstrtou64_from_user(const char __user *s, size_t count, unsigned int base, u64 *res)
+{
+ return kstrtoull_from_user(s, count, base, res);
+}
+
+static inline int __must_check kstrtos64_from_user(const char __user *s, size_t count, unsigned int base, s64 *res)
+{
+ return kstrtoll_from_user(s, count, base, res);
+}
+
+static inline int __must_check kstrtou32_from_user(const char __user *s, size_t count, unsigned int base, u32 *res)
+{
+ return kstrtouint_from_user(s, count, base, res);
+}
+
+static inline int __must_check kstrtos32_from_user(const char __user *s, size_t count, unsigned int base, s32 *res)
+{
+ return kstrtoint_from_user(s, count, base, res);
+}
+#endif
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 2110a81c5e2a..7b2a7fcde6a3 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -32,6 +32,10 @@
/********** mm/debug-pagealloc.c **********/
#define PAGE_POISON 0xaa
+/********** mm/page_alloc.c ************/
+
+#define TAIL_MAPPING ((void *) 0x01014A11 + POISON_POINTER_DELTA)
+
/********** mm/slab.c **********/
/*
* Magic nums for obj red zoning.
diff --git a/include/linux/printk.h b/include/linux/printk.h
index a6298b27ac99..9729565c25ff 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -404,10 +404,10 @@ do { \
static DEFINE_RATELIMIT_STATE(_rs, \
DEFAULT_RATELIMIT_INTERVAL, \
DEFAULT_RATELIMIT_BURST); \
- DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
+ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, pr_fmt(fmt)); \
if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT) && \
__ratelimit(&_rs)) \
- __dynamic_pr_debug(&descriptor, fmt, ##__VA_ARGS__); \
+ __dynamic_pr_debug(&descriptor, pr_fmt(fmt), ##__VA_ARGS__); \
} while (0)
#elif defined(DEBUG)
#define pr_debug_ratelimited(fmt, ...) \
@@ -456,11 +456,17 @@ static inline void print_hex_dump_bytes(const char *prefix_str, int prefix_type,
groupsize, buf, len, ascii) \
dynamic_hex_dump(prefix_str, prefix_type, rowsize, \
groupsize, buf, len, ascii)
-#else
+#elif defined(DEBUG)
#define print_hex_dump_debug(prefix_str, prefix_type, rowsize, \
groupsize, buf, len, ascii) \
print_hex_dump(KERN_DEBUG, prefix_str, prefix_type, rowsize, \
groupsize, buf, len, ascii)
-#endif /* defined(CONFIG_DYNAMIC_DEBUG) */
+#else
+static inline void print_hex_dump_debug(const char *prefix_str, int prefix_type,
+ int rowsize, int groupsize,
+ const void *buf, size_t len, bool ascii)
+{
+}
+#endif
#endif
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index c89c53a113a8..0860336c6c40 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -85,10 +85,14 @@ enum ttu_flags {
TTU_UNMAP = 1, /* unmap mode */
TTU_MIGRATION = 2, /* migration mode */
TTU_MUNLOCK = 4, /* munlock mode */
+ TTU_FREE = 8, /* free mode */
TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */
TTU_IGNORE_ACCESS = (1 << 9), /* don't age */
TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */
+ TTU_BATCH_FLUSH = (1 << 11), /* Batch TLB flushes where possible
+ * and caller guarantees they will
+ * do a final flush if necessary */
};
#ifdef CONFIG_MMU
@@ -183,7 +187,8 @@ static inline void page_dup_rmap(struct page *page)
* Called from mm/vmscan.c to handle paging out
*/
int page_referenced(struct page *, int is_locked,
- struct mem_cgroup *memcg, unsigned long *vm_flags);
+ struct mem_cgroup *memcg, unsigned long *vm_flags,
+ int *is_pte_dirty);
#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
@@ -260,9 +265,12 @@ int rmap_walk(struct page *page, struct rmap_walk_control *rwc);
static inline int page_referenced(struct page *page, int is_locked,
struct mem_cgroup *memcg,
- unsigned long *vm_flags)
+ unsigned long *vm_flags,
+ int *is_pte_dirty)
{
*vm_flags = 0;
+ if (is_pte_dirty)
+ *is_pte_dirty = 0;
return 0;
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 04b5ada460b4..52f981266f2a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1341,6 +1341,25 @@ enum perf_event_task_context {
perf_nr_task_contexts,
};
+/* Track pages that require TLB flushes */
+struct tlbflush_unmap_batch {
+ /*
+ * Each bit set is a CPU that potentially has a TLB entry for one of
+ * the PFNs being flushed. See set_tlb_ubc_flush_pending().
+ */
+ struct cpumask cpumask;
+
+ /* True if any bit in cpumask is set */
+ bool flush_required;
+
+ /*
+ * If true then the PTE was dirty when unmapped. The entry must be
+ * flushed before IO is initiated or a stale TLB entry potentially
+ * allows an update without redirtying the page.
+ */
+ bool writable;
+};
+
struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
void *stack;
@@ -1699,6 +1718,10 @@ struct task_struct {
unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+ struct tlbflush_unmap_batch tlb_ubc;
+#endif
+
struct rcu_head rcu;
/*
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 912a7c482649..ac19c95a03af 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -122,6 +122,10 @@ int seq_write(struct seq_file *seq, const void *data, size_t len);
__printf(2, 3) int seq_printf(struct seq_file *, const char *, ...);
__printf(2, 0) int seq_vprintf(struct seq_file *, const char *, va_list args);
+void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
+ int rowsize, int groupsize, const void *buf, size_t len,
+ bool ascii);
+
int seq_path(struct seq_file *, const struct path *, const char *);
int seq_file_path(struct seq_file *, struct file *, const char *);
int seq_dentry(struct seq_file *, struct dentry *, const char *);
diff --git a/include/linux/slab.h b/include/linux/slab.h
index a99f0e5243e1..7e37d448ed91 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -290,6 +290,16 @@ void *__kmalloc(size_t size, gfp_t flags);
void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags);
void kmem_cache_free(struct kmem_cache *, void *);
+/*
+ * Bulk allocation and freeing operations. These are accelerated in an
+ * allocator specific way to avoid taking locks repeatedly or building
+ * metadata structures unnecessarily.
+ *
+ * Note that interrupts must be enabled when calling these functions.
+ */
+void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
+bool kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
+
#ifdef CONFIG_NUMA
void *__kmalloc_node(size_t size, gfp_t flags, int node);
void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h
index da3c593f9845..e6109a6cd8f6 100644
--- a/include/linux/smpboot.h
+++ b/include/linux/smpboot.h
@@ -48,7 +48,16 @@ struct smp_hotplug_thread {
const char *thread_comm;
};
-int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread);
+int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
+ const struct cpumask *cpumask);
+
+static inline int
+smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
+{
+ return smpboot_register_percpu_thread_cpumask(plug_thread,
+ cpu_possible_mask);
+}
+
void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread);
int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
const struct cpumask *);
diff --git a/include/linux/string.h b/include/linux/string.h
index a8d90db9c4b0..d5dfe3e75572 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -118,6 +118,7 @@ extern void kfree_const(const void *x);
extern char *kstrdup(const char *s, gfp_t gfp);
extern const char *kstrdup_const(const char *s, gfp_t gfp);
extern char *kstrndup(const char *s, size_t len, gfp_t gfp);
+extern char *kstrimdup(const char *s, gfp_t gfp);
extern void *kmemdup(const void *src, size_t len, gfp_t gfp);
extern char **argv_split(gfp_t gfp, const char *str, int *argcp);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 38874729dc5f..9c7c4b418498 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -154,7 +154,7 @@ enum {
SWP_SCANNING = (1 << 10), /* refcount in scan_swap_map */
};
-#define SWAP_CLUSTER_MAX 32UL
+#define SWAP_CLUSTER_MAX 256UL
#define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
/*
@@ -308,6 +308,7 @@ extern void lru_add_drain_cpu(int cpu);
extern void lru_add_drain_all(void);
extern void rotate_reclaimable_page(struct page *page);
extern void deactivate_file_page(struct page *page);
+extern void deactivate_page(struct page *page);
extern void swap_setup(void);
extern void add_page_to_unevictable_list(struct page *page);
@@ -351,7 +352,15 @@ extern void check_move_unevictable_pages(struct page **, int nr_pages);
extern int kswapd_run(int nid);
extern void kswapd_stop(int nid);
#ifdef CONFIG_MEMCG
-extern int mem_cgroup_swappiness(struct mem_cgroup *mem);
+static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
+{
+ /* root ? */
+ if (mem_cgroup_disabled() || !memcg->css.parent)
+ return vm_swappiness;
+
+ return memcg->swappiness;
+}
+
#else
static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
{
@@ -398,6 +407,9 @@ extern void free_pages_and_swap_cache(struct page **, int);
extern struct page *lookup_swap_cache(swp_entry_t);
extern struct page *read_swap_cache_async(swp_entry_t, gfp_t,
struct vm_area_struct *vma, unsigned long addr);
+extern struct page *__read_swap_cache_async(swp_entry_t, gfp_t,
+ struct vm_area_struct *vma, unsigned long addr,
+ bool *new_page_allocated);
extern struct page *swapin_readahead(swp_entry_t, gfp_t,
struct vm_area_struct *vma, unsigned long addr);
@@ -431,6 +443,7 @@ extern unsigned int count_swap_pages(int, int);
extern sector_t map_swap_page(struct page *, struct block_device **);
extern sector_t swapdev_block(int, pgoff_t);
extern int page_swapcount(struct page *);
+extern int swp_swapcount(swp_entry_t entry);
extern struct swap_info_struct *page_swap_info(struct page *);
extern int reuse_swap_page(struct page *);
extern int try_to_free_swap(struct page *);
@@ -522,6 +535,11 @@ static inline int page_swapcount(struct page *page)
return 0;
}
+static inline int swp_swapcount(swp_entry_t entry)
+{
+ return 0;
+}
+
#define reuse_swap_page(page) (page_mapcount(page) == 1)
static inline int try_to_free_swap(struct page *page)
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index b45c45b8c829..cb2a0ef75b7d 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -810,6 +810,7 @@ asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr);
asmlinkage long sys_eventfd(unsigned int count);
asmlinkage long sys_eventfd2(unsigned int count, int flags);
asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags);
+asmlinkage long sys_userfaultfd(int flags);
asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int);
asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *,
@@ -884,4 +885,8 @@ asmlinkage long sys_execveat(int dfd, const char __user *filename,
const char __user *const __user *argv,
const char __user *const __user *envp, int flags);
+asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags);
+asmlinkage long sys_munlock2(unsigned long start, size_t len, int flags);
+asmlinkage long sys_munlockall2(int flags);
+
#endif
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
new file mode 100644
index 000000000000..587480ad41b7
--- /dev/null
+++ b/include/linux/userfaultfd_k.h
@@ -0,0 +1,85 @@
+/*
+ * include/linux/userfaultfd_k.h
+ *
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ */
+
+#ifndef _LINUX_USERFAULTFD_K_H
+#define _LINUX_USERFAULTFD_K_H
+
+#ifdef CONFIG_USERFAULTFD
+
+#include <linux/userfaultfd.h> /* linux/include/uapi/linux/userfaultfd.h */
+
+#include <linux/fcntl.h>
+
+/*
+ * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining
+ * new flags, since they might collide with O_* ones. We want
+ * to re-use O_* flags that couldn't possibly have a meaning
+ * from userfaultfd, in order to leave a free define-space for
+ * shared O_* flags.
+ */
+#define UFFD_CLOEXEC O_CLOEXEC
+#define UFFD_NONBLOCK O_NONBLOCK
+
+#define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
+#define UFFD_FLAGS_SET (UFFD_SHARED_FCNTL_FLAGS)
+
+extern int handle_userfault(struct vm_area_struct *vma, unsigned long address,
+ unsigned int flags, unsigned long reason);
+
+extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
+ unsigned long src_start, unsigned long len);
+extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
+ unsigned long dst_start,
+ unsigned long len);
+
+/* mm helpers */
+static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
+ struct vm_userfaultfd_ctx vm_ctx)
+{
+ return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx;
+}
+
+static inline bool userfaultfd_missing(struct vm_area_struct *vma)
+{
+ return vma->vm_flags & VM_UFFD_MISSING;
+}
+
+static inline bool userfaultfd_armed(struct vm_area_struct *vma)
+{
+ return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP);
+}
+
+#else /* CONFIG_USERFAULTFD */
+
+/* mm helpers */
+static inline int handle_userfault(struct vm_area_struct *vma,
+ unsigned long address,
+ unsigned int flags,
+ unsigned long reason)
+{
+ return VM_FAULT_SIGBUS;
+}
+
+static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
+ struct vm_userfaultfd_ctx vm_ctx)
+{
+ return true;
+}
+
+static inline bool userfaultfd_missing(struct vm_area_struct *vma)
+{
+ return false;
+}
+
+static inline bool userfaultfd_armed(struct vm_area_struct *vma)
+{
+ return false;
+}
+
+#endif /* CONFIG_USERFAULTFD */
+
+#endif /* _LINUX_USERFAULTFD_K_H */
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 9246d32dc973..2b1cef88b827 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -25,6 +25,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
FOR_ALL_ZONES(PGALLOC),
PGFREE, PGACTIVATE, PGDEACTIVATE,
PGFAULT, PGMAJFAULT,
+ PGLAZYFREED,
FOR_ALL_ZONES(PGREFILL),
FOR_ALL_ZONES(PGSTEAL_KSWAPD),
FOR_ALL_ZONES(PGSTEAL_DIRECT),
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 1e1bf9f963a9..d3d077228d4c 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -147,7 +147,8 @@ __remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old)
typedef int wait_bit_action_f(struct wait_bit_key *);
void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
-void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key);
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr,
+ void *key);
void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr);
void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr);
@@ -179,7 +180,7 @@ wait_queue_head_t *bit_waitqueue(void *, int);
#define wake_up_poll(x, m) \
__wake_up(x, TASK_NORMAL, 1, (void *) (m))
#define wake_up_locked_poll(x, m) \
- __wake_up_locked_key((x), TASK_NORMAL, (void *) (m))
+ __wake_up_locked_key((x), TASK_NORMAL, 1, (void *) (m))
#define wake_up_interruptible_poll(x, m) \
__wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m))
#define wake_up_interruptible_sync_poll(x, m) \
diff --git a/include/linux/zbud.h b/include/linux/zbud.h
index f9d41a6e361f..e183a0a65ac1 100644
--- a/include/linux/zbud.h
+++ b/include/linux/zbud.h
@@ -9,7 +9,7 @@ struct zbud_ops {
int (*evict)(struct zbud_pool *pool, unsigned long handle);
};
-struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops);
+struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops);
void zbud_destroy_pool(struct zbud_pool *pool);
int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp,
unsigned long *handle);
diff --git a/include/linux/zpool.h b/include/linux/zpool.h
index d30eff3d84d5..c924a28d9805 100644
--- a/include/linux/zpool.h
+++ b/include/linux/zpool.h
@@ -37,7 +37,7 @@ enum zpool_mapmode {
};
struct zpool *zpool_create_pool(char *type, char *name,
- gfp_t gfp, struct zpool_ops *ops);
+ gfp_t gfp, const struct zpool_ops *ops);
char *zpool_get_type(struct zpool *pool);
@@ -81,7 +81,7 @@ struct zpool_driver {
atomic_t refcount;
struct list_head list;
- void *(*create)(char *name, gfp_t gfp, struct zpool_ops *ops,
+ void *(*create)(char *name, gfp_t gfp, const struct zpool_ops *ops,
struct zpool *zpool);
void (*destroy)(void *pool);
diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
index 1338190b5478..6398dfae53f1 100644
--- a/include/linux/zsmalloc.h
+++ b/include/linux/zsmalloc.h
@@ -34,6 +34,11 @@ enum zs_mapmode {
*/
};
+struct zs_pool_stats {
+ /* How many pages were migrated (freed) */
+ unsigned long pages_compacted;
+};
+
struct zs_pool;
struct zs_pool *zs_create_pool(char *name, gfp_t flags);
@@ -49,4 +54,5 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle);
unsigned long zs_get_total_pages(struct zs_pool *pool);
unsigned long zs_compact(struct zs_pool *pool);
+void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats);
#endif
diff --git a/include/net/sock.h b/include/net/sock.h
index 4353ef70bf48..f4a654be5c3c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1042,34 +1042,6 @@ struct proto {
#endif
};
-/*
- * Bits in struct cg_proto.flags
- */
-enum cg_proto_flags {
- /* Currently active and new sockets should be assigned to cgroups */
- MEMCG_SOCK_ACTIVE,
- /* It was ever activated; we must disarm static keys on destruction */
- MEMCG_SOCK_ACTIVATED,
-};
-
-struct cg_proto {
- struct page_counter memory_allocated; /* Current allocated memory. */
- struct percpu_counter sockets_allocated; /* Current number of sockets. */
- int memory_pressure;
- long sysctl_mem[3];
- unsigned long flags;
- /*
- * memcg field is used to find which memcg we belong directly
- * Each memcg struct can hold more than one cg_proto, so container_of
- * won't really cut.
- *
- * The elegant solution would be having an inverse function to
- * proto_cgroup in struct proto, but that means polluting the structure
- * for everybody, instead of just for memcg users.
- */
- struct mem_cgroup *memcg;
-};
-
int proto_register(struct proto *prot, int alloc_slab);
void proto_unregister(struct proto *prot);
diff --git a/include/trace/events/tlb.h b/include/trace/events/tlb.h
index 4250f364a6ca..bc8815f45f3b 100644
--- a/include/trace/events/tlb.h
+++ b/include/trace/events/tlb.h
@@ -11,7 +11,8 @@
EM( TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" ) \
EM( TLB_REMOTE_SHOOTDOWN, "remote shootdown" ) \
EM( TLB_LOCAL_SHOOTDOWN, "local shootdown" ) \
- EMe( TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" )
+ EM( TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" ) \
+ EMe( TLB_REMOTE_SEND_IPI, "remote ipi send" )
/*
* First define the enums in TLB_FLUSH_REASON to be exported to userspace
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index ddc3b36f1046..7a94102b7a02 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -34,6 +34,7 @@
#define MADV_SEQUENTIAL 2 /* expect sequential page references */
#define MADV_WILLNEED 3 /* will need these pages */
#define MADV_DONTNEED 4 /* don't need these pages */
+#define MADV_FREE 5 /* free pages only if memory pressure */
/* common parameters: try to keep these consistent across architectures */
#define MADV_REMOVE 9 /* remove these pages & resources */
diff --git a/include/uapi/asm-generic/mman.h b/include/uapi/asm-generic/mman.h
index e9fe6fd2a074..007b7841fb48 100644
--- a/include/uapi/asm-generic/mman.h
+++ b/include/uapi/asm-generic/mman.h
@@ -12,10 +12,15 @@
#define MAP_NONBLOCK 0x10000 /* do not block on IO */
#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */
#define MAP_HUGETLB 0x40000 /* create a huge page mapping */
+#define MAP_LOCKONFAULT 0x80000 /* Lock pages after they are faulted in, do not prefault */
/* Bits [26:31] are reserved, see mman-common.h for MAP_HUGETLB usage */
#define MCL_CURRENT 1 /* lock all current mappings */
#define MCL_FUTURE 2 /* lock all future mappings */
+#define MCL_ONFAULT 4 /* lock all pages that are faulted in */
+
+#define MLOCK_LOCKED 0x01 /* Lock and populate the specified range */
+#define MLOCK_ONFAULT 0x02 /* Lock pages in range after they are faulted in, do not prefault */
#endif /* __ASM_GENERIC_MMAN_H */
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index e016bd9b1a04..e759fa2ea50d 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -709,9 +709,15 @@ __SYSCALL(__NR_memfd_create, sys_memfd_create)
__SYSCALL(__NR_bpf, sys_bpf)
#define __NR_execveat 281
__SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)
+#define __NR_mlock2 282
+__SYSCALL(__NR_mlock2, sys_mlock2)
+#define __NR_munlock2 283
+__SYSCALL(__NR_munlock2, sys_munlock2)
+#define __NR_munlockall2 284
+__SYSCALL(__NR_munlockall2, sys_munlockall2)
#undef __NR_syscalls
-#define __NR_syscalls 282
+#define __NR_syscalls 285
/*
* All syscalls below here should go away really,
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 67a4c60e1deb..d358de12175c 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -456,3 +456,4 @@ header-y += xfrm.h
header-y += xilinx-v4l2-controls.h
header-y += zorro.h
header-y += zorro_ids.h
+header-y += userfaultfd.h
diff --git a/include/uapi/linux/kernel-page-flags.h b/include/uapi/linux/kernel-page-flags.h
index a6c4962e5d46..5da5f8751ce7 100644
--- a/include/uapi/linux/kernel-page-flags.h
+++ b/include/uapi/linux/kernel-page-flags.h
@@ -33,6 +33,7 @@
#define KPF_THP 22
#define KPF_BALLOON 23
#define KPF_ZERO_PAGE 24
+#define KPF_IDLE 25
#endif /* _UAPILINUX_KERNEL_PAGE_FLAGS_H */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 31891d9535e2..a8d0759a9e40 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -190,4 +190,11 @@ struct prctl_mm_map {
# define PR_FP_MODE_FR (1 << 0) /* 64b FP registers */
# define PR_FP_MODE_FRE (1 << 1) /* 32b compatibility */
+/* Control the ambient capability set */
+#define PR_CAP_AMBIENT 47
+# define PR_CAP_AMBIENT_IS_SET 1
+# define PR_CAP_AMBIENT_RAISE 2
+# define PR_CAP_AMBIENT_LOWER 3
+# define PR_CAP_AMBIENT_CLEAR_ALL 4
+
#endif /* _LINUX_PRCTL_H */
diff --git a/include/uapi/linux/securebits.h b/include/uapi/linux/securebits.h
index 985aac9e6bf8..35ac35cef217 100644
--- a/include/uapi/linux/securebits.h
+++ b/include/uapi/linux/securebits.h
@@ -43,9 +43,18 @@
#define SECBIT_KEEP_CAPS (issecure_mask(SECURE_KEEP_CAPS))
#define SECBIT_KEEP_CAPS_LOCKED (issecure_mask(SECURE_KEEP_CAPS_LOCKED))
+/* When set, a process cannot add new capabilities to its ambient set. */
+#define SECURE_NO_CAP_AMBIENT_RAISE 6
+#define SECURE_NO_CAP_AMBIENT_RAISE_LOCKED 7 /* make bit-6 immutable */
+
+#define SECBIT_NO_CAP_AMBIENT_RAISE (issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE))
+#define SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED \
+ (issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE_LOCKED))
+
#define SECURE_ALL_BITS (issecure_mask(SECURE_NOROOT) | \
issecure_mask(SECURE_NO_SETUID_FIXUP) | \
- issecure_mask(SECURE_KEEP_CAPS))
+ issecure_mask(SECURE_KEEP_CAPS) | \
+ issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE))
#define SECURE_ALL_LOCKS (SECURE_ALL_BITS << 1)
#endif /* _UAPI_LINUX_SECUREBITS_H */
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
new file mode 100644
index 000000000000..df0e09bb7dd5
--- /dev/null
+++ b/include/uapi/linux/userfaultfd.h
@@ -0,0 +1,169 @@
+/*
+ * include/uapi/linux/userfaultfd.h
+ *
+ * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ */
+
+#ifndef _LINUX_USERFAULTFD_H
+#define _LINUX_USERFAULTFD_H
+
+#include <linux/types.h>
+
+#include <linux/compiler.h>
+
+#define UFFD_API ((__u64)0xAA)
+/*
+ * After implementing the respective features it will become:
+ * #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \
+ * UFFD_FEATURE_EVENT_FORK)
+ */
+#define UFFD_API_FEATURES (0)
+#define UFFD_API_IOCTLS \
+ ((__u64)1 << _UFFDIO_REGISTER | \
+ (__u64)1 << _UFFDIO_UNREGISTER | \
+ (__u64)1 << _UFFDIO_API)
+#define UFFD_API_RANGE_IOCTLS \
+ ((__u64)1 << _UFFDIO_WAKE | \
+ (__u64)1 << _UFFDIO_COPY | \
+ (__u64)1 << _UFFDIO_ZEROPAGE)
+
+/*
+ * Valid ioctl command number range with this API is from 0x00 to
+ * 0x3F. UFFDIO_API is the fixed number, everything else can be
+ * changed by implementing a different UFFD_API. If sticking to the
+ * same UFFD_API more ioctl can be added and userland will be aware of
+ * which ioctl the running kernel implements through the ioctl command
+ * bitmask written by the UFFDIO_API.
+ */
+#define _UFFDIO_REGISTER (0x00)
+#define _UFFDIO_UNREGISTER (0x01)
+#define _UFFDIO_WAKE (0x02)
+#define _UFFDIO_COPY (0x03)
+#define _UFFDIO_ZEROPAGE (0x04)
+#define _UFFDIO_API (0x3F)
+
+/* userfaultfd ioctl ids */
+#define UFFDIO 0xAA
+#define UFFDIO_API _IOWR(UFFDIO, _UFFDIO_API, \
+ struct uffdio_api)
+#define UFFDIO_REGISTER _IOWR(UFFDIO, _UFFDIO_REGISTER, \
+ struct uffdio_register)
+#define UFFDIO_UNREGISTER _IOR(UFFDIO, _UFFDIO_UNREGISTER, \
+ struct uffdio_range)
+#define UFFDIO_WAKE _IOR(UFFDIO, _UFFDIO_WAKE, \
+ struct uffdio_range)
+#define UFFDIO_COPY _IOWR(UFFDIO, _UFFDIO_COPY, \
+ struct uffdio_copy)
+#define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \
+ struct uffdio_zeropage)
+
+/* read() structure */
+struct uffd_msg {
+ __u8 event;
+
+ __u8 reserved1;
+ __u16 reserved2;
+ __u32 reserved3;
+
+ union {
+ struct {
+ __u64 flags;
+ __u64 address;
+ } pagefault;
+
+ struct {
+ /* unused reserved fields */
+ __u64 reserved1;
+ __u64 reserved2;
+ __u64 reserved3;
+ } reserved;
+ } arg;
+} __packed;
+
+/*
+ * Start at 0x12 and not at 0 to be more strict against bugs.
+ */
+#define UFFD_EVENT_PAGEFAULT 0x12
+#if 0 /* not available yet */
+#define UFFD_EVENT_FORK 0x13
+#endif
+
+/* flags for UFFD_EVENT_PAGEFAULT */
+#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */
+#define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */
+
+struct uffdio_api {
+ /* userland asks for an API number and the features to enable */
+ __u64 api;
+ /*
+ * Kernel answers below with all the available features for
+ * the API, this notifies userland of which events and/or
+ * which flags for each event are enabled in the current
+ * kernel.
+ *
+ * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE
+ * are to be considered implicitly always enabled in all kernels as
+ * long as the uffdio_api.api requested matches UFFD_API.
+ */
+#if 0 /* not available yet */
+#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0)
+#define UFFD_FEATURE_EVENT_FORK (1<<1)
+#endif
+ __u64 features;
+
+ __u64 ioctls;
+};
+
+struct uffdio_range {
+ __u64 start;
+ __u64 len;
+};
+
+struct uffdio_register {
+ struct uffdio_range range;
+#define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0)
+#define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1)
+ __u64 mode;
+
+ /*
+ * kernel answers which ioctl commands are available for the
+ * range, keep at the end as the last 8 bytes aren't read.
+ */
+ __u64 ioctls;
+};
+
+struct uffdio_copy {
+ __u64 dst;
+ __u64 src;
+ __u64 len;
+ /*
+ * There will be a wrprotection flag later that allows to map
+ * pages wrprotected on the fly. And such a flag will be
+ * available if the wrprotection ioctl are implemented for the
+ * range according to the uffdio_register.ioctls.
+ */
+#define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1<<0)
+ __u64 mode;
+
+ /*
+ * "copy" is written by the ioctl and must be at the end: the
+ * copy_from_user will not read the last 8 bytes.
+ */
+ __s64 copy;
+};
+
+struct uffdio_zeropage {
+ struct uffdio_range range;
+#define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1<<0)
+ __u64 mode;
+
+ /*
+ * "zeropage" is written by the ioctl and must be at the end:
+ * the copy_from_user will not read the last 8 bytes.
+ */
+ __s64 zeropage;
+};
+
+#endif /* _LINUX_USERFAULTFD_H */
diff --git a/init/Kconfig b/init/Kconfig
index 20b97eac6c7e..3d5d869a6984 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -896,6 +896,16 @@ config ARCH_SUPPORTS_NUMA_BALANCING
bool
#
+# For architectures that prefer to flush all TLBs after a number of pages
+# are unmapped instead of sending one IPI per page to flush. The architecture
+# must provide guarantees on what happens if a clean TLB cache entry is
+# written after the unmap. Details are in mm/rmap.c near the check for
+# should_defer_flush. The architecture should also consider if the full flush
+# and the refill costs are offset by the savings of sending fewer IPIs.
+config ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+ bool
+
+#
# For architectures that know their GCC __int128 support is sound
#
config ARCH_SUPPORTS_INT128
@@ -1660,6 +1670,17 @@ config ADVISE_SYSCALLS
applications use these syscalls, you can disable this option to save
space.
+config USERFAULTFD
+ bool "Enable userfaultfd() system call"
+ select ANON_INODES
+ default y
+ depends on MMU
+ help
+ Enable the userfaultfd() system call that allows to intercept and
+ handle page faults in userland.
+
+ If unsure, say Y.
+
config PCI_QUIRKS
default y
bool "Enable PCI quirk workarounds" if EXPERT
diff --git a/init/initramfs.c b/init/initramfs.c
index ad1bd7787bbb..b32ad7d97ac9 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -526,14 +526,14 @@ extern unsigned long __initramfs_size;
static void __init free_initrd(void)
{
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
unsigned long crashk_start = (unsigned long)__va(crashk_res.start);
unsigned long crashk_end = (unsigned long)__va(crashk_res.end);
#endif
if (do_retain_initrd)
goto skip;
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
/*
* If the initrd region is overlapped with crashkernel reserved region,
* free only memory that is not part of crashkernel region.
diff --git a/init/main.c b/init/main.c
index c5d5626289ce..56506553d4d8 100644
--- a/init/main.c
+++ b/init/main.c
@@ -656,7 +656,7 @@ asmlinkage __visible void __init start_kernel(void)
key_init();
security_init();
dbg_late_init();
- vfs_caches_init(totalram_pages);
+ vfs_caches_init();
signals_init();
/* rootfs populating might need page-writeback */
page_writeback_init();
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index a24ba9fe5bb8..161a1807e6ef 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -142,7 +142,6 @@ static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info)
if (!leaf)
return -ENOMEM;
INIT_LIST_HEAD(&leaf->msg_list);
- info->qsize += sizeof(*leaf);
}
leaf->priority = msg->m_type;
rb_link_node(&leaf->rb_node, parent, p);
@@ -187,7 +186,6 @@ try_again:
"lazy leaf delete!\n");
rb_erase(&leaf->rb_node, &info->msg_tree);
if (info->node_cache) {
- info->qsize -= sizeof(*leaf);
kfree(leaf);
} else {
info->node_cache = leaf;
@@ -200,7 +198,6 @@ try_again:
if (list_empty(&leaf->msg_list)) {
rb_erase(&leaf->rb_node, &info->msg_tree);
if (info->node_cache) {
- info->qsize -= sizeof(*leaf);
kfree(leaf);
} else {
info->node_cache = leaf;
@@ -1034,7 +1031,6 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
/* Save our speculative allocation into the cache */
INIT_LIST_HEAD(&new_leaf->msg_list);
info->node_cache = new_leaf;
- info->qsize += sizeof(*new_leaf);
new_leaf = NULL;
} else {
kfree(new_leaf);
@@ -1142,7 +1138,6 @@ SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,
/* Save our speculative allocation into the cache */
INIT_LIST_HEAD(&new_leaf->msg_list);
info->node_cache = new_leaf;
- info->qsize += sizeof(*new_leaf);
} else {
kfree(new_leaf);
}
diff --git a/ipc/msg.c b/ipc/msg.c
index 66c4f567eb73..f675689290ca 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -37,6 +37,7 @@
#include <linux/rwsem.h>
#include <linux/nsproxy.h>
#include <linux/ipc_namespace.h>
+#include <linux/freezer.h>
#include <asm/current.h>
#include <linux/uaccess.h>
@@ -675,7 +676,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
ipc_unlock_object(&msq->q_perm);
rcu_read_unlock();
- schedule();
+ freezable_schedule();
rcu_read_lock();
ipc_lock_object(&msq->q_perm);
@@ -917,7 +918,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl
ipc_unlock_object(&msq->q_perm);
rcu_read_unlock();
- schedule();
+ freezable_schedule();
/* Lockless receive, part 1:
* Disable preemption. We don't hold a reference to the queue
diff --git a/ipc/msgutil.c b/ipc/msgutil.c
index 2b491590ebab..71f448e5e927 100644
--- a/ipc/msgutil.c
+++ b/ipc/msgutil.c
@@ -123,7 +123,7 @@ struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst)
size_t len = src->m_ts;
size_t alen;
- BUG_ON(dst == NULL);
+ WARN_ON(dst == NULL);
if (src->m_ts > dst->m_ts)
return ERR_PTR(-EINVAL);
diff --git a/ipc/shm.c b/ipc/shm.c
index 06e5cf2fe019..222131e8e38f 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -159,7 +159,7 @@ static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
* We raced in the idr lookup or with shm_destroy(). Either way, the
* ID is busted.
*/
- BUG_ON(IS_ERR(ipcp));
+ WARN_ON(IS_ERR(ipcp));
return container_of(ipcp, struct shmid_kernel, shm_perm);
}
@@ -393,7 +393,7 @@ static int shm_mmap(struct file *file, struct vm_area_struct *vma)
return ret;
sfd->vm_ops = vma->vm_ops;
#ifdef CONFIG_MMU
- BUG_ON(!sfd->vm_ops->fault);
+ WARN_ON(!sfd->vm_ops->fault);
#endif
vma->vm_ops = &shm_vm_ops;
shm_open(vma);
@@ -545,7 +545,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
if ((shmflg & SHM_NORESERVE) &&
sysctl_overcommit_memory != OVERCOMMIT_NEVER)
acctflag = VM_NORESERVE;
- file = shmem_file_setup(name, size, acctflag);
+ file = shmem_kernel_file_setup(name, size, acctflag);
}
error = PTR_ERR(file);
if (IS_ERR(file))
diff --git a/kernel/Makefile b/kernel/Makefile
index 718fb8afab7a..aff8da5d82c1 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -50,7 +50,9 @@ obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_MODULE_SIG) += module_signing.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
+obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
obj-$(CONFIG_KEXEC) += kexec.o
+obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup.o
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d3dae3419b99..86fadcdac672 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5816,6 +5816,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
flags |= MAP_EXECUTABLE;
if (vma->vm_flags & VM_LOCKED)
flags |= MAP_LOCKED;
+ if (vma->vm_flags & VM_LOCKONFAULT)
+ flags |= MAP_LOCKONFAULT;
if (vma->vm_flags & VM_HUGETLB)
flags |= MAP_HUGETLB;
@@ -8872,7 +8874,7 @@ static void perf_event_init_cpu(int cpu)
mutex_unlock(&swhash->hlist_mutex);
}
-#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
+#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
static void __perf_event_exit_context(void *__info)
{
struct remove_event re = { .detach_group = true };
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index cb346f26a22d..882c9f679243 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -201,7 +201,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
try_to_free_swap(page);
pte_unmap_unlock(ptep, ptl);
- if (vma->vm_flags & VM_LOCKED)
+ if (vma->vm_flags & (VM_LOCKED | VM_LOCKONFAULT))
munlock_vma_page(page);
put_page(page);
diff --git a/kernel/extable.c b/kernel/extable.c
index c98f926277a8..e820ccee9846 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -18,7 +18,6 @@
#include <linux/ftrace.h>
#include <linux/memory.h>
#include <linux/module.h>
-#include <linux/ftrace.h>
#include <linux/mutex.h>
#include <linux/init.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index ab0036cb840f..f5e1ef7d3c97 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -454,8 +454,10 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
tmp->vm_mm = mm;
if (anon_vma_fork(tmp, mpnt))
goto fail_nomem_anon_vma_fork;
- tmp->vm_flags &= ~VM_LOCKED;
+ tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT |
+ VM_UFFD_MISSING | VM_UFFD_WP);
tmp->vm_next = tmp->vm_prev = NULL;
+ tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
file = tmp->vm_file;
if (file) {
struct inode *inode = file_inode(file);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index a785c1015e25..a6bf3382560d 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1,156 +1,20 @@
/*
- * kexec.c - kexec system call
+ * kexec.c - kexec_load system call
* Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
*
* This source code is licensed under the GNU General Public License,
* Version 2. See the file COPYING for more details.
*/
-#define pr_fmt(fmt) "kexec: " fmt
-
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
#include <linux/kexec.h>
#include <linux/mutex.h>
#include <linux/list.h>
-#include <linux/highmem.h>
#include <linux/syscalls.h>
-#include <linux/reboot.h>
-#include <linux/ioport.h>
-#include <linux/hardirq.h>
-#include <linux/elf.h>
-#include <linux/elfcore.h>
-#include <linux/utsname.h>
-#include <linux/numa.h>
-#include <linux/suspend.h>
-#include <linux/device.h>
-#include <linux/freezer.h>
-#include <linux/pm.h>
-#include <linux/cpu.h>
-#include <linux/console.h>
-#include <linux/vmalloc.h>
-#include <linux/swap.h>
-#include <linux/syscore_ops.h>
-#include <linux/compiler.h>
-#include <linux/hugetlb.h>
-
-#include <asm/page.h>
-#include <asm/uaccess.h>
-#include <asm/io.h>
-#include <asm/sections.h>
-
-#include <crypto/hash.h>
-#include <crypto/sha.h>
-
-/* Per cpu memory for storing cpu states in case of system crash. */
-note_buf_t __percpu *crash_notes;
-
-/* vmcoreinfo stuff */
-static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
-u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
-size_t vmcoreinfo_size;
-size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
-
-/* Flag to indicate we are going to kexec a new kernel */
-bool kexec_in_progress = false;
-
-/*
- * Declare these symbols weak so that if architecture provides a purgatory,
- * these will be overridden.
- */
-char __weak kexec_purgatory[0];
-size_t __weak kexec_purgatory_size = 0;
-
-#ifdef CONFIG_KEXEC_FILE
-static int kexec_calculate_store_digests(struct kimage *image);
-#endif
-
-/* Location of the reserved area for the crash kernel */
-struct resource crashk_res = {
- .name = "Crash kernel",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-struct resource crashk_low_res = {
- .name = "Crash kernel",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-int kexec_should_crash(struct task_struct *p)
-{
- /*
- * If crash_kexec_post_notifiers is enabled, don't run
- * crash_kexec() here yet, which must be run after panic
- * notifiers in panic().
- */
- if (crash_kexec_post_notifiers)
- return 0;
- /*
- * There are 4 panic() calls in do_exit() path, each of which
- * corresponds to each of these 4 conditions.
- */
- if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
- return 1;
- return 0;
-}
-
-/*
- * When kexec transitions to the new kernel there is a one-to-one
- * mapping between physical and virtual addresses. On processors
- * where you can disable the MMU this is trivial, and easy. For
- * others it is still a simple predictable page table to setup.
- *
- * In that environment kexec copies the new kernel to its final
- * resting place. This means I can only support memory whose
- * physical address can fit in an unsigned long. In particular
- * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
- * If the assembly stub has more restrictive requirements
- * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
- * defined more restrictively in <asm/kexec.h>.
- *
- * The code for the transition from the current kernel to the
- * the new kernel is placed in the control_code_buffer, whose size
- * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single
- * page of memory is necessary, but some architectures require more.
- * Because this memory must be identity mapped in the transition from
- * virtual to physical addresses it must live in the range
- * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
- * modifiable.
- *
- * The assembly stub in the control code buffer is passed a linked list
- * of descriptor pages detailing the source pages of the new kernel,
- * and the destination addresses of those source pages. As this data
- * structure is not used in the context of the current OS, it must
- * be self-contained.
- *
- * The code has been made to work with highmem pages and will use a
- * destination page in its final resting place (if it happens
- * to allocate it). The end product of this is that most of the
- * physical address space, and most of RAM can be used.
- *
- * Future directions include:
- * - allocating a page table with the control code buffer identity
- * mapped, to simplify machine_kexec and make kexec_on_panic more
- * reliable.
- */
-
-/*
- * KIMAGE_NO_DEST is an impossible destination address..., for
- * allocating pages whose destination address we do not care about.
- */
-#define KIMAGE_NO_DEST (-1UL)
-
-static int kimage_is_destination_range(struct kimage *image,
- unsigned long start, unsigned long end);
-static struct page *kimage_alloc_page(struct kimage *image,
- gfp_t gfp_mask,
- unsigned long dest);
+#include <linux/slab.h>
+#include "kexec_internal.h"
static int copy_user_segment_list(struct kimage *image,
unsigned long nr_segments,
@@ -169,125 +33,6 @@ static int copy_user_segment_list(struct kimage *image,
return ret;
}
-static int sanity_check_segment_list(struct kimage *image)
-{
- int result, i;
- unsigned long nr_segments = image->nr_segments;
-
- /*
- * Verify we have good destination addresses. The caller is
- * responsible for making certain we don't attempt to load
- * the new image into invalid or reserved areas of RAM. This
- * just verifies it is an address we can use.
- *
- * Since the kernel does everything in page size chunks ensure
- * the destination addresses are page aligned. Too many
- * special cases crop of when we don't do this. The most
- * insidious is getting overlapping destination addresses
- * simply because addresses are changed to page size
- * granularity.
- */
- result = -EADDRNOTAVAIL;
- for (i = 0; i < nr_segments; i++) {
- unsigned long mstart, mend;
-
- mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz;
- if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
- return result;
- if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
- return result;
- }
-
- /* Verify our destination addresses do not overlap.
- * If we alloed overlapping destination addresses
- * through very weird things can happen with no
- * easy explanation as one segment stops on another.
- */
- result = -EINVAL;
- for (i = 0; i < nr_segments; i++) {
- unsigned long mstart, mend;
- unsigned long j;
-
- mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz;
- for (j = 0; j < i; j++) {
- unsigned long pstart, pend;
- pstart = image->segment[j].mem;
- pend = pstart + image->segment[j].memsz;
- /* Do the segments overlap ? */
- if ((mend > pstart) && (mstart < pend))
- return result;
- }
- }
-
- /* Ensure our buffer sizes are strictly less than
- * our memory sizes. This should always be the case,
- * and it is easier to check up front than to be surprised
- * later on.
- */
- result = -EINVAL;
- for (i = 0; i < nr_segments; i++) {
- if (image->segment[i].bufsz > image->segment[i].memsz)
- return result;
- }
-
- /*
- * Verify we have good destination addresses. Normally
- * the caller is responsible for making certain we don't
- * attempt to load the new image into invalid or reserved
- * areas of RAM. But crash kernels are preloaded into a
- * reserved area of ram. We must ensure the addresses
- * are in the reserved area otherwise preloading the
- * kernel could corrupt things.
- */
-
- if (image->type == KEXEC_TYPE_CRASH) {
- result = -EADDRNOTAVAIL;
- for (i = 0; i < nr_segments; i++) {
- unsigned long mstart, mend;
-
- mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz - 1;
- /* Ensure we are within the crash kernel limits */
- if ((mstart < crashk_res.start) ||
- (mend > crashk_res.end))
- return result;
- }
- }
-
- return 0;
-}
-
-static struct kimage *do_kimage_alloc_init(void)
-{
- struct kimage *image;
-
- /* Allocate a controlling structure */
- image = kzalloc(sizeof(*image), GFP_KERNEL);
- if (!image)
- return NULL;
-
- image->head = 0;
- image->entry = &image->head;
- image->last_entry = &image->head;
- image->control_page = ~0; /* By default this does not apply */
- image->type = KEXEC_TYPE_DEFAULT;
-
- /* Initialize the list of control pages */
- INIT_LIST_HEAD(&image->control_pages);
-
- /* Initialize the list of destination pages */
- INIT_LIST_HEAD(&image->dest_pages);
-
- /* Initialize the list of unusable pages */
- INIT_LIST_HEAD(&image->unusable_pages);
-
- return image;
-}
-
-static void kimage_free_page_list(struct list_head *list);
-
static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
unsigned long nr_segments,
struct kexec_segment __user *segments,
@@ -354,873 +99,6 @@ out_free_image:
return ret;
}
-#ifdef CONFIG_KEXEC_FILE
-static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
-{
- struct fd f = fdget(fd);
- int ret;
- struct kstat stat;
- loff_t pos;
- ssize_t bytes = 0;
-
- if (!f.file)
- return -EBADF;
-
- ret = vfs_getattr(&f.file->f_path, &stat);
- if (ret)
- goto out;
-
- if (stat.size > INT_MAX) {
- ret = -EFBIG;
- goto out;
- }
-
- /* Don't hand 0 to vmalloc, it whines. */
- if (stat.size == 0) {
- ret = -EINVAL;
- goto out;
- }
-
- *buf = vmalloc(stat.size);
- if (!*buf) {
- ret = -ENOMEM;
- goto out;
- }
-
- pos = 0;
- while (pos < stat.size) {
- bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
- stat.size - pos);
- if (bytes < 0) {
- vfree(*buf);
- ret = bytes;
- goto out;
- }
-
- if (bytes == 0)
- break;
- pos += bytes;
- }
-
- if (pos != stat.size) {
- ret = -EBADF;
- vfree(*buf);
- goto out;
- }
-
- *buf_len = pos;
-out:
- fdput(f);
- return ret;
-}
-
-/* Architectures can provide this probe function */
-int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
- unsigned long buf_len)
-{
- return -ENOEXEC;
-}
-
-void * __weak arch_kexec_kernel_image_load(struct kimage *image)
-{
- return ERR_PTR(-ENOEXEC);
-}
-
-void __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
-{
-}
-
-int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
- unsigned long buf_len)
-{
- return -EKEYREJECTED;
-}
-
-/* Apply relocations of type RELA */
-int __weak
-arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
- unsigned int relsec)
-{
- pr_err("RELA relocation unsupported.\n");
- return -ENOEXEC;
-}
-
-/* Apply relocations of type REL */
-int __weak
-arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
- unsigned int relsec)
-{
- pr_err("REL relocation unsupported.\n");
- return -ENOEXEC;
-}
-
-/*
- * Free up memory used by kernel, initrd, and command line. This is temporary
- * memory allocation which is not needed any more after these buffers have
- * been loaded into separate segments and have been copied elsewhere.
- */
-static void kimage_file_post_load_cleanup(struct kimage *image)
-{
- struct purgatory_info *pi = &image->purgatory_info;
-
- vfree(image->kernel_buf);
- image->kernel_buf = NULL;
-
- vfree(image->initrd_buf);
- image->initrd_buf = NULL;
-
- kfree(image->cmdline_buf);
- image->cmdline_buf = NULL;
-
- vfree(pi->purgatory_buf);
- pi->purgatory_buf = NULL;
-
- vfree(pi->sechdrs);
- pi->sechdrs = NULL;
-
- /* See if architecture has anything to cleanup post load */
- arch_kimage_file_post_load_cleanup(image);
-
- /*
- * Above call should have called into bootloader to free up
- * any data stored in kimage->image_loader_data. It should
- * be ok now to free it up.
- */
- kfree(image->image_loader_data);
- image->image_loader_data = NULL;
-}
-
-/*
- * In file mode list of segments is prepared by kernel. Copy relevant
- * data from user space, do error checking, prepare segment list
- */
-static int
-kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
- const char __user *cmdline_ptr,
- unsigned long cmdline_len, unsigned flags)
-{
- int ret = 0;
- void *ldata;
-
- ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
- &image->kernel_buf_len);
- if (ret)
- return ret;
-
- /* Call arch image probe handlers */
- ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
- image->kernel_buf_len);
-
- if (ret)
- goto out;
-
-#ifdef CONFIG_KEXEC_VERIFY_SIG
- ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
- image->kernel_buf_len);
- if (ret) {
- pr_debug("kernel signature verification failed.\n");
- goto out;
- }
- pr_debug("kernel signature verification successful.\n");
-#endif
- /* It is possible that there no initramfs is being loaded */
- if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
- ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
- &image->initrd_buf_len);
- if (ret)
- goto out;
- }
-
- if (cmdline_len) {
- image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
- if (!image->cmdline_buf) {
- ret = -ENOMEM;
- goto out;
- }
-
- ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
- cmdline_len);
- if (ret) {
- ret = -EFAULT;
- goto out;
- }
-
- image->cmdline_buf_len = cmdline_len;
-
- /* command line should be a string with last byte null */
- if (image->cmdline_buf[cmdline_len - 1] != '\0') {
- ret = -EINVAL;
- goto out;
- }
- }
-
- /* Call arch image load handlers */
- ldata = arch_kexec_kernel_image_load(image);
-
- if (IS_ERR(ldata)) {
- ret = PTR_ERR(ldata);
- goto out;
- }
-
- image->image_loader_data = ldata;
-out:
- /* In case of error, free up all allocated memory in this function */
- if (ret)
- kimage_file_post_load_cleanup(image);
- return ret;
-}
-
-static int
-kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
- int initrd_fd, const char __user *cmdline_ptr,
- unsigned long cmdline_len, unsigned long flags)
-{
- int ret;
- struct kimage *image;
- bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH;
-
- image = do_kimage_alloc_init();
- if (!image)
- return -ENOMEM;
-
- image->file_mode = 1;
-
- if (kexec_on_panic) {
- /* Enable special crash kernel control page alloc policy. */
- image->control_page = crashk_res.start;
- image->type = KEXEC_TYPE_CRASH;
- }
-
- ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
- cmdline_ptr, cmdline_len, flags);
- if (ret)
- goto out_free_image;
-
- ret = sanity_check_segment_list(image);
- if (ret)
- goto out_free_post_load_bufs;
-
- ret = -ENOMEM;
- image->control_code_page = kimage_alloc_control_pages(image,
- get_order(KEXEC_CONTROL_PAGE_SIZE));
- if (!image->control_code_page) {
- pr_err("Could not allocate control_code_buffer\n");
- goto out_free_post_load_bufs;
- }
-
- if (!kexec_on_panic) {
- image->swap_page = kimage_alloc_control_pages(image, 0);
- if (!image->swap_page) {
- pr_err("Could not allocate swap buffer\n");
- goto out_free_control_pages;
- }
- }
-
- *rimage = image;
- return 0;
-out_free_control_pages:
- kimage_free_page_list(&image->control_pages);
-out_free_post_load_bufs:
- kimage_file_post_load_cleanup(image);
-out_free_image:
- kfree(image);
- return ret;
-}
-#else /* CONFIG_KEXEC_FILE */
-static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
-#endif /* CONFIG_KEXEC_FILE */
-
-static int kimage_is_destination_range(struct kimage *image,
- unsigned long start,
- unsigned long end)
-{
- unsigned long i;
-
- for (i = 0; i < image->nr_segments; i++) {
- unsigned long mstart, mend;
-
- mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz;
- if ((end > mstart) && (start < mend))
- return 1;
- }
-
- return 0;
-}
-
-static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
-{
- struct page *pages;
-
- pages = alloc_pages(gfp_mask, order);
- if (pages) {
- unsigned int count, i;
- pages->mapping = NULL;
- set_page_private(pages, order);
- count = 1 << order;
- for (i = 0; i < count; i++)
- SetPageReserved(pages + i);
- }
-
- return pages;
-}
-
-static void kimage_free_pages(struct page *page)
-{
- unsigned int order, count, i;
-
- order = page_private(page);
- count = 1 << order;
- for (i = 0; i < count; i++)
- ClearPageReserved(page + i);
- __free_pages(page, order);
-}
-
-static void kimage_free_page_list(struct list_head *list)
-{
- struct list_head *pos, *next;
-
- list_for_each_safe(pos, next, list) {
- struct page *page;
-
- page = list_entry(pos, struct page, lru);
- list_del(&page->lru);
- kimage_free_pages(page);
- }
-}
-
-static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
- unsigned int order)
-{
- /* Control pages are special, they are the intermediaries
- * that are needed while we copy the rest of the pages
- * to their final resting place. As such they must
- * not conflict with either the destination addresses
- * or memory the kernel is already using.
- *
- * The only case where we really need more than one of
- * these are for architectures where we cannot disable
- * the MMU and must instead generate an identity mapped
- * page table for all of the memory.
- *
- * At worst this runs in O(N) of the image size.
- */
- struct list_head extra_pages;
- struct page *pages;
- unsigned int count;
-
- count = 1 << order;
- INIT_LIST_HEAD(&extra_pages);
-
- /* Loop while I can allocate a page and the page allocated
- * is a destination page.
- */
- do {
- unsigned long pfn, epfn, addr, eaddr;
-
- pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
- if (!pages)
- break;
- pfn = page_to_pfn(pages);
- epfn = pfn + count;
- addr = pfn << PAGE_SHIFT;
- eaddr = epfn << PAGE_SHIFT;
- if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
- kimage_is_destination_range(image, addr, eaddr)) {
- list_add(&pages->lru, &extra_pages);
- pages = NULL;
- }
- } while (!pages);
-
- if (pages) {
- /* Remember the allocated page... */
- list_add(&pages->lru, &image->control_pages);
-
- /* Because the page is already in it's destination
- * location we will never allocate another page at
- * that address. Therefore kimage_alloc_pages
- * will not return it (again) and we don't need
- * to give it an entry in image->segment[].
- */
- }
- /* Deal with the destination pages I have inadvertently allocated.
- *
- * Ideally I would convert multi-page allocations into single
- * page allocations, and add everything to image->dest_pages.
- *
- * For now it is simpler to just free the pages.
- */
- kimage_free_page_list(&extra_pages);
-
- return pages;
-}
-
-static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
- unsigned int order)
-{
- /* Control pages are special, they are the intermediaries
- * that are needed while we copy the rest of the pages
- * to their final resting place. As such they must
- * not conflict with either the destination addresses
- * or memory the kernel is already using.
- *
- * Control pages are also the only pags we must allocate
- * when loading a crash kernel. All of the other pages
- * are specified by the segments and we just memcpy
- * into them directly.
- *
- * The only case where we really need more than one of
- * these are for architectures where we cannot disable
- * the MMU and must instead generate an identity mapped
- * page table for all of the memory.
- *
- * Given the low demand this implements a very simple
- * allocator that finds the first hole of the appropriate
- * size in the reserved memory region, and allocates all
- * of the memory up to and including the hole.
- */
- unsigned long hole_start, hole_end, size;
- struct page *pages;
-
- pages = NULL;
- size = (1 << order) << PAGE_SHIFT;
- hole_start = (image->control_page + (size - 1)) & ~(size - 1);
- hole_end = hole_start + size - 1;
- while (hole_end <= crashk_res.end) {
- unsigned long i;
-
- if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
- break;
- /* See if I overlap any of the segments */
- for (i = 0; i < image->nr_segments; i++) {
- unsigned long mstart, mend;
-
- mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz - 1;
- if ((hole_end >= mstart) && (hole_start <= mend)) {
- /* Advance the hole to the end of the segment */
- hole_start = (mend + (size - 1)) & ~(size - 1);
- hole_end = hole_start + size - 1;
- break;
- }
- }
- /* If I don't overlap any segments I have found my hole! */
- if (i == image->nr_segments) {
- pages = pfn_to_page(hole_start >> PAGE_SHIFT);
- break;
- }
- }
- if (pages)
- image->control_page = hole_end;
-
- return pages;
-}
-
-
-struct page *kimage_alloc_control_pages(struct kimage *image,
- unsigned int order)
-{
- struct page *pages = NULL;
-
- switch (image->type) {
- case KEXEC_TYPE_DEFAULT:
- pages = kimage_alloc_normal_control_pages(image, order);
- break;
- case KEXEC_TYPE_CRASH:
- pages = kimage_alloc_crash_control_pages(image, order);
- break;
- }
-
- return pages;
-}
-
-static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
-{
- if (*image->entry != 0)
- image->entry++;
-
- if (image->entry == image->last_entry) {
- kimage_entry_t *ind_page;
- struct page *page;
-
- page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
- if (!page)
- return -ENOMEM;
-
- ind_page = page_address(page);
- *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
- image->entry = ind_page;
- image->last_entry = ind_page +
- ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
- }
- *image->entry = entry;
- image->entry++;
- *image->entry = 0;
-
- return 0;
-}
-
-static int kimage_set_destination(struct kimage *image,
- unsigned long destination)
-{
- int result;
-
- destination &= PAGE_MASK;
- result = kimage_add_entry(image, destination | IND_DESTINATION);
-
- return result;
-}
-
-
-static int kimage_add_page(struct kimage *image, unsigned long page)
-{
- int result;
-
- page &= PAGE_MASK;
- result = kimage_add_entry(image, page | IND_SOURCE);
-
- return result;
-}
-
-
-static void kimage_free_extra_pages(struct kimage *image)
-{
- /* Walk through and free any extra destination pages I may have */
- kimage_free_page_list(&image->dest_pages);
-
- /* Walk through and free any unusable pages I have cached */
- kimage_free_page_list(&image->unusable_pages);
-
-}
-static void kimage_terminate(struct kimage *image)
-{
- if (*image->entry != 0)
- image->entry++;
-
- *image->entry = IND_DONE;
-}
-
-#define for_each_kimage_entry(image, ptr, entry) \
- for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
- ptr = (entry & IND_INDIRECTION) ? \
- phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
-
-static void kimage_free_entry(kimage_entry_t entry)
-{
- struct page *page;
-
- page = pfn_to_page(entry >> PAGE_SHIFT);
- kimage_free_pages(page);
-}
-
-static void kimage_free(struct kimage *image)
-{
- kimage_entry_t *ptr, entry;
- kimage_entry_t ind = 0;
-
- if (!image)
- return;
-
- kimage_free_extra_pages(image);
- for_each_kimage_entry(image, ptr, entry) {
- if (entry & IND_INDIRECTION) {
- /* Free the previous indirection page */
- if (ind & IND_INDIRECTION)
- kimage_free_entry(ind);
- /* Save this indirection page until we are
- * done with it.
- */
- ind = entry;
- } else if (entry & IND_SOURCE)
- kimage_free_entry(entry);
- }
- /* Free the final indirection page */
- if (ind & IND_INDIRECTION)
- kimage_free_entry(ind);
-
- /* Handle any machine specific cleanup */
- machine_kexec_cleanup(image);
-
- /* Free the kexec control pages... */
- kimage_free_page_list(&image->control_pages);
-
- /*
- * Free up any temporary buffers allocated. This might hit if
- * error occurred much later after buffer allocation.
- */
- if (image->file_mode)
- kimage_file_post_load_cleanup(image);
-
- kfree(image);
-}
-
-static kimage_entry_t *kimage_dst_used(struct kimage *image,
- unsigned long page)
-{
- kimage_entry_t *ptr, entry;
- unsigned long destination = 0;
-
- for_each_kimage_entry(image, ptr, entry) {
- if (entry & IND_DESTINATION)
- destination = entry & PAGE_MASK;
- else if (entry & IND_SOURCE) {
- if (page == destination)
- return ptr;
- destination += PAGE_SIZE;
- }
- }
-
- return NULL;
-}
-
-static struct page *kimage_alloc_page(struct kimage *image,
- gfp_t gfp_mask,
- unsigned long destination)
-{
- /*
- * Here we implement safeguards to ensure that a source page
- * is not copied to its destination page before the data on
- * the destination page is no longer useful.
- *
- * To do this we maintain the invariant that a source page is
- * either its own destination page, or it is not a
- * destination page at all.
- *
- * That is slightly stronger than required, but the proof
- * that no problems will not occur is trivial, and the
- * implementation is simply to verify.
- *
- * When allocating all pages normally this algorithm will run
- * in O(N) time, but in the worst case it will run in O(N^2)
- * time. If the runtime is a problem the data structures can
- * be fixed.
- */
- struct page *page;
- unsigned long addr;
-
- /*
- * Walk through the list of destination pages, and see if I
- * have a match.
- */
- list_for_each_entry(page, &image->dest_pages, lru) {
- addr = page_to_pfn(page) << PAGE_SHIFT;
- if (addr == destination) {
- list_del(&page->lru);
- return page;
- }
- }
- page = NULL;
- while (1) {
- kimage_entry_t *old;
-
- /* Allocate a page, if we run out of memory give up */
- page = kimage_alloc_pages(gfp_mask, 0);
- if (!page)
- return NULL;
- /* If the page cannot be used file it away */
- if (page_to_pfn(page) >
- (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
- list_add(&page->lru, &image->unusable_pages);
- continue;
- }
- addr = page_to_pfn(page) << PAGE_SHIFT;
-
- /* If it is the destination page we want use it */
- if (addr == destination)
- break;
-
- /* If the page is not a destination page use it */
- if (!kimage_is_destination_range(image, addr,
- addr + PAGE_SIZE))
- break;
-
- /*
- * I know that the page is someones destination page.
- * See if there is already a source page for this
- * destination page. And if so swap the source pages.
- */
- old = kimage_dst_used(image, addr);
- if (old) {
- /* If so move it */
- unsigned long old_addr;
- struct page *old_page;
-
- old_addr = *old & PAGE_MASK;
- old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
- copy_highpage(page, old_page);
- *old = addr | (*old & ~PAGE_MASK);
-
- /* The old page I have found cannot be a
- * destination page, so return it if it's
- * gfp_flags honor the ones passed in.
- */
- if (!(gfp_mask & __GFP_HIGHMEM) &&
- PageHighMem(old_page)) {
- kimage_free_pages(old_page);
- continue;
- }
- addr = old_addr;
- page = old_page;
- break;
- } else {
- /* Place the page on the destination list I
- * will use it later.
- */
- list_add(&page->lru, &image->dest_pages);
- }
- }
-
- return page;
-}
-
-static int kimage_load_normal_segment(struct kimage *image,
- struct kexec_segment *segment)
-{
- unsigned long maddr;
- size_t ubytes, mbytes;
- int result;
- unsigned char __user *buf = NULL;
- unsigned char *kbuf = NULL;
-
- result = 0;
- if (image->file_mode)
- kbuf = segment->kbuf;
- else
- buf = segment->buf;
- ubytes = segment->bufsz;
- mbytes = segment->memsz;
- maddr = segment->mem;
-
- result = kimage_set_destination(image, maddr);
- if (result < 0)
- goto out;
-
- while (mbytes) {
- struct page *page;
- char *ptr;
- size_t uchunk, mchunk;
-
- page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
- if (!page) {
- result = -ENOMEM;
- goto out;
- }
- result = kimage_add_page(image, page_to_pfn(page)
- << PAGE_SHIFT);
- if (result < 0)
- goto out;
-
- ptr = kmap(page);
- /* Start with a clear page */
- clear_page(ptr);
- ptr += maddr & ~PAGE_MASK;
- mchunk = min_t(size_t, mbytes,
- PAGE_SIZE - (maddr & ~PAGE_MASK));
- uchunk = min(ubytes, mchunk);
-
- /* For file based kexec, source pages are in kernel memory */
- if (image->file_mode)
- memcpy(ptr, kbuf, uchunk);
- else
- result = copy_from_user(ptr, buf, uchunk);
- kunmap(page);
- if (result) {
- result = -EFAULT;
- goto out;
- }
- ubytes -= uchunk;
- maddr += mchunk;
- if (image->file_mode)
- kbuf += mchunk;
- else
- buf += mchunk;
- mbytes -= mchunk;
- }
-out:
- return result;
-}
-
-static int kimage_load_crash_segment(struct kimage *image,
- struct kexec_segment *segment)
-{
- /* For crash dumps kernels we simply copy the data from
- * user space to it's destination.
- * We do things a page at a time for the sake of kmap.
- */
- unsigned long maddr;
- size_t ubytes, mbytes;
- int result;
- unsigned char __user *buf = NULL;
- unsigned char *kbuf = NULL;
-
- result = 0;
- if (image->file_mode)
- kbuf = segment->kbuf;
- else
- buf = segment->buf;
- ubytes = segment->bufsz;
- mbytes = segment->memsz;
- maddr = segment->mem;
- while (mbytes) {
- struct page *page;
- char *ptr;
- size_t uchunk, mchunk;
-
- page = pfn_to_page(maddr >> PAGE_SHIFT);
- if (!page) {
- result = -ENOMEM;
- goto out;
- }
- ptr = kmap(page);
- ptr += maddr & ~PAGE_MASK;
- mchunk = min_t(size_t, mbytes,
- PAGE_SIZE - (maddr & ~PAGE_MASK));
- uchunk = min(ubytes, mchunk);
- if (mchunk > uchunk) {
- /* Zero the trailing part of the page */
- memset(ptr + uchunk, 0, mchunk - uchunk);
- }
-
- /* For file based kexec, source pages are in kernel memory */
- if (image->file_mode)
- memcpy(ptr, kbuf, uchunk);
- else
- result = copy_from_user(ptr, buf, uchunk);
- kexec_flush_icache_page(page);
- kunmap(page);
- if (result) {
- result = -EFAULT;
- goto out;
- }
- ubytes -= uchunk;
- maddr += mchunk;
- if (image->file_mode)
- kbuf += mchunk;
- else
- buf += mchunk;
- mbytes -= mchunk;
- }
-out:
- return result;
-}
-
-static int kimage_load_segment(struct kimage *image,
- struct kexec_segment *segment)
-{
- int result = -ENOMEM;
-
- switch (image->type) {
- case KEXEC_TYPE_DEFAULT:
- result = kimage_load_normal_segment(image, segment);
- break;
- case KEXEC_TYPE_CRASH:
- result = kimage_load_crash_segment(image, segment);
- break;
- }
-
- return result;
-}
-
/*
* Exec Kernel system call: for obvious reasons only root may call it.
*
@@ -1241,11 +119,6 @@ static int kimage_load_segment(struct kimage *image,
* kexec does not sync, or unmount filesystems so if you need
* that to happen you need to do that yourself.
*/
-struct kimage *kexec_image;
-struct kimage *kexec_crash_image;
-int kexec_load_disabled;
-
-static DEFINE_MUTEX(kexec_mutex);
SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
struct kexec_segment __user *, segments, unsigned long, flags)
@@ -1340,18 +213,6 @@ out:
return result;
}
-/*
- * Add and remove page tables for crashkernel memory
- *
- * Provide an empty default implementation here -- architecture
- * code may override this
- */
-void __weak crash_map_reserved_pages(void)
-{}
-
-void __weak crash_unmap_reserved_pages(void)
-{}
-
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
compat_ulong_t, nr_segments,
@@ -1390,1391 +251,3 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
return sys_kexec_load(entry, nr_segments, ksegments, flags);
}
#endif
-
-#ifdef CONFIG_KEXEC_FILE
-SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
- unsigned long, cmdline_len, const char __user *, cmdline_ptr,
- unsigned long, flags)
-{
- int ret = 0, i;
- struct kimage **dest_image, *image;
-
- /* We only trust the superuser with rebooting the system. */
- if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
- return -EPERM;
-
- /* Make sure we have a legal set of flags */
- if (flags != (flags & KEXEC_FILE_FLAGS))
- return -EINVAL;
-
- image = NULL;
-
- if (!mutex_trylock(&kexec_mutex))
- return -EBUSY;
-
- dest_image = &kexec_image;
- if (flags & KEXEC_FILE_ON_CRASH)
- dest_image = &kexec_crash_image;
-
- if (flags & KEXEC_FILE_UNLOAD)
- goto exchange;
-
- /*
- * In case of crash, new kernel gets loaded in reserved region. It is
- * same memory where old crash kernel might be loaded. Free any
- * current crash dump kernel before we corrupt it.
- */
- if (flags & KEXEC_FILE_ON_CRASH)
- kimage_free(xchg(&kexec_crash_image, NULL));
-
- ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
- cmdline_len, flags);
- if (ret)
- goto out;
-
- ret = machine_kexec_prepare(image);
- if (ret)
- goto out;
-
- ret = kexec_calculate_store_digests(image);
- if (ret)
- goto out;
-
- for (i = 0; i < image->nr_segments; i++) {
- struct kexec_segment *ksegment;
-
- ksegment = &image->segment[i];
- pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
- i, ksegment->buf, ksegment->bufsz, ksegment->mem,
- ksegment->memsz);
-
- ret = kimage_load_segment(image, &image->segment[i]);
- if (ret)
- goto out;
- }
-
- kimage_terminate(image);
-
- /*
- * Free up any temporary buffers allocated which are not needed
- * after image has been loaded
- */
- kimage_file_post_load_cleanup(image);
-exchange:
- image = xchg(dest_image, image);
-out:
- mutex_unlock(&kexec_mutex);
- kimage_free(image);
- return ret;
-}
-
-#endif /* CONFIG_KEXEC_FILE */
-
-void crash_kexec(struct pt_regs *regs)
-{
- /* Take the kexec_mutex here to prevent sys_kexec_load
- * running on one cpu from replacing the crash kernel
- * we are using after a panic on a different cpu.
- *
- * If the crash kernel was not located in a fixed area
- * of memory the xchg(&kexec_crash_image) would be
- * sufficient. But since I reuse the memory...
- */
- if (mutex_trylock(&kexec_mutex)) {
- if (kexec_crash_image) {
- struct pt_regs fixed_regs;
-
- crash_setup_regs(&fixed_regs, regs);
- crash_save_vmcoreinfo();
- machine_crash_shutdown(&fixed_regs);
- machine_kexec(kexec_crash_image);
- }
- mutex_unlock(&kexec_mutex);
- }
-}
-
-size_t crash_get_memory_size(void)
-{
- size_t size = 0;
- mutex_lock(&kexec_mutex);
- if (crashk_res.end != crashk_res.start)
- size = resource_size(&crashk_res);
- mutex_unlock(&kexec_mutex);
- return size;
-}
-
-void __weak crash_free_reserved_phys_range(unsigned long begin,
- unsigned long end)
-{
- unsigned long addr;
-
- for (addr = begin; addr < end; addr += PAGE_SIZE)
- free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
-}
-
-int crash_shrink_memory(unsigned long new_size)
-{
- int ret = 0;
- unsigned long start, end;
- unsigned long old_size;
- struct resource *ram_res;
-
- mutex_lock(&kexec_mutex);
-
- if (kexec_crash_image) {
- ret = -ENOENT;
- goto unlock;
- }
- start = crashk_res.start;
- end = crashk_res.end;
- old_size = (end == 0) ? 0 : end - start + 1;
- if (new_size >= old_size) {
- ret = (new_size == old_size) ? 0 : -EINVAL;
- goto unlock;
- }
-
- ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
- if (!ram_res) {
- ret = -ENOMEM;
- goto unlock;
- }
-
- start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
- end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
-
- crash_map_reserved_pages();
- crash_free_reserved_phys_range(end, crashk_res.end);
-
- if ((start == end) && (crashk_res.parent != NULL))
- release_resource(&crashk_res);
-
- ram_res->start = end;
- ram_res->end = crashk_res.end;
- ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
- ram_res->name = "System RAM";
-
- crashk_res.end = end - 1;
-
- insert_resource(&iomem_resource, ram_res);
- crash_unmap_reserved_pages();
-
-unlock:
- mutex_unlock(&kexec_mutex);
- return ret;
-}
-
-static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
- size_t data_len)
-{
- struct elf_note note;
-
- note.n_namesz = strlen(name) + 1;
- note.n_descsz = data_len;
- note.n_type = type;
- memcpy(buf, &note, sizeof(note));
- buf += (sizeof(note) + 3)/4;
- memcpy(buf, name, note.n_namesz);
- buf += (note.n_namesz + 3)/4;
- memcpy(buf, data, note.n_descsz);
- buf += (note.n_descsz + 3)/4;
-
- return buf;
-}
-
-static void final_note(u32 *buf)
-{
- struct elf_note note;
-
- note.n_namesz = 0;
- note.n_descsz = 0;
- note.n_type = 0;
- memcpy(buf, &note, sizeof(note));
-}
-
-void crash_save_cpu(struct pt_regs *regs, int cpu)
-{
- struct elf_prstatus prstatus;
- u32 *buf;
-
- if ((cpu < 0) || (cpu >= nr_cpu_ids))
- return;
-
- /* Using ELF notes here is opportunistic.
- * I need a well defined structure format
- * for the data I pass, and I need tags
- * on the data to indicate what information I have
- * squirrelled away. ELF notes happen to provide
- * all of that, so there is no need to invent something new.
- */
- buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
- if (!buf)
- return;
- memset(&prstatus, 0, sizeof(prstatus));
- prstatus.pr_pid = current->pid;
- elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
- buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
- &prstatus, sizeof(prstatus));
- final_note(buf);
-}
-
-static int __init crash_notes_memory_init(void)
-{
- /* Allocate memory for saving cpu registers. */
- crash_notes = alloc_percpu(note_buf_t);
- if (!crash_notes) {
- pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
- return -ENOMEM;
- }
- return 0;
-}
-subsys_initcall(crash_notes_memory_init);
-
-
-/*
- * parsing the "crashkernel" commandline
- *
- * this code is intended to be called from architecture specific code
- */
-
-
-/*
- * This function parses command lines in the format
- *
- * crashkernel=ramsize-range:size[,...][@offset]
- *
- * The function returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_mem(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- char *cur = cmdline, *tmp;
-
- /* for each entry of the comma-separated list */
- do {
- unsigned long long start, end = ULLONG_MAX, size;
-
- /* get the start of the range */
- start = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("crashkernel: Memory value expected\n");
- return -EINVAL;
- }
- cur = tmp;
- if (*cur != '-') {
- pr_warn("crashkernel: '-' expected\n");
- return -EINVAL;
- }
- cur++;
-
- /* if no ':' is here, than we read the end */
- if (*cur != ':') {
- end = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("crashkernel: Memory value expected\n");
- return -EINVAL;
- }
- cur = tmp;
- if (end <= start) {
- pr_warn("crashkernel: end <= start\n");
- return -EINVAL;
- }
- }
-
- if (*cur != ':') {
- pr_warn("crashkernel: ':' expected\n");
- return -EINVAL;
- }
- cur++;
-
- size = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("Memory value expected\n");
- return -EINVAL;
- }
- cur = tmp;
- if (size >= system_ram) {
- pr_warn("crashkernel: invalid size\n");
- return -EINVAL;
- }
-
- /* match ? */
- if (system_ram >= start && system_ram < end) {
- *crash_size = size;
- break;
- }
- } while (*cur++ == ',');
-
- if (*crash_size > 0) {
- while (*cur && *cur != ' ' && *cur != '@')
- cur++;
- if (*cur == '@') {
- cur++;
- *crash_base = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("Memory value expected after '@'\n");
- return -EINVAL;
- }
- }
- }
-
- return 0;
-}
-
-/*
- * That function parses "simple" (old) crashkernel command lines like
- *
- * crashkernel=size[@offset]
- *
- * It returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_simple(char *cmdline,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- char *cur = cmdline;
-
- *crash_size = memparse(cmdline, &cur);
- if (cmdline == cur) {
- pr_warn("crashkernel: memory value expected\n");
- return -EINVAL;
- }
-
- if (*cur == '@')
- *crash_base = memparse(cur+1, &cur);
- else if (*cur != ' ' && *cur != '\0') {
- pr_warn("crashkernel: unrecognized char\n");
- return -EINVAL;
- }
-
- return 0;
-}
-
-#define SUFFIX_HIGH 0
-#define SUFFIX_LOW 1
-#define SUFFIX_NULL 2
-static __initdata char *suffix_tbl[] = {
- [SUFFIX_HIGH] = ",high",
- [SUFFIX_LOW] = ",low",
- [SUFFIX_NULL] = NULL,
-};
-
-/*
- * That function parses "suffix" crashkernel command lines like
- *
- * crashkernel=size,[high|low]
- *
- * It returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_suffix(char *cmdline,
- unsigned long long *crash_size,
- const char *suffix)
-{
- char *cur = cmdline;
-
- *crash_size = memparse(cmdline, &cur);
- if (cmdline == cur) {
- pr_warn("crashkernel: memory value expected\n");
- return -EINVAL;
- }
-
- /* check with suffix */
- if (strncmp(cur, suffix, strlen(suffix))) {
- pr_warn("crashkernel: unrecognized char\n");
- return -EINVAL;
- }
- cur += strlen(suffix);
- if (*cur != ' ' && *cur != '\0') {
- pr_warn("crashkernel: unrecognized char\n");
- return -EINVAL;
- }
-
- return 0;
-}
-
-static __init char *get_last_crashkernel(char *cmdline,
- const char *name,
- const char *suffix)
-{
- char *p = cmdline, *ck_cmdline = NULL;
-
- /* find crashkernel and use the last one if there are more */
- p = strstr(p, name);
- while (p) {
- char *end_p = strchr(p, ' ');
- char *q;
-
- if (!end_p)
- end_p = p + strlen(p);
-
- if (!suffix) {
- int i;
-
- /* skip the one with any known suffix */
- for (i = 0; suffix_tbl[i]; i++) {
- q = end_p - strlen(suffix_tbl[i]);
- if (!strncmp(q, suffix_tbl[i],
- strlen(suffix_tbl[i])))
- goto next;
- }
- ck_cmdline = p;
- } else {
- q = end_p - strlen(suffix);
- if (!strncmp(q, suffix, strlen(suffix)))
- ck_cmdline = p;
- }
-next:
- p = strstr(p+1, name);
- }
-
- if (!ck_cmdline)
- return NULL;
-
- return ck_cmdline;
-}
-
-static int __init __parse_crashkernel(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base,
- const char *name,
- const char *suffix)
-{
- char *first_colon, *first_space;
- char *ck_cmdline;
-
- BUG_ON(!crash_size || !crash_base);
- *crash_size = 0;
- *crash_base = 0;
-
- ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
-
- if (!ck_cmdline)
- return -EINVAL;
-
- ck_cmdline += strlen(name);
-
- if (suffix)
- return parse_crashkernel_suffix(ck_cmdline, crash_size,
- suffix);
- /*
- * if the commandline contains a ':', then that's the extended
- * syntax -- if not, it must be the classic syntax
- */
- first_colon = strchr(ck_cmdline, ':');
- first_space = strchr(ck_cmdline, ' ');
- if (first_colon && (!first_space || first_colon < first_space))
- return parse_crashkernel_mem(ck_cmdline, system_ram,
- crash_size, crash_base);
-
- return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
-}
-
-/*
- * That function is the entry point for command line parsing and should be
- * called from the arch-specific code.
- */
-int __init parse_crashkernel(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=", NULL);
-}
-
-int __init parse_crashkernel_high(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
-}
-
-int __init parse_crashkernel_low(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=", suffix_tbl[SUFFIX_LOW]);
-}
-
-static void update_vmcoreinfo_note(void)
-{
- u32 *buf = vmcoreinfo_note;
-
- if (!vmcoreinfo_size)
- return;
- buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
- vmcoreinfo_size);
- final_note(buf);
-}
-
-void crash_save_vmcoreinfo(void)
-{
- vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
- update_vmcoreinfo_note();
-}
-
-void vmcoreinfo_append_str(const char *fmt, ...)
-{
- va_list args;
- char buf[0x50];
- size_t r;
-
- va_start(args, fmt);
- r = vscnprintf(buf, sizeof(buf), fmt, args);
- va_end(args);
-
- r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
-
- memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
-
- vmcoreinfo_size += r;
-}
-
-/*
- * provide an empty default implementation here -- architecture
- * code may override this
- */
-void __weak arch_crash_save_vmcoreinfo(void)
-{}
-
-unsigned long __weak paddr_vmcoreinfo_note(void)
-{
- return __pa((unsigned long)(char *)&vmcoreinfo_note);
-}
-
-static int __init crash_save_vmcoreinfo_init(void)
-{
- VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
- VMCOREINFO_PAGESIZE(PAGE_SIZE);
-
- VMCOREINFO_SYMBOL(init_uts_ns);
- VMCOREINFO_SYMBOL(node_online_map);
-#ifdef CONFIG_MMU
- VMCOREINFO_SYMBOL(swapper_pg_dir);
-#endif
- VMCOREINFO_SYMBOL(_stext);
- VMCOREINFO_SYMBOL(vmap_area_list);
-
-#ifndef CONFIG_NEED_MULTIPLE_NODES
- VMCOREINFO_SYMBOL(mem_map);
- VMCOREINFO_SYMBOL(contig_page_data);
-#endif
-#ifdef CONFIG_SPARSEMEM
- VMCOREINFO_SYMBOL(mem_section);
- VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
- VMCOREINFO_STRUCT_SIZE(mem_section);
- VMCOREINFO_OFFSET(mem_section, section_mem_map);
-#endif
- VMCOREINFO_STRUCT_SIZE(page);
- VMCOREINFO_STRUCT_SIZE(pglist_data);
- VMCOREINFO_STRUCT_SIZE(zone);
- VMCOREINFO_STRUCT_SIZE(free_area);
- VMCOREINFO_STRUCT_SIZE(list_head);
- VMCOREINFO_SIZE(nodemask_t);
- VMCOREINFO_OFFSET(page, flags);
- VMCOREINFO_OFFSET(page, _count);
- VMCOREINFO_OFFSET(page, mapping);
- VMCOREINFO_OFFSET(page, lru);
- VMCOREINFO_OFFSET(page, _mapcount);
- VMCOREINFO_OFFSET(page, private);
- VMCOREINFO_OFFSET(pglist_data, node_zones);
- VMCOREINFO_OFFSET(pglist_data, nr_zones);
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
- VMCOREINFO_OFFSET(pglist_data, node_mem_map);
-#endif
- VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
- VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
- VMCOREINFO_OFFSET(pglist_data, node_id);
- VMCOREINFO_OFFSET(zone, free_area);
- VMCOREINFO_OFFSET(zone, vm_stat);
- VMCOREINFO_OFFSET(zone, spanned_pages);
- VMCOREINFO_OFFSET(free_area, free_list);
- VMCOREINFO_OFFSET(list_head, next);
- VMCOREINFO_OFFSET(list_head, prev);
- VMCOREINFO_OFFSET(vmap_area, va_start);
- VMCOREINFO_OFFSET(vmap_area, list);
- VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
- log_buf_kexec_setup();
- VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
- VMCOREINFO_NUMBER(NR_FREE_PAGES);
- VMCOREINFO_NUMBER(PG_lru);
- VMCOREINFO_NUMBER(PG_private);
- VMCOREINFO_NUMBER(PG_swapcache);
- VMCOREINFO_NUMBER(PG_slab);
-#ifdef CONFIG_MEMORY_FAILURE
- VMCOREINFO_NUMBER(PG_hwpoison);
-#endif
- VMCOREINFO_NUMBER(PG_head_mask);
- VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
-#ifdef CONFIG_HUGETLBFS
- VMCOREINFO_SYMBOL(free_huge_page);
-#endif
-
- arch_crash_save_vmcoreinfo();
- update_vmcoreinfo_note();
-
- return 0;
-}
-
-subsys_initcall(crash_save_vmcoreinfo_init);
-
-#ifdef CONFIG_KEXEC_FILE
-static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
- struct kexec_buf *kbuf)
-{
- struct kimage *image = kbuf->image;
- unsigned long temp_start, temp_end;
-
- temp_end = min(end, kbuf->buf_max);
- temp_start = temp_end - kbuf->memsz;
-
- do {
- /* align down start */
- temp_start = temp_start & (~(kbuf->buf_align - 1));
-
- if (temp_start < start || temp_start < kbuf->buf_min)
- return 0;
-
- temp_end = temp_start + kbuf->memsz - 1;
-
- /*
- * Make sure this does not conflict with any of existing
- * segments
- */
- if (kimage_is_destination_range(image, temp_start, temp_end)) {
- temp_start = temp_start - PAGE_SIZE;
- continue;
- }
-
- /* We found a suitable memory range */
- break;
- } while (1);
-
- /* If we are here, we found a suitable memory range */
- kbuf->mem = temp_start;
-
- /* Success, stop navigating through remaining System RAM ranges */
- return 1;
-}
-
-static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
- struct kexec_buf *kbuf)
-{
- struct kimage *image = kbuf->image;
- unsigned long temp_start, temp_end;
-
- temp_start = max(start, kbuf->buf_min);
-
- do {
- temp_start = ALIGN(temp_start, kbuf->buf_align);
- temp_end = temp_start + kbuf->memsz - 1;
-
- if (temp_end > end || temp_end > kbuf->buf_max)
- return 0;
- /*
- * Make sure this does not conflict with any of existing
- * segments
- */
- if (kimage_is_destination_range(image, temp_start, temp_end)) {
- temp_start = temp_start + PAGE_SIZE;
- continue;
- }
-
- /* We found a suitable memory range */
- break;
- } while (1);
-
- /* If we are here, we found a suitable memory range */
- kbuf->mem = temp_start;
-
- /* Success, stop navigating through remaining System RAM ranges */
- return 1;
-}
-
-static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
-{
- struct kexec_buf *kbuf = (struct kexec_buf *)arg;
- unsigned long sz = end - start + 1;
-
- /* Returning 0 will take to next memory range */
- if (sz < kbuf->memsz)
- return 0;
-
- if (end < kbuf->buf_min || start > kbuf->buf_max)
- return 0;
-
- /*
- * Allocate memory top down with-in ram range. Otherwise bottom up
- * allocation.
- */
- if (kbuf->top_down)
- return locate_mem_hole_top_down(start, end, kbuf);
- return locate_mem_hole_bottom_up(start, end, kbuf);
-}
-
-/*
- * Helper function for placing a buffer in a kexec segment. This assumes
- * that kexec_mutex is held.
- */
-int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
- unsigned long memsz, unsigned long buf_align,
- unsigned long buf_min, unsigned long buf_max,
- bool top_down, unsigned long *load_addr)
-{
-
- struct kexec_segment *ksegment;
- struct kexec_buf buf, *kbuf;
- int ret;
-
- /* Currently adding segment this way is allowed only in file mode */
- if (!image->file_mode)
- return -EINVAL;
-
- if (image->nr_segments >= KEXEC_SEGMENT_MAX)
- return -EINVAL;
-
- /*
- * Make sure we are not trying to add buffer after allocating
- * control pages. All segments need to be placed first before
- * any control pages are allocated. As control page allocation
- * logic goes through list of segments to make sure there are
- * no destination overlaps.
- */
- if (!list_empty(&image->control_pages)) {
- WARN_ON(1);
- return -EINVAL;
- }
-
- memset(&buf, 0, sizeof(struct kexec_buf));
- kbuf = &buf;
- kbuf->image = image;
- kbuf->buffer = buffer;
- kbuf->bufsz = bufsz;
-
- kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
- kbuf->buf_align = max(buf_align, PAGE_SIZE);
- kbuf->buf_min = buf_min;
- kbuf->buf_max = buf_max;
- kbuf->top_down = top_down;
-
- /* Walk the RAM ranges and allocate a suitable range for the buffer */
- if (image->type == KEXEC_TYPE_CRASH)
- ret = walk_iomem_res("Crash kernel",
- IORESOURCE_MEM | IORESOURCE_BUSY,
- crashk_res.start, crashk_res.end, kbuf,
- locate_mem_hole_callback);
- else
- ret = walk_system_ram_res(0, -1, kbuf,
- locate_mem_hole_callback);
- if (ret != 1) {
- /* A suitable memory range could not be found for buffer */
- return -EADDRNOTAVAIL;
- }
-
- /* Found a suitable memory range */
- ksegment = &image->segment[image->nr_segments];
- ksegment->kbuf = kbuf->buffer;
- ksegment->bufsz = kbuf->bufsz;
- ksegment->mem = kbuf->mem;
- ksegment->memsz = kbuf->memsz;
- image->nr_segments++;
- *load_addr = ksegment->mem;
- return 0;
-}
-
-/* Calculate and store the digest of segments */
-static int kexec_calculate_store_digests(struct kimage *image)
-{
- struct crypto_shash *tfm;
- struct shash_desc *desc;
- int ret = 0, i, j, zero_buf_sz, sha_region_sz;
- size_t desc_size, nullsz;
- char *digest;
- void *zero_buf;
- struct kexec_sha_region *sha_regions;
- struct purgatory_info *pi = &image->purgatory_info;
-
- zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
- zero_buf_sz = PAGE_SIZE;
-
- tfm = crypto_alloc_shash("sha256", 0, 0);
- if (IS_ERR(tfm)) {
- ret = PTR_ERR(tfm);
- goto out;
- }
-
- desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
- desc = kzalloc(desc_size, GFP_KERNEL);
- if (!desc) {
- ret = -ENOMEM;
- goto out_free_tfm;
- }
-
- sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
- sha_regions = vzalloc(sha_region_sz);
- if (!sha_regions)
- goto out_free_desc;
-
- desc->tfm = tfm;
- desc->flags = 0;
-
- ret = crypto_shash_init(desc);
- if (ret < 0)
- goto out_free_sha_regions;
-
- digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
- if (!digest) {
- ret = -ENOMEM;
- goto out_free_sha_regions;
- }
-
- for (j = i = 0; i < image->nr_segments; i++) {
- struct kexec_segment *ksegment;
-
- ksegment = &image->segment[i];
- /*
- * Skip purgatory as it will be modified once we put digest
- * info in purgatory.
- */
- if (ksegment->kbuf == pi->purgatory_buf)
- continue;
-
- ret = crypto_shash_update(desc, ksegment->kbuf,
- ksegment->bufsz);
- if (ret)
- break;
-
- /*
- * Assume rest of the buffer is filled with zero and
- * update digest accordingly.
- */
- nullsz = ksegment->memsz - ksegment->bufsz;
- while (nullsz) {
- unsigned long bytes = nullsz;
-
- if (bytes > zero_buf_sz)
- bytes = zero_buf_sz;
- ret = crypto_shash_update(desc, zero_buf, bytes);
- if (ret)
- break;
- nullsz -= bytes;
- }
-
- if (ret)
- break;
-
- sha_regions[j].start = ksegment->mem;
- sha_regions[j].len = ksegment->memsz;
- j++;
- }
-
- if (!ret) {
- ret = crypto_shash_final(desc, digest);
- if (ret)
- goto out_free_digest;
- ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
- sha_regions, sha_region_sz, 0);
- if (ret)
- goto out_free_digest;
-
- ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
- digest, SHA256_DIGEST_SIZE, 0);
- if (ret)
- goto out_free_digest;
- }
-
-out_free_digest:
- kfree(digest);
-out_free_sha_regions:
- vfree(sha_regions);
-out_free_desc:
- kfree(desc);
-out_free_tfm:
- kfree(tfm);
-out:
- return ret;
-}
-
-/* Actually load purgatory. Lot of code taken from kexec-tools */
-static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
- unsigned long max, int top_down)
-{
- struct purgatory_info *pi = &image->purgatory_info;
- unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
- unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
- unsigned char *buf_addr, *src;
- int i, ret = 0, entry_sidx = -1;
- const Elf_Shdr *sechdrs_c;
- Elf_Shdr *sechdrs = NULL;
- void *purgatory_buf = NULL;
-
- /*
- * sechdrs_c points to section headers in purgatory and are read
- * only. No modifications allowed.
- */
- sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
-
- /*
- * We can not modify sechdrs_c[] and its fields. It is read only.
- * Copy it over to a local copy where one can store some temporary
- * data and free it at the end. We need to modify ->sh_addr and
- * ->sh_offset fields to keep track of permanent and temporary
- * locations of sections.
- */
- sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
- if (!sechdrs)
- return -ENOMEM;
-
- memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
-
- /*
- * We seem to have multiple copies of sections. First copy is which
- * is embedded in kernel in read only section. Some of these sections
- * will be copied to a temporary buffer and relocated. And these
- * sections will finally be copied to their final destination at
- * segment load time.
- *
- * Use ->sh_offset to reflect section address in memory. It will
- * point to original read only copy if section is not allocatable.
- * Otherwise it will point to temporary copy which will be relocated.
- *
- * Use ->sh_addr to contain final address of the section where it
- * will go during execution time.
- */
- for (i = 0; i < pi->ehdr->e_shnum; i++) {
- if (sechdrs[i].sh_type == SHT_NOBITS)
- continue;
-
- sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
- sechdrs[i].sh_offset;
- }
-
- /*
- * Identify entry point section and make entry relative to section
- * start.
- */
- entry = pi->ehdr->e_entry;
- for (i = 0; i < pi->ehdr->e_shnum; i++) {
- if (!(sechdrs[i].sh_flags & SHF_ALLOC))
- continue;
-
- if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
- continue;
-
- /* Make entry section relative */
- if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
- ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
- pi->ehdr->e_entry)) {
- entry_sidx = i;
- entry -= sechdrs[i].sh_addr;
- break;
- }
- }
-
- /* Determine how much memory is needed to load relocatable object. */
- buf_align = 1;
- bss_align = 1;
- buf_sz = 0;
- bss_sz = 0;
-
- for (i = 0; i < pi->ehdr->e_shnum; i++) {
- if (!(sechdrs[i].sh_flags & SHF_ALLOC))
- continue;
-
- align = sechdrs[i].sh_addralign;
- if (sechdrs[i].sh_type != SHT_NOBITS) {
- if (buf_align < align)
- buf_align = align;
- buf_sz = ALIGN(buf_sz, align);
- buf_sz += sechdrs[i].sh_size;
- } else {
- /* bss section */
- if (bss_align < align)
- bss_align = align;
- bss_sz = ALIGN(bss_sz, align);
- bss_sz += sechdrs[i].sh_size;
- }
- }
-
- /* Determine the bss padding required to align bss properly */
- bss_pad = 0;
- if (buf_sz & (bss_align - 1))
- bss_pad = bss_align - (buf_sz & (bss_align - 1));
-
- memsz = buf_sz + bss_pad + bss_sz;
-
- /* Allocate buffer for purgatory */
- purgatory_buf = vzalloc(buf_sz);
- if (!purgatory_buf) {
- ret = -ENOMEM;
- goto out;
- }
-
- if (buf_align < bss_align)
- buf_align = bss_align;
-
- /* Add buffer to segment list */
- ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
- buf_align, min, max, top_down,
- &pi->purgatory_load_addr);
- if (ret)
- goto out;
-
- /* Load SHF_ALLOC sections */
- buf_addr = purgatory_buf;
- load_addr = curr_load_addr = pi->purgatory_load_addr;
- bss_addr = load_addr + buf_sz + bss_pad;
-
- for (i = 0; i < pi->ehdr->e_shnum; i++) {
- if (!(sechdrs[i].sh_flags & SHF_ALLOC))
- continue;
-
- align = sechdrs[i].sh_addralign;
- if (sechdrs[i].sh_type != SHT_NOBITS) {
- curr_load_addr = ALIGN(curr_load_addr, align);
- offset = curr_load_addr - load_addr;
- /* We already modifed ->sh_offset to keep src addr */
- src = (char *) sechdrs[i].sh_offset;
- memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
-
- /* Store load address and source address of section */
- sechdrs[i].sh_addr = curr_load_addr;
-
- /*
- * This section got copied to temporary buffer. Update
- * ->sh_offset accordingly.
- */
- sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
-
- /* Advance to the next address */
- curr_load_addr += sechdrs[i].sh_size;
- } else {
- bss_addr = ALIGN(bss_addr, align);
- sechdrs[i].sh_addr = bss_addr;
- bss_addr += sechdrs[i].sh_size;
- }
- }
-
- /* Update entry point based on load address of text section */
- if (entry_sidx >= 0)
- entry += sechdrs[entry_sidx].sh_addr;
-
- /* Make kernel jump to purgatory after shutdown */
- image->start = entry;
-
- /* Used later to get/set symbol values */
- pi->sechdrs = sechdrs;
-
- /*
- * Used later to identify which section is purgatory and skip it
- * from checksumming.
- */
- pi->purgatory_buf = purgatory_buf;
- return ret;
-out:
- vfree(sechdrs);
- vfree(purgatory_buf);
- return ret;
-}
-
-static int kexec_apply_relocations(struct kimage *image)
-{
- int i, ret;
- struct purgatory_info *pi = &image->purgatory_info;
- Elf_Shdr *sechdrs = pi->sechdrs;
-
- /* Apply relocations */
- for (i = 0; i < pi->ehdr->e_shnum; i++) {
- Elf_Shdr *section, *symtab;
-
- if (sechdrs[i].sh_type != SHT_RELA &&
- sechdrs[i].sh_type != SHT_REL)
- continue;
-
- /*
- * For section of type SHT_RELA/SHT_REL,
- * ->sh_link contains section header index of associated
- * symbol table. And ->sh_info contains section header
- * index of section to which relocations apply.
- */
- if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
- sechdrs[i].sh_link >= pi->ehdr->e_shnum)
- return -ENOEXEC;
-
- section = &sechdrs[sechdrs[i].sh_info];
- symtab = &sechdrs[sechdrs[i].sh_link];
-
- if (!(section->sh_flags & SHF_ALLOC))
- continue;
-
- /*
- * symtab->sh_link contain section header index of associated
- * string table.
- */
- if (symtab->sh_link >= pi->ehdr->e_shnum)
- /* Invalid section number? */
- continue;
-
- /*
- * Respective architecture needs to provide support for applying
- * relocations of type SHT_RELA/SHT_REL.
- */
- if (sechdrs[i].sh_type == SHT_RELA)
- ret = arch_kexec_apply_relocations_add(pi->ehdr,
- sechdrs, i);
- else if (sechdrs[i].sh_type == SHT_REL)
- ret = arch_kexec_apply_relocations(pi->ehdr,
- sechdrs, i);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-/* Load relocatable purgatory object and relocate it appropriately */
-int kexec_load_purgatory(struct kimage *image, unsigned long min,
- unsigned long max, int top_down,
- unsigned long *load_addr)
-{
- struct purgatory_info *pi = &image->purgatory_info;
- int ret;
-
- if (kexec_purgatory_size <= 0)
- return -EINVAL;
-
- if (kexec_purgatory_size < sizeof(Elf_Ehdr))
- return -ENOEXEC;
-
- pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
-
- if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
- || pi->ehdr->e_type != ET_REL
- || !elf_check_arch(pi->ehdr)
- || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
- return -ENOEXEC;
-
- if (pi->ehdr->e_shoff >= kexec_purgatory_size
- || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
- kexec_purgatory_size - pi->ehdr->e_shoff))
- return -ENOEXEC;
-
- ret = __kexec_load_purgatory(image, min, max, top_down);
- if (ret)
- return ret;
-
- ret = kexec_apply_relocations(image);
- if (ret)
- goto out;
-
- *load_addr = pi->purgatory_load_addr;
- return 0;
-out:
- vfree(pi->sechdrs);
- vfree(pi->purgatory_buf);
- return ret;
-}
-
-static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
- const char *name)
-{
- Elf_Sym *syms;
- Elf_Shdr *sechdrs;
- Elf_Ehdr *ehdr;
- int i, k;
- const char *strtab;
-
- if (!pi->sechdrs || !pi->ehdr)
- return NULL;
-
- sechdrs = pi->sechdrs;
- ehdr = pi->ehdr;
-
- for (i = 0; i < ehdr->e_shnum; i++) {
- if (sechdrs[i].sh_type != SHT_SYMTAB)
- continue;
-
- if (sechdrs[i].sh_link >= ehdr->e_shnum)
- /* Invalid strtab section number */
- continue;
- strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
- syms = (Elf_Sym *)sechdrs[i].sh_offset;
-
- /* Go through symbols for a match */
- for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
- if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
- continue;
-
- if (strcmp(strtab + syms[k].st_name, name) != 0)
- continue;
-
- if (syms[k].st_shndx == SHN_UNDEF ||
- syms[k].st_shndx >= ehdr->e_shnum) {
- pr_debug("Symbol: %s has bad section index %d.\n",
- name, syms[k].st_shndx);
- return NULL;
- }
-
- /* Found the symbol we are looking for */
- return &syms[k];
- }
- }
-
- return NULL;
-}
-
-void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
-{
- struct purgatory_info *pi = &image->purgatory_info;
- Elf_Sym *sym;
- Elf_Shdr *sechdr;
-
- sym = kexec_purgatory_find_symbol(pi, name);
- if (!sym)
- return ERR_PTR(-EINVAL);
-
- sechdr = &pi->sechdrs[sym->st_shndx];
-
- /*
- * Returns the address where symbol will finally be loaded after
- * kexec_load_segment()
- */
- return (void *)(sechdr->sh_addr + sym->st_value);
-}
-
-/*
- * Get or set value of a symbol. If "get_value" is true, symbol value is
- * returned in buf otherwise symbol value is set based on value in buf.
- */
-int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
- void *buf, unsigned int size, bool get_value)
-{
- Elf_Sym *sym;
- Elf_Shdr *sechdrs;
- struct purgatory_info *pi = &image->purgatory_info;
- char *sym_buf;
-
- sym = kexec_purgatory_find_symbol(pi, name);
- if (!sym)
- return -EINVAL;
-
- if (sym->st_size != size) {
- pr_err("symbol %s size mismatch: expected %lu actual %u\n",
- name, (unsigned long)sym->st_size, size);
- return -EINVAL;
- }
-
- sechdrs = pi->sechdrs;
-
- if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
- pr_err("symbol %s is in a bss section. Cannot %s\n", name,
- get_value ? "get" : "set");
- return -EINVAL;
- }
-
- sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset +
- sym->st_value;
-
- if (get_value)
- memcpy((void *)buf, sym_buf, size);
- else
- memcpy((void *)sym_buf, buf, size);
-
- return 0;
-}
-#endif /* CONFIG_KEXEC_FILE */
-
-/*
- * Move into place and start executing a preloaded standalone
- * executable. If nothing was preloaded return an error.
- */
-int kernel_kexec(void)
-{
- int error = 0;
-
- if (!mutex_trylock(&kexec_mutex))
- return -EBUSY;
- if (!kexec_image) {
- error = -EINVAL;
- goto Unlock;
- }
-
-#ifdef CONFIG_KEXEC_JUMP
- if (kexec_image->preserve_context) {
- lock_system_sleep();
- pm_prepare_console();
- error = freeze_processes();
- if (error) {
- error = -EBUSY;
- goto Restore_console;
- }
- suspend_console();
- error = dpm_suspend_start(PMSG_FREEZE);
- if (error)
- goto Resume_console;
- /* At this point, dpm_suspend_start() has been called,
- * but *not* dpm_suspend_end(). We *must* call
- * dpm_suspend_end() now. Otherwise, drivers for
- * some devices (e.g. interrupt controllers) become
- * desynchronized with the actual state of the
- * hardware at resume time, and evil weirdness ensues.
- */
- error = dpm_suspend_end(PMSG_FREEZE);
- if (error)
- goto Resume_devices;
- error = disable_nonboot_cpus();
- if (error)
- goto Enable_cpus;
- local_irq_disable();
- error = syscore_suspend();
- if (error)
- goto Enable_irqs;
- } else
-#endif
- {
- kexec_in_progress = true;
- kernel_restart_prepare(NULL);
- migrate_to_reboot_cpu();
-
- /*
- * migrate_to_reboot_cpu() disables CPU hotplug assuming that
- * no further code needs to use CPU hotplug (which is true in
- * the reboot case). However, the kexec path depends on using
- * CPU hotplug again; so re-enable it here.
- */
- cpu_hotplug_enable();
- pr_emerg("Starting new kernel\n");
- machine_shutdown();
- }
-
- machine_kexec(kexec_image);
-
-#ifdef CONFIG_KEXEC_JUMP
- if (kexec_image->preserve_context) {
- syscore_resume();
- Enable_irqs:
- local_irq_enable();
- Enable_cpus:
- enable_nonboot_cpus();
- dpm_resume_start(PMSG_RESTORE);
- Resume_devices:
- dpm_resume_end(PMSG_RESTORE);
- Resume_console:
- resume_console();
- thaw_processes();
- Restore_console:
- pm_restore_console();
- unlock_system_sleep();
- }
-#endif
-
- Unlock:
- mutex_unlock(&kexec_mutex);
- return error;
-}
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
new file mode 100644
index 000000000000..9aa25c034b2e
--- /dev/null
+++ b/kernel/kexec_core.c
@@ -0,0 +1,1511 @@
+/*
+ * kexec.c - kexec system call core code.
+ * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#define pr_fmt(fmt) "kexec: " fmt
+
+#include <linux/capability.h>
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/kexec.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <linux/syscalls.h>
+#include <linux/reboot.h>
+#include <linux/ioport.h>
+#include <linux/hardirq.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/utsname.h>
+#include <linux/numa.h>
+#include <linux/suspend.h>
+#include <linux/device.h>
+#include <linux/freezer.h>
+#include <linux/pm.h>
+#include <linux/cpu.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/console.h>
+#include <linux/vmalloc.h>
+#include <linux/swap.h>
+#include <linux/syscore_ops.h>
+#include <linux/compiler.h>
+#include <linux/hugetlb.h>
+
+#include <asm/page.h>
+#include <asm/sections.h>
+
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include "kexec_internal.h"
+
+DEFINE_MUTEX(kexec_mutex);
+
+/* Per cpu memory for storing cpu states in case of system crash. */
+note_buf_t __percpu *crash_notes;
+
+/* vmcoreinfo stuff */
+static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
+u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
+size_t vmcoreinfo_size;
+size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
+
+/* Flag to indicate we are going to kexec a new kernel */
+bool kexec_in_progress = false;
+
+
+/* Location of the reserved area for the crash kernel */
+struct resource crashk_res = {
+ .name = "Crash kernel",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+/*
+ * Second resource with the same name and flags as crashk_res.
+ * NOTE(review): presumably the low-memory part of the reservation
+ * (cf. parse_crashkernel_low()) -- confirm against the arch code.
+ */
+struct resource crashk_low_res = {
+ .name = "Crash kernel",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+int kexec_should_crash(struct task_struct *p)
+{
+	/*
+	 * With crash_kexec_post_notifiers set, crash_kexec() has to wait
+	 * until the panic notifiers in panic() have run, so never request
+	 * it from here.
+	 */
+	if (crash_kexec_post_notifiers)
+		return 0;
+
+	/*
+	 * Each of the four conditions below matches one of the four
+	 * panic() calls in the do_exit() path.
+	 */
+	return (in_interrupt() || !p->pid || is_global_init(p) ||
+		panic_on_oops) ? 1 : 0;
+}
+
+/*
+ * When kexec transitions to the new kernel there is a one-to-one
+ * mapping between physical and virtual addresses. On processors
+ * where you can disable the MMU this is trivial, and easy. For
+ * others it is still a simple predictable page table to setup.
+ *
+ * In that environment kexec copies the new kernel to its final
+ * resting place. This means I can only support memory whose
+ * physical address can fit in an unsigned long. In particular
+ * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
+ * If the assembly stub has more restrictive requirements
+ * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
+ * defined more restrictively in <asm/kexec.h>.
+ *
+ * The code for the transition from the current kernel to the
+ * new kernel is placed in the control_code_buffer, whose size
+ * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single
+ * page of memory is necessary, but some architectures require more.
+ * Because this memory must be identity mapped in the transition from
+ * virtual to physical addresses it must live in the range
+ * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
+ * modifiable.
+ *
+ * The assembly stub in the control code buffer is passed a linked list
+ * of descriptor pages detailing the source pages of the new kernel,
+ * and the destination addresses of those source pages. As this data
+ * structure is not used in the context of the current OS, it must
+ * be self-contained.
+ *
+ * The code has been made to work with highmem pages and will use a
+ * destination page in its final resting place (if it happens
+ * to allocate it). The end product of this is that most of the
+ * physical address space, and most of RAM can be used.
+ *
+ * Future directions include:
+ * - allocating a page table with the control code buffer identity
+ * mapped, to simplify machine_kexec and make kexec_on_panic more
+ * reliable.
+ */
+
+/*
+ * KIMAGE_NO_DEST is an impossible destination address..., for
+ * allocating pages whose destination address we do not care about.
+ */
+#define KIMAGE_NO_DEST (-1UL)
+
+static struct page *kimage_alloc_page(struct kimage *image,
+ gfp_t gfp_mask,
+ unsigned long dest);
+
+/*
+ * sanity_check_segment_list - validate the segments of a kexec image
+ * @image: image whose segment[] array and nr_segments are checked
+ *
+ * Returns 0 when every segment is acceptable, -EADDRNOTAVAIL when a
+ * destination range is not page aligned, wraps around the address
+ * space, reaches KEXEC_DESTINATION_MEMORY_LIMIT or (crash images only)
+ * lies outside crashk_res, and -EINVAL when two destinations overlap
+ * or a segment's bufsz exceeds its memsz.
+ */
+int sanity_check_segment_list(struct kimage *image)
+{
+	unsigned long i;
+	unsigned long nr_segments = image->nr_segments;
+
+	/*
+	 * Verify we have good destination addresses.  The caller is
+	 * responsible for making certain we don't attempt to load
+	 * the new image into invalid or reserved areas of RAM.  This
+	 * just verifies it is an address we can use.
+	 *
+	 * Since the kernel does everything in page size chunks ensure
+	 * the destination addresses are page aligned.  Too many
+	 * special cases crop up when we don't do this.  The most
+	 * insidious is getting overlapping destination addresses
+	 * simply because addresses are changed to page size
+	 * granularity.
+	 */
+	for (i = 0; i < nr_segments; i++) {
+		unsigned long mstart, mend;
+
+		mstart = image->segment[i].mem;
+		mend = mstart + image->segment[i].memsz;
+		/*
+		 * mem + memsz must not wrap: a wrapped end would slip
+		 * through the limit and overlap checks below.
+		 */
+		if (mstart > mend)
+			return -EADDRNOTAVAIL;
+		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
+			return -EADDRNOTAVAIL;
+		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
+			return -EADDRNOTAVAIL;
+	}
+
+	/*
+	 * Verify our destination addresses do not overlap.
+	 * If we allowed overlapping destination addresses
+	 * through, very weird things can happen with no
+	 * easy explanation as one segment stomps on another.
+	 */
+	for (i = 0; i < nr_segments; i++) {
+		unsigned long mstart, mend;
+		unsigned long j;
+
+		mstart = image->segment[i].mem;
+		mend = mstart + image->segment[i].memsz;
+		for (j = 0; j < i; j++) {
+			unsigned long pstart, pend;
+
+			pstart = image->segment[j].mem;
+			pend = pstart + image->segment[j].memsz;
+			/* Do the segments overlap ? */
+			if ((mend > pstart) && (mstart < pend))
+				return -EINVAL;
+		}
+	}
+
+	/*
+	 * Ensure our buffer sizes are no larger than our memory
+	 * sizes.  This should always be the case, and it is easier
+	 * to check up front than to be surprised later on.
+	 */
+	for (i = 0; i < nr_segments; i++) {
+		if (image->segment[i].bufsz > image->segment[i].memsz)
+			return -EINVAL;
+	}
+
+	/*
+	 * Verify we have good destination addresses.  Normally
+	 * the caller is responsible for making certain we don't
+	 * attempt to load the new image into invalid or reserved
+	 * areas of RAM.  But crash kernels are preloaded into a
+	 * reserved area of ram.  We must ensure the addresses
+	 * are in the reserved area otherwise preloading the
+	 * kernel could corrupt things.
+	 */
+	if (image->type == KEXEC_TYPE_CRASH) {
+		for (i = 0; i < nr_segments; i++) {
+			unsigned long mstart, mend;
+
+			mstart = image->segment[i].mem;
+			/* inclusive end, to match crashk_res.end */
+			mend = mstart + image->segment[i].memsz - 1;
+			/* Ensure we are within the crash kernel limits */
+			if ((mstart < crashk_res.start) ||
+			    (mend > crashk_res.end))
+				return -EADDRNOTAVAIL;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Allocate a zeroed struct kimage and set up its entry cursors, default
+ * type and page lists.  Returns NULL when the allocation fails.
+ */
+struct kimage *do_kimage_alloc_init(void)
+{
+	struct kimage *img = kzalloc(sizeof(*img), GFP_KERNEL);
+
+	if (img == NULL)
+		return NULL;
+
+	/* Empty entry list: both cursors sit on the embedded head slot. */
+	img->head = 0;
+	img->entry = &img->head;
+	img->last_entry = &img->head;
+
+	img->control_page = ~0; /* By default this does not apply */
+	img->type = KEXEC_TYPE_DEFAULT;
+
+	/* Control, destination and unusable page lists all start empty. */
+	INIT_LIST_HEAD(&img->control_pages);
+	INIT_LIST_HEAD(&img->dest_pages);
+	INIT_LIST_HEAD(&img->unusable_pages);
+
+	return img;
+}
+
+/*
+ * Return 1 if [start, end) intersects the [mem, mem + memsz) range of
+ * any segment of @image, 0 otherwise.
+ */
+int kimage_is_destination_range(struct kimage *image,
+				unsigned long start,
+				unsigned long end)
+{
+	unsigned long seg;
+
+	for (seg = 0; seg < image->nr_segments; seg++) {
+		unsigned long seg_start = image->segment[seg].mem;
+		unsigned long seg_end = seg_start + image->segment[seg].memsz;
+
+		/* Disjoint ranges cannot be a hit; try the next segment. */
+		if (end <= seg_start || start >= seg_end)
+			continue;
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Allocate 2^order pages, record the order in the first page's private
+ * field and mark every page reserved.  Returns NULL on failure.
+ */
+static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
+{
+	struct page *pages = alloc_pages(gfp_mask, order);
+	unsigned int nr, idx;
+
+	if (!pages)
+		return NULL;
+
+	pages->mapping = NULL;
+	/* Stash the order so kimage_free_pages() can read it back. */
+	set_page_private(pages, order);
+
+	nr = 1 << order;
+	for (idx = 0; idx < nr; idx++)
+		SetPageReserved(pages + idx);
+
+	return pages;
+}
+
+/*
+ * Undo kimage_alloc_pages(): clear the reserved bit on each page of the
+ * block and hand the block back to the page allocator.
+ */
+static void kimage_free_pages(struct page *page)
+{
+	/* The allocation order is read back from the page's private field. */
+	unsigned int order = page_private(page);
+	unsigned int nr = 1 << order;
+	unsigned int idx = 0;
+
+	while (idx < nr) {
+		ClearPageReserved(page + idx);
+		idx++;
+	}
+	__free_pages(page, order);
+}
+
+/* Unlink and free every page block chained on @list through page->lru. */
+void kimage_free_page_list(struct list_head *list)
+{
+	struct page *page, *tmp;
+
+	/* Safe iteration: each entry is removed while we walk the list. */
+	list_for_each_entry_safe(page, tmp, list, lru) {
+		list_del(&page->lru);
+		kimage_free_pages(page);
+	}
+}
+
+/*
+ * Allocate 2^order control pages for a normal (non-crash) image.
+ * Keeps allocating until a block is found that lies below
+ * KEXEC_CONTROL_MEMORY_LIMIT and does not intersect any destination
+ * range of @image; rejected blocks are freed before returning.
+ * Returns the block (also linked on image->control_pages) or NULL when
+ * the allocator runs dry.
+ */
+static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
+ unsigned int order)
+{
+ /* Control pages are special, they are the intermediaries
+ * that are needed while we copy the rest of the pages
+ * to their final resting place. As such they must
+ * not conflict with either the destination addresses
+ * or memory the kernel is already using.
+ *
+ * The only case where we really need more than one of
+ * these are for architectures where we cannot disable
+ * the MMU and must instead generate an identity mapped
+ * page table for all of the memory.
+ *
+ * At worst this runs in O(N) of the image size.
+ */
+ struct list_head extra_pages;
+ struct page *pages;
+ unsigned int count;
+
+ count = 1 << order;
+ INIT_LIST_HEAD(&extra_pages);
+
+ /* Loop while I can allocate a page and the page allocated
+ * is a destination page.
+ */
+ do {
+ unsigned long pfn, epfn, addr, eaddr;
+
+ pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
+ if (!pages)
+ break;
+ pfn = page_to_pfn(pages);
+ epfn = pfn + count;
+ addr = pfn << PAGE_SHIFT;
+ eaddr = epfn << PAGE_SHIFT;
+ /* Unusable block: park it so the allocator cannot hand it out again */
+ if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
+ kimage_is_destination_range(image, addr, eaddr)) {
+ list_add(&pages->lru, &extra_pages);
+ pages = NULL;
+ }
+ } while (!pages);
+
+ if (pages) {
+ /* Remember the allocated page... */
+ list_add(&pages->lru, &image->control_pages);
+
+ /* Because the page is already in its destination
+ * location we will never allocate another page at
+ * that address. Therefore kimage_alloc_pages
+ * will not return it (again) and we don't need
+ * to give it an entry in image->segment[].
+ */
+ }
+ /* Deal with the destination pages I have inadvertently allocated.
+ *
+ * Ideally I would convert multi-page allocations into single
+ * page allocations, and add everything to image->dest_pages.
+ *
+ * For now it is simpler to just free the pages.
+ */
+ kimage_free_page_list(&extra_pages);
+
+ return pages;
+}
+
+/*
+ * Find 2^order control pages for a crash image inside crashk_res.
+ * The search starts at image->control_page (rounded up to the block
+ * size) and skips over every segment of @image.  Returns the first page
+ * of the hole found, or NULL when no hole fits below crashk_res.end and
+ * KEXEC_CRASH_CONTROL_MEMORY_LIMIT.
+ */
+static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
+ unsigned int order)
+{
+ /* Control pages are special, they are the intermediaries
+ * that are needed while we copy the rest of the pages
+ * to their final resting place. As such they must
+ * not conflict with either the destination addresses
+ * or memory the kernel is already using.
+ *
+ * Control pages are also the only pages we must allocate
+ * when loading a crash kernel. All of the other pages
+ * are specified by the segments and we just memcpy
+ * into them directly.
+ *
+ * The only case where we really need more than one of
+ * these are for architectures where we cannot disable
+ * the MMU and must instead generate an identity mapped
+ * page table for all of the memory.
+ *
+ * Given the low demand this implements a very simple
+ * allocator that finds the first hole of the appropriate
+ * size in the reserved memory region, and allocates all
+ * of the memory up to and including the hole.
+ */
+ unsigned long hole_start, hole_end, size;
+ struct page *pages;
+
+ pages = NULL;
+ size = (1 << order) << PAGE_SHIFT;
+ /* Candidate hole: size-aligned, hole_end is inclusive */
+ hole_start = (image->control_page + (size - 1)) & ~(size - 1);
+ hole_end = hole_start + size - 1;
+ while (hole_end <= crashk_res.end) {
+ unsigned long i;
+
+ if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
+ break;
+ /* See if I overlap any of the segments */
+ for (i = 0; i < image->nr_segments; i++) {
+ unsigned long mstart, mend;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz - 1;
+ if ((hole_end >= mstart) && (hole_start <= mend)) {
+ /* Advance the hole to the end of the segment */
+ hole_start = (mend + (size - 1)) & ~(size - 1);
+ hole_end = hole_start + size - 1;
+ break;
+ }
+ }
+ /* If I don't overlap any segments I have found my hole! */
+ if (i == image->nr_segments) {
+ pages = pfn_to_page(hole_start >> PAGE_SHIFT);
+ break;
+ }
+ }
+ /* Record where this hole ends; the next search starts from there */
+ if (pages)
+ image->control_page = hole_end;
+
+ return pages;
+}
+
+
+/*
+ * Allocate 2^order control pages using the allocator that matches the
+ * image type.  Returns NULL on failure or for an unrecognised type.
+ */
+struct page *kimage_alloc_control_pages(struct kimage *image,
+					 unsigned int order)
+{
+	if (image->type == KEXEC_TYPE_DEFAULT)
+		return kimage_alloc_normal_control_pages(image, order);
+	if (image->type == KEXEC_TYPE_CRASH)
+		return kimage_alloc_crash_control_pages(image, order);
+
+	return NULL;
+}
+
+/*
+ * Append @entry to the entry list of @image, growing the list with a
+ * freshly allocated indirection page when the current page is full.
+ * Returns 0 on success or -ENOMEM when that page cannot be allocated.
+ */
+static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
+{
+ /* Step past the slot written last; a zero slot is still free */
+ if (*image->entry != 0)
+ image->entry++;
+
+ /* Out of room: chain in a new page through the last slot */
+ if (image->entry == image->last_entry) {
+ kimage_entry_t *ind_page;
+ struct page *page;
+
+ page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
+ if (!page)
+ return -ENOMEM;
+
+ ind_page = page_address(page);
+ *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+ image->entry = ind_page;
+ /* The final slot of the new page is kept for the next link */
+ image->last_entry = ind_page +
+ ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+ }
+ *image->entry = entry;
+ image->entry++;
+ /* Keep the list zero-terminated */
+ *image->entry = 0;
+
+ return 0;
+}
+
+/*
+ * Append a destination entry for @destination (rounded down to a page
+ * boundary).  Returns the result of kimage_add_entry().
+ */
+static int kimage_set_destination(struct kimage *image,
+				   unsigned long destination)
+{
+	unsigned long aligned = destination & PAGE_MASK;
+
+	return kimage_add_entry(image, aligned | IND_DESTINATION);
+}
+
+
+/*
+ * Append a source entry for the page at address @page (rounded down to
+ * a page boundary).  Returns the result of kimage_add_entry().
+ */
+static int kimage_add_page(struct kimage *image, unsigned long page)
+{
+	unsigned long aligned = page & PAGE_MASK;
+
+	return kimage_add_entry(image, aligned | IND_SOURCE);
+}
+
+
+/*
+ * Release the pages parked on the image's dest_pages and unusable_pages
+ * lists.
+ */
+static void kimage_free_extra_pages(struct kimage *image)
+{
+	/* Spare destination pages first ... */
+	kimage_free_page_list(&image->dest_pages);
+	/* ... then the cached pages that could not be used at all. */
+	kimage_free_page_list(&image->unusable_pages);
+}
+/*
+ * Close the entry list of @image by writing IND_DONE into the next free
+ * slot.
+ */
+void kimage_terminate(struct kimage *image)
+{
+ /* Do not overwrite the entry written last */
+ if (*image->entry != 0)
+ image->entry++;
+
+ *image->entry = IND_DONE;
+}
+
+/*
+ * Walk the entry list of @image.  @ptr is the cursor and @entry receives
+ * the value at the cursor.  The walk stops at a zero entry or at an
+ * entry with IND_DONE set; an IND_INDIRECTION entry moves the cursor to
+ * the page it names, any other entry moves it to the next slot.
+ * Arguments are parenthesized so that expressions can be passed safely.
+ */
+#define for_each_kimage_entry(image, ptr, entry) \
+	for ((ptr) = &(image)->head; \
+		((entry) = *(ptr)) && !((entry) & IND_DONE); \
+		(ptr) = ((entry) & IND_INDIRECTION) ? \
+			phys_to_virt(((entry) & PAGE_MASK)) : (ptr) + 1)
+
+/* Free the page block whose page frame number is entry >> PAGE_SHIFT. */
+static void kimage_free_entry(kimage_entry_t entry)
+{
+	kimage_free_pages(pfn_to_page(entry >> PAGE_SHIFT));
+}
+
+/*
+ * Release everything owned by @image: spare pages, every source and
+ * indirection page named in the entry list, the control pages, any
+ * file-mode buffers and finally the structure itself.  A NULL @image is
+ * accepted and ignored.
+ */
+void kimage_free(struct kimage *image)
+{
+ kimage_entry_t *ptr, entry;
+ kimage_entry_t ind = 0;
+
+ if (!image)
+ return;
+
+ kimage_free_extra_pages(image);
+ for_each_kimage_entry(image, ptr, entry) {
+ if (entry & IND_INDIRECTION) {
+ /* Free the previous indirection page */
+ if (ind & IND_INDIRECTION)
+ kimage_free_entry(ind);
+ /* Save this indirection page until we are
+ * done with it.
+ */
+ ind = entry;
+ } else if (entry & IND_SOURCE)
+ kimage_free_entry(entry);
+ }
+ /* Free the final indirection page */
+ if (ind & IND_INDIRECTION)
+ kimage_free_entry(ind);
+
+ /* Handle any machine specific cleanup */
+ machine_kexec_cleanup(image);
+
+ /* Free the kexec control pages... */
+ kimage_free_page_list(&image->control_pages);
+
+ /*
+ * Free up any temporary buffers allocated. This might hit if
+ * error occurred much later after buffer allocation.
+ */
+ if (image->file_mode)
+ kimage_file_post_load_cleanup(image);
+
+ kfree(image);
+}
+
+/*
+ * Look for the source entry whose destination address equals @page.
+ * Returns a pointer to that entry's slot, or NULL if the entry list
+ * holds no source for that destination.
+ */
+static kimage_entry_t *kimage_dst_used(struct kimage *image,
+					unsigned long page)
+{
+	kimage_entry_t *slot, val;
+	unsigned long dest = 0;
+
+	for_each_kimage_entry(image, slot, val) {
+		/* A destination entry restarts the running address. */
+		if (val & IND_DESTINATION) {
+			dest = val & PAGE_MASK;
+			continue;
+		}
+		if (!(val & IND_SOURCE))
+			continue;
+		if (dest == page)
+			return slot;
+		/* Each source entry consumes one destination page. */
+		dest += PAGE_SIZE;
+	}
+
+	return NULL;
+}
+
+/*
+ * Allocate one page to hold source data destined for @destination
+ * (KIMAGE_NO_DEST when the destination does not matter).  Returns the
+ * page, or NULL when the page allocator fails.
+ */
+static struct page *kimage_alloc_page(struct kimage *image,
+ gfp_t gfp_mask,
+ unsigned long destination)
+{
+ /*
+ * Here we implement safeguards to ensure that a source page
+ * is not copied to its destination page before the data on
+ * the destination page is no longer useful.
+ *
+ * To do this we maintain the invariant that a source page is
+ * either its own destination page, or it is not a
+ * destination page at all.
+ *
+ * That is slightly stronger than required, but the proof
+ * that no problems will occur is trivial, and the
+ * implementation is simple to verify.
+ *
+ * When allocating all pages normally this algorithm will run
+ * in O(N) time, but in the worst case it will run in O(N^2)
+ * time. If the runtime is a problem the data structures can
+ * be fixed.
+ */
+ struct page *page;
+ unsigned long addr;
+
+ /*
+ * Walk through the list of destination pages, and see if I
+ * have a match.
+ */
+ list_for_each_entry(page, &image->dest_pages, lru) {
+ addr = page_to_pfn(page) << PAGE_SHIFT;
+ if (addr == destination) {
+ list_del(&page->lru);
+ return page;
+ }
+ }
+ page = NULL;
+ while (1) {
+ kimage_entry_t *old;
+
+ /* Allocate a page, if we run out of memory give up */
+ page = kimage_alloc_pages(gfp_mask, 0);
+ if (!page)
+ return NULL;
+ /* If the page cannot be used file it away */
+ if (page_to_pfn(page) >
+ (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+ list_add(&page->lru, &image->unusable_pages);
+ continue;
+ }
+ addr = page_to_pfn(page) << PAGE_SHIFT;
+
+ /* If it is the destination page we want use it */
+ if (addr == destination)
+ break;
+
+ /* If the page is not a destination page use it */
+ if (!kimage_is_destination_range(image, addr,
+ addr + PAGE_SIZE))
+ break;
+
+ /*
+ * I know that the page is someone's destination page.
+ * See if there is already a source page for this
+ * destination page. And if so swap the source pages.
+ */
+ old = kimage_dst_used(image, addr);
+ if (old) {
+ /* If so move it */
+ unsigned long old_addr;
+ struct page *old_page;
+
+ old_addr = *old & PAGE_MASK;
+ old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+ copy_highpage(page, old_page);
+ /* Repoint the entry at the new page, keeping its flag bits */
+ *old = addr | (*old & ~PAGE_MASK);
+
+ /* The old page I have found cannot be a
+ * destination page, so return it if its
+ * gfp_flags honor the ones passed in.
+ */
+ if (!(gfp_mask & __GFP_HIGHMEM) &&
+ PageHighMem(old_page)) {
+ kimage_free_pages(old_page);
+ continue;
+ }
+ addr = old_addr;
+ page = old_page;
+ break;
+ }
+ /* Place the page on the destination list, to be used later */
+ list_add(&page->lru, &image->dest_pages);
+ }
+
+ return page;
+}
+
+/*
+ * Load one segment of a normal image: record its destination, then for
+ * each page of memsz allocate a source page, add it to the entry list
+ * and fill it from the segment buffer (kernel buffer in file mode, user
+ * buffer otherwise).  Bytes beyond bufsz stay zero.
+ * Returns 0, -ENOMEM, -EFAULT or the error from the entry-list helpers.
+ */
+static int kimage_load_normal_segment(struct kimage *image,
+ struct kexec_segment *segment)
+{
+ unsigned long maddr;
+ size_t ubytes, mbytes;
+ int result;
+ unsigned char __user *buf = NULL;
+ unsigned char *kbuf = NULL;
+
+ result = 0;
+ if (image->file_mode)
+ kbuf = segment->kbuf;
+ else
+ buf = segment->buf;
+ ubytes = segment->bufsz;
+ mbytes = segment->memsz;
+ maddr = segment->mem;
+
+ result = kimage_set_destination(image, maddr);
+ if (result < 0)
+ goto out;
+
+ while (mbytes) {
+ struct page *page;
+ char *ptr;
+ size_t uchunk, mchunk;
+
+ page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
+ if (!page) {
+ result = -ENOMEM;
+ goto out;
+ }
+ result = kimage_add_page(image, page_to_pfn(page)
+ << PAGE_SHIFT);
+ if (result < 0)
+ goto out;
+
+ ptr = kmap(page);
+ /* Start with a clear page */
+ clear_page(ptr);
+ ptr += maddr & ~PAGE_MASK;
+ /* mchunk: bytes of this page covered; uchunk: bytes to copy in */
+ mchunk = min_t(size_t, mbytes,
+ PAGE_SIZE - (maddr & ~PAGE_MASK));
+ uchunk = min(ubytes, mchunk);
+
+ /* For file based kexec, source pages are in kernel memory */
+ if (image->file_mode)
+ memcpy(ptr, kbuf, uchunk);
+ else
+ result = copy_from_user(ptr, buf, uchunk);
+ kunmap(page);
+ if (result) {
+ result = -EFAULT;
+ goto out;
+ }
+ ubytes -= uchunk;
+ maddr += mchunk;
+ if (image->file_mode)
+ kbuf += mchunk;
+ else
+ buf += mchunk;
+ mbytes -= mchunk;
+ }
+out:
+ return result;
+}
+
+/*
+ * Load one segment of a crash image by writing straight into the pages
+ * at segment->mem.  Returns 0, -ENOMEM or -EFAULT.
+ */
+static int kimage_load_crash_segment(struct kimage *image,
+ struct kexec_segment *segment)
+{
+ /* For crash dumps kernels we simply copy the data from
+ * user space to its destination.
+ * We do things a page at a time for the sake of kmap.
+ */
+ unsigned long maddr;
+ size_t ubytes, mbytes;
+ int result;
+ unsigned char __user *buf = NULL;
+ unsigned char *kbuf = NULL;
+
+ result = 0;
+ if (image->file_mode)
+ kbuf = segment->kbuf;
+ else
+ buf = segment->buf;
+ ubytes = segment->bufsz;
+ mbytes = segment->memsz;
+ maddr = segment->mem;
+ while (mbytes) {
+ struct page *page;
+ char *ptr;
+ size_t uchunk, mchunk;
+
+ page = pfn_to_page(maddr >> PAGE_SHIFT);
+ if (!page) {
+ result = -ENOMEM;
+ goto out;
+ }
+ ptr = kmap(page);
+ ptr += maddr & ~PAGE_MASK;
+ /* mchunk: bytes of this page covered; uchunk: bytes to copy in */
+ mchunk = min_t(size_t, mbytes,
+ PAGE_SIZE - (maddr & ~PAGE_MASK));
+ uchunk = min(ubytes, mchunk);
+ if (mchunk > uchunk) {
+ /* Zero the trailing part of the page */
+ memset(ptr + uchunk, 0, mchunk - uchunk);
+ }
+
+ /* For file based kexec, source pages are in kernel memory */
+ if (image->file_mode)
+ memcpy(ptr, kbuf, uchunk);
+ else
+ result = copy_from_user(ptr, buf, uchunk);
+ kexec_flush_icache_page(page);
+ kunmap(page);
+ if (result) {
+ result = -EFAULT;
+ goto out;
+ }
+ ubytes -= uchunk;
+ maddr += mchunk;
+ if (image->file_mode)
+ kbuf += mchunk;
+ else
+ buf += mchunk;
+ mbytes -= mchunk;
+ }
+out:
+ return result;
+}
+
+/*
+ * Load @segment with the loader that matches the image type.  Returns
+ * the loader's result, or -ENOMEM for an unrecognised type.
+ */
+int kimage_load_segment(struct kimage *image,
+			struct kexec_segment *segment)
+{
+	if (image->type == KEXEC_TYPE_DEFAULT)
+		return kimage_load_normal_segment(image, segment);
+	if (image->type == KEXEC_TYPE_CRASH)
+		return kimage_load_crash_segment(image, segment);
+
+	return -ENOMEM;
+}
+
+/* Image handed to machine_kexec() on a regular kexec */
+struct kimage *kexec_image;
+/* Image handed to machine_kexec() by crash_kexec(); NULL when none is set */
+struct kimage *kexec_crash_image;
+/* NOTE(review): presumably a switch that blocks further loads -- not used in this file section, confirm */
+int kexec_load_disabled;
+
+/*
+ * Boot the crash image, if one is set, after saving registers and
+ * vmcoreinfo.  Does nothing when kexec_mutex is already held or no
+ * crash image is set.
+ */
+void crash_kexec(struct pt_regs *regs)
+{
+ /* Take the kexec_mutex here to prevent sys_kexec_load
+ * running on one cpu from replacing the crash kernel
+ * we are using after a panic on a different cpu.
+ *
+ * If the crash kernel was not located in a fixed area
+ * of memory the xchg(&kexec_crash_image) would be
+ * sufficient. But since I reuse the memory...
+ */
+ if (mutex_trylock(&kexec_mutex)) {
+ if (kexec_crash_image) {
+ struct pt_regs fixed_regs;
+
+ crash_setup_regs(&fixed_regs, regs);
+ crash_save_vmcoreinfo();
+ machine_crash_shutdown(&fixed_regs);
+ machine_kexec(kexec_crash_image);
+ }
+ mutex_unlock(&kexec_mutex);
+ }
+}
+
+/*
+ * Return the size of crashk_res, or 0 when its start and end are equal.
+ * Takes kexec_mutex while reading the resource.
+ */
+size_t crash_get_memory_size(void)
+{
+	size_t bytes;
+
+	mutex_lock(&kexec_mutex);
+	bytes = (crashk_res.end == crashk_res.start) ?
+		0 : resource_size(&crashk_res);
+	mutex_unlock(&kexec_mutex);
+
+	return bytes;
+}
+
+/*
+ * Weak default: free the reserved pages covering the physical range
+ * [begin, end), one page at a time.
+ */
+void __weak crash_free_reserved_phys_range(unsigned long begin,
+					    unsigned long end)
+{
+	unsigned long paddr = begin;
+
+	while (paddr < end) {
+		free_reserved_page(pfn_to_page(paddr >> PAGE_SHIFT));
+		paddr += PAGE_SIZE;
+	}
+}
+
+/*
+ * crash_shrink_memory - give part of the crash kernel reservation back
+ * @new_size: requested size of the reservation in bytes
+ *
+ * Frees the tail of crashk_res beyond the (aligned) new end and
+ * registers the freed range as "System RAM".
+ *
+ * Returns 0 on success or when there is nothing to release, -ENOENT
+ * when a crash image is loaded, -EINVAL when @new_size is larger than
+ * the current size, and -ENOMEM when the resource descriptor cannot be
+ * allocated.
+ */
+int crash_shrink_memory(unsigned long new_size)
+{
+	int ret = 0;
+	unsigned long start, end;
+	unsigned long old_size;
+	struct resource *ram_res;
+
+	mutex_lock(&kexec_mutex);
+
+	if (kexec_crash_image) {
+		ret = -ENOENT;
+		goto unlock;
+	}
+	start = crashk_res.start;
+	end = crashk_res.end;
+	old_size = (end == 0) ? 0 : end - start + 1;
+	if (new_size >= old_size) {
+		ret = (new_size == old_size) ? 0 : -EINVAL;
+		goto unlock;
+	}
+
+	start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
+	end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
+
+	/*
+	 * Alignment can push the new end up to or past the current one.
+	 * Then there is nothing to release: bail out before allocating a
+	 * descriptor that could never be inserted (and would leak), and
+	 * before crashk_res.end could be moved upwards.
+	 */
+	if (end > crashk_res.end)
+		goto unlock;
+
+	ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
+	if (!ram_res) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+
+	crash_map_reserved_pages();
+	crash_free_reserved_phys_range(end, crashk_res.end);
+
+	/* The whole reservation is going away: drop it from the tree */
+	if ((start == end) && (crashk_res.parent != NULL))
+		release_resource(&crashk_res);
+
+	ram_res->start = end;
+	ram_res->end = crashk_res.end;
+	ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+	ram_res->name = "System RAM";
+
+	crashk_res.end = end - 1;
+
+	/* A descriptor that could not be inserted is owned by nobody */
+	if (insert_resource(&iomem_resource, ram_res))
+		kfree(ram_res);
+	crash_unmap_reserved_pages();
+
+unlock:
+	mutex_unlock(&kexec_mutex);
+	return ret;
+}
+
+/*
+ * Write one ELF note (header, NUL-terminated name, payload) at @buf.
+ * Each part advances the cursor by its size rounded up to whole u32
+ * words.  Returns the cursor just past the note.
+ */
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+			    size_t data_len)
+{
+	struct elf_note hdr;
+	u32 *cursor = buf;
+
+	hdr.n_namesz = strlen(name) + 1;
+	hdr.n_descsz = data_len;
+	hdr.n_type = type;
+
+	memcpy(cursor, &hdr, sizeof(hdr));
+	cursor += (sizeof(hdr) + 3) / 4;
+
+	memcpy(cursor, name, hdr.n_namesz);
+	cursor += (hdr.n_namesz + 3) / 4;
+
+	memcpy(cursor, data, hdr.n_descsz);
+	cursor += (hdr.n_descsz + 3) / 4;
+
+	return cursor;
+}
+
+/* Write a note header whose three fields are all zero at @buf. */
+static void final_note(u32 *buf)
+{
+	struct elf_note empty;
+
+	empty.n_type = 0;
+	empty.n_descsz = 0;
+	empty.n_namesz = 0;
+	memcpy(buf, &empty, sizeof(empty));
+}
+
+/*
+ * Store @regs for @cpu as an NT_PRSTATUS ELF note in that cpu's
+ * crash_notes buffer.  Silently returns for an out-of-range @cpu or a
+ * missing buffer.
+ */
+void crash_save_cpu(struct pt_regs *regs, int cpu)
+{
+	struct elf_prstatus prstatus;
+	u32 *note;
+
+	if (cpu < 0 || cpu >= nr_cpu_ids)
+		return;
+
+	/*
+	 * ELF notes are used opportunistically: they give a well defined
+	 * layout for the data and a tag saying what was stored, so there
+	 * is no need to invent a new format.
+	 */
+	note = (u32 *)per_cpu_ptr(crash_notes, cpu);
+	if (note == NULL)
+		return;
+
+	memset(&prstatus, 0, sizeof(prstatus));
+	prstatus.pr_pid = current->pid;
+	elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
+
+	note = append_elf_note(note, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
+			       &prstatus, sizeof(prstatus));
+	final_note(note);
+}
+
+/*
+ * Allocate the per-cpu crash_notes buffers used for saving cpu register
+ * state.  Returns 0, or -ENOMEM when the allocation fails.
+ */
+static int __init crash_notes_memory_init(void)
+{
+	crash_notes = alloc_percpu(note_buf_t);
+	if (crash_notes)
+		return 0;
+
+	pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
+	return -ENOMEM;
+}
+subsys_initcall(crash_notes_memory_init);
+
+
+/*
+ * parsing the "crashkernel" commandline
+ *
+ * this code is intended to be called from architecture specific code
+ */
+
+
+/*
+ * This function parses command lines in the format
+ *
+ * crashkernel=ramsize-range:size[,...][@offset]
+ *
+ * @cmdline points just past "crashkernel=".  The size of the first
+ * range that contains @system_ram is stored in *crash_size; an optional
+ * "@offset" is stored in *crash_base.  Neither output is written when
+ * no range matches.
+ *
+ * The function returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_mem(char *cmdline,
+					unsigned long long system_ram,
+					unsigned long long *crash_size,
+					unsigned long long *crash_base)
+{
+	char *cur = cmdline, *tmp;
+
+	/* for each entry of the comma-separated list */
+	do {
+		unsigned long long start, end = ULLONG_MAX, size;
+
+		/* get the start of the range */
+		start = memparse(cur, &tmp);
+		if (cur == tmp) {
+			pr_warn("crashkernel: Memory value expected\n");
+			return -EINVAL;
+		}
+		cur = tmp;
+		if (*cur != '-') {
+			pr_warn("crashkernel: '-' expected\n");
+			return -EINVAL;
+		}
+		cur++;
+
+		/* if no ':' is here, then we read the end */
+		if (*cur != ':') {
+			end = memparse(cur, &tmp);
+			if (cur == tmp) {
+				pr_warn("crashkernel: Memory value expected\n");
+				return -EINVAL;
+			}
+			cur = tmp;
+			if (end <= start) {
+				pr_warn("crashkernel: end <= start\n");
+				return -EINVAL;
+			}
+		}
+
+		if (*cur != ':') {
+			pr_warn("crashkernel: ':' expected\n");
+			return -EINVAL;
+		}
+		cur++;
+
+		size = memparse(cur, &tmp);
+		if (cur == tmp) {
+			/* same prefix as every other message in this parser */
+			pr_warn("crashkernel: Memory value expected\n");
+			return -EINVAL;
+		}
+		cur = tmp;
+		if (size >= system_ram) {
+			pr_warn("crashkernel: invalid size\n");
+			return -EINVAL;
+		}
+
+		/* match ? */
+		if (system_ram >= start && system_ram < end) {
+			*crash_size = size;
+			break;
+		}
+	} while (*cur++ == ',');
+
+	if (*crash_size > 0) {
+		/* skip the remaining ranges up to an '@' or the end of the option */
+		while (*cur && *cur != ' ' && *cur != '@')
+			cur++;
+		if (*cur == '@') {
+			cur++;
+			*crash_base = memparse(cur, &tmp);
+			if (cur == tmp) {
+				pr_warn("crashkernel: Memory value expected after '@'\n");
+				return -EINVAL;
+			}
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * That function parses "simple" (old) crashkernel command lines like
+ *
+ * crashkernel=size[@offset]
+ *
+ * It returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_simple(char *cmdline,
+					   unsigned long long *crash_size,
+					   unsigned long long *crash_base)
+{
+	char *pos = cmdline;
+
+	*crash_size = memparse(cmdline, &pos);
+	if (pos == cmdline) {
+		pr_warn("crashkernel: memory value expected\n");
+		return -EINVAL;
+	}
+
+	/* Only "@offset", a blank or the end of string may follow the size. */
+	switch (*pos) {
+	case '@':
+		*crash_base = memparse(pos + 1, &pos);
+		break;
+	case ' ':
+	case '\0':
+		break;
+	default:
+		pr_warn("crashkernel: unrecognized char\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* Indices into suffix_tbl[] */
+#define SUFFIX_HIGH 0
+#define SUFFIX_LOW 1
+#define SUFFIX_NULL 2
+/*
+ * Known "crashkernel=size,<suffix>" suffixes.  The NULL entry ends the
+ * table; get_last_crashkernel() iterates until it reaches it.
+ */
+static __initdata char *suffix_tbl[] = {
+ [SUFFIX_HIGH] = ",high",
+ [SUFFIX_LOW] = ",low",
+ [SUFFIX_NULL] = NULL,
+};
+
+/*
+ * That function parses "suffix" crashkernel command lines like
+ *
+ * crashkernel=size,[high|low]
+ *
+ * It returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_suffix(char *cmdline,
+					   unsigned long long *crash_size,
+					   const char *suffix)
+{
+	char *pos = cmdline;
+	size_t suffix_len;
+
+	*crash_size = memparse(cmdline, &pos);
+	if (pos == cmdline) {
+		pr_warn("crashkernel: memory value expected\n");
+		return -EINVAL;
+	}
+
+	/* The size must be followed by exactly the requested suffix ... */
+	suffix_len = strlen(suffix);
+	if (strncmp(pos, suffix, suffix_len) != 0) {
+		pr_warn("crashkernel: unrecognized char\n");
+		return -EINVAL;
+	}
+	pos += suffix_len;
+
+	/* ... and the suffix by a blank or the end of the string. */
+	if (*pos == ' ' || *pos == '\0')
+		return 0;
+
+	pr_warn("crashkernel: unrecognized char\n");
+	return -EINVAL;
+}
+
+/*
+ * Return the last occurrence of @name in @cmdline that qualifies, or
+ * NULL if there is none.  With a @suffix, an occurrence qualifies when
+ * its option text ends in that suffix; without one, it qualifies when
+ * it ends in none of the suffixes listed in suffix_tbl[].
+ */
+static __init char *get_last_crashkernel(char *cmdline,
+					 const char *name,
+					 const char *suffix)
+{
+	char *last = NULL;
+	char *p;
+
+	for (p = strstr(cmdline, name); p; p = strstr(p + 1, name)) {
+		char *end_p = strchr(p, ' ');
+		char *q;
+
+		/* The option runs up to the next blank or the end of string. */
+		if (!end_p)
+			end_p = p + strlen(p);
+
+		if (suffix) {
+			q = end_p - strlen(suffix);
+			if (!strncmp(q, suffix, strlen(suffix)))
+				last = p;
+		} else {
+			bool has_suffix = false;
+			int i;
+
+			/* skip the one with any known suffix */
+			for (i = 0; suffix_tbl[i]; i++) {
+				q = end_p - strlen(suffix_tbl[i]);
+				if (!strncmp(q, suffix_tbl[i],
+					     strlen(suffix_tbl[i]))) {
+					has_suffix = true;
+					break;
+				}
+			}
+			if (!has_suffix)
+				last = p;
+		}
+	}
+
+	return last;
+}
+
+/*
+ * Common worker for the parse_crashkernel*() entry points.  Zeroes both
+ * outputs, finds the last matching @name option in @cmdline and hands
+ * the text after @name to the suffix, extended or simple parser.
+ * Returns -EINVAL when no matching option is present, otherwise the
+ * parser's result.
+ */
+static int __init __parse_crashkernel(char *cmdline,
+				      unsigned long long system_ram,
+				      unsigned long long *crash_size,
+				      unsigned long long *crash_base,
+				      const char *name,
+				      const char *suffix)
+{
+	char *colon, *space;
+	char *opt;
+
+	BUG_ON(!crash_size || !crash_base);
+	*crash_size = 0;
+	*crash_base = 0;
+
+	opt = get_last_crashkernel(cmdline, name, suffix);
+	if (!opt)
+		return -EINVAL;
+
+	opt += strlen(name);
+
+	if (suffix)
+		return parse_crashkernel_suffix(opt, crash_size, suffix);
+
+	/*
+	 * A ':' inside this option (before the next blank, if any) means
+	 * the extended syntax; anything else is the classic syntax.
+	 */
+	colon = strchr(opt, ':');
+	space = strchr(opt, ' ');
+	if (!colon || (space && colon >= space))
+		return parse_crashkernel_simple(opt, crash_size, crash_base);
+
+	return parse_crashkernel_mem(opt, system_ram, crash_size, crash_base);
+}
+
+/*
+ * That function is the entry point for command line parsing and should be
+ * called from the arch-specific code.
+ */
+int __init parse_crashkernel(char *cmdline,
+			     unsigned long long system_ram,
+			     unsigned long long *crash_size,
+			     unsigned long long *crash_base)
+{
+	/* No suffix: options ending in a known suffix are ignored. */
+	const char *no_suffix = NULL;
+
+	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+				   "crashkernel=", no_suffix);
+}
+
+/*
+ * Parse the "crashkernel=" option whose value ends in the
+ * suffix_tbl[SUFFIX_HIGH] suffix. Same parameters and return value as
+ * parse_crashkernel().
+ */
+int __init parse_crashkernel_high(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+ "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
+}
+
+/*
+ * Parse the "crashkernel=" option whose value ends in the
+ * suffix_tbl[SUFFIX_LOW] suffix. Same parameters and return value as
+ * parse_crashkernel().
+ */
+int __init parse_crashkernel_low(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+ "crashkernel=", suffix_tbl[SUFFIX_LOW]);
+}
+
+/*
+ * Rebuild the note held in vmcoreinfo_note from the current contents of
+ * vmcoreinfo_data.  Does nothing while no vmcoreinfo data has been
+ * collected.
+ */
+static void update_vmcoreinfo_note(void)
+{
+	u32 *note_end;
+
+	if (vmcoreinfo_size == 0)
+		return;
+
+	note_end = append_elf_note(vmcoreinfo_note, VMCOREINFO_NOTE_NAME, 0,
+				   vmcoreinfo_data, vmcoreinfo_size);
+	final_note(note_end);
+}
+
+/*
+ * Append a "CRASHTIME=<get_seconds()>" line to the vmcoreinfo data and
+ * rebuild the vmcoreinfo note so that it includes the new line.
+ */
+void crash_save_vmcoreinfo(void)
+{
+ vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
+ update_vmcoreinfo_note();
+}
+
+/*
+ * Format a printf-style message and append it to vmcoreinfo_data.
+ *
+ * The message is first rendered into an 0x50 byte stack buffer, so longer
+ * output is truncated by vscnprintf().  The copy is further clipped to the
+ * room left in vmcoreinfo_data (vmcoreinfo_max_size - vmcoreinfo_size).
+ */
+void vmcoreinfo_append_str(const char *fmt, ...)
+{
+	char line[0x50];
+	va_list ap;
+	size_t len;
+
+	va_start(ap, fmt);
+	len = vscnprintf(line, sizeof(line), fmt, ap);
+	va_end(ap);
+
+	/* never write past the end of the vmcoreinfo buffer */
+	len = min(len, vmcoreinfo_max_size - vmcoreinfo_size);
+
+	memcpy(vmcoreinfo_data + vmcoreinfo_size, line, len);
+	vmcoreinfo_size += len;
+}
+
+/*
+ * Provide an empty default implementation here -- architecture
+ * code may override this weak symbol. It is called from
+ * crash_save_vmcoreinfo_init() just before the vmcoreinfo note is
+ * rebuilt.
+ */
+void __weak arch_crash_save_vmcoreinfo(void)
+{}
+
+/*
+ * Return the result of __pa() applied to the address of vmcoreinfo_note.
+ * Weak, so another definition may replace this default.
+ */
+unsigned long __weak paddr_vmcoreinfo_note(void)
+{
+ return __pa((unsigned long)(char *)&vmcoreinfo_note);
+}
+
+/*
+ * Initcall (registered with subsys_initcall() below) that records kernel
+ * layout information through the VMCOREINFO_* macros, which are defined
+ * outside this chunk: OS release and page size first, then symbols,
+ * structure sizes, member offsets, array lengths and numeric constants.
+ * Entries inside #ifdef/#ifndef are recorded only for the matching kernel
+ * configuration. Finally the architecture hook is called and the
+ * vmcoreinfo note is rebuilt. Always returns 0.
+ */
+static int __init crash_save_vmcoreinfo_init(void)
+{
+ VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
+ VMCOREINFO_PAGESIZE(PAGE_SIZE);
+
+ /* symbols */
+ VMCOREINFO_SYMBOL(init_uts_ns);
+ VMCOREINFO_SYMBOL(node_online_map);
+#ifdef CONFIG_MMU
+ VMCOREINFO_SYMBOL(swapper_pg_dir);
+#endif
+ VMCOREINFO_SYMBOL(_stext);
+ VMCOREINFO_SYMBOL(vmap_area_list);
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+ VMCOREINFO_SYMBOL(mem_map);
+ VMCOREINFO_SYMBOL(contig_page_data);
+#endif
+#ifdef CONFIG_SPARSEMEM
+ VMCOREINFO_SYMBOL(mem_section);
+ VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
+ VMCOREINFO_STRUCT_SIZE(mem_section);
+ VMCOREINFO_OFFSET(mem_section, section_mem_map);
+#endif
+ /* structure and type sizes */
+ VMCOREINFO_STRUCT_SIZE(page);
+ VMCOREINFO_STRUCT_SIZE(pglist_data);
+ VMCOREINFO_STRUCT_SIZE(zone);
+ VMCOREINFO_STRUCT_SIZE(free_area);
+ VMCOREINFO_STRUCT_SIZE(list_head);
+ VMCOREINFO_SIZE(nodemask_t);
+ /* member offsets */
+ VMCOREINFO_OFFSET(page, flags);
+ VMCOREINFO_OFFSET(page, _count);
+ VMCOREINFO_OFFSET(page, mapping);
+ VMCOREINFO_OFFSET(page, lru);
+ VMCOREINFO_OFFSET(page, _mapcount);
+ VMCOREINFO_OFFSET(page, private);
+ VMCOREINFO_OFFSET(pglist_data, node_zones);
+ VMCOREINFO_OFFSET(pglist_data, nr_zones);
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+ VMCOREINFO_OFFSET(pglist_data, node_mem_map);
+#endif
+ VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
+ VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
+ VMCOREINFO_OFFSET(pglist_data, node_id);
+ VMCOREINFO_OFFSET(zone, free_area);
+ VMCOREINFO_OFFSET(zone, vm_stat);
+ VMCOREINFO_OFFSET(zone, spanned_pages);
+ VMCOREINFO_OFFSET(free_area, free_list);
+ VMCOREINFO_OFFSET(list_head, next);
+ VMCOREINFO_OFFSET(list_head, prev);
+ VMCOREINFO_OFFSET(vmap_area, va_start);
+ VMCOREINFO_OFFSET(vmap_area, list);
+ VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
+ /* NOTE(review): presumably records the printk log buffer layout -- defined elsewhere, confirm */
+ log_buf_kexec_setup();
+ VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
+ /* numeric constants */
+ VMCOREINFO_NUMBER(NR_FREE_PAGES);
+ VMCOREINFO_NUMBER(PG_lru);
+ VMCOREINFO_NUMBER(PG_private);
+ VMCOREINFO_NUMBER(PG_swapcache);
+ VMCOREINFO_NUMBER(PG_slab);
+#ifdef CONFIG_MEMORY_FAILURE
+ VMCOREINFO_NUMBER(PG_hwpoison);
+#endif
+ VMCOREINFO_NUMBER(PG_head_mask);
+ VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
+#ifdef CONFIG_HUGETLBFS
+ VMCOREINFO_SYMBOL(free_huge_page);
+#endif
+
+ /* let the architecture add its own entries, then rebuild the note */
+ arch_crash_save_vmcoreinfo();
+ update_vmcoreinfo_note();
+
+ return 0;
+}
+
+subsys_initcall(crash_save_vmcoreinfo_init);
+
+/*
+ * Move into place and start executing a preloaded standalone
+ * executable. If nothing was preloaded return an error.
+ *
+ * Returns -EBUSY when kexec_mutex is already held and -EINVAL when no
+ * image is loaded. With CONFIG_KEXEC_JUMP and an image that has
+ * preserve_context set, the system is suspended step by step before
+ * machine_kexec() and resumed afterwards; a failing suspend step jumps
+ * into the resume sequence at the label that undoes the steps already
+ * taken, and its error code is returned. Otherwise the restart/shutdown
+ * path is taken before machine_kexec().
+ */
+int kernel_kexec(void)
+{
+ int error = 0;
+
+ if (!mutex_trylock(&kexec_mutex))
+ return -EBUSY;
+ if (!kexec_image) {
+ error = -EINVAL;
+ goto Unlock;
+ }
+
+#ifdef CONFIG_KEXEC_JUMP
+ if (kexec_image->preserve_context) {
+ lock_system_sleep();
+ pm_prepare_console();
+ error = freeze_processes();
+ if (error) {
+ /* any freeze failure is reported to the caller as -EBUSY */
+ error = -EBUSY;
+ goto Restore_console;
+ }
+ suspend_console();
+ error = dpm_suspend_start(PMSG_FREEZE);
+ if (error)
+ goto Resume_console;
+ /* At this point, dpm_suspend_start() has been called,
+ * but *not* dpm_suspend_end(). We *must* call
+ * dpm_suspend_end() now. Otherwise, drivers for
+ * some devices (e.g. interrupt controllers) become
+ * desynchronized with the actual state of the
+ * hardware at resume time, and evil weirdness ensues.
+ */
+ error = dpm_suspend_end(PMSG_FREEZE);
+ if (error)
+ goto Resume_devices;
+ error = disable_nonboot_cpus();
+ if (error)
+ goto Enable_cpus;
+ local_irq_disable();
+ error = syscore_suspend();
+ if (error)
+ goto Enable_irqs;
+ } else
+#endif
+ {
+ kexec_in_progress = true;
+ kernel_restart_prepare(NULL);
+ migrate_to_reboot_cpu();
+
+ /*
+ * migrate_to_reboot_cpu() disables CPU hotplug assuming that
+ * no further code needs to use CPU hotplug (which is true in
+ * the reboot case). However, the kexec path depends on using
+ * CPU hotplug again; so re-enable it here.
+ */
+ cpu_hotplug_enable();
+ pr_emerg("Starting new kernel\n");
+ machine_shutdown();
+ }
+
+ machine_kexec(kexec_image);
+
+#ifdef CONFIG_KEXEC_JUMP
+ /*
+ * Resume sequence: the labels undo the suspend steps above in
+ * reverse order and are also the targets of the error gotos.
+ */
+ if (kexec_image->preserve_context) {
+ syscore_resume();
+ Enable_irqs:
+ local_irq_enable();
+ Enable_cpus:
+ enable_nonboot_cpus();
+ dpm_resume_start(PMSG_RESTORE);
+ Resume_devices:
+ dpm_resume_end(PMSG_RESTORE);
+ Resume_console:
+ resume_console();
+ thaw_processes();
+ Restore_console:
+ pm_restore_console();
+ unlock_system_sleep();
+ }
+#endif
+
+ Unlock:
+ mutex_unlock(&kexec_mutex);
+ return error;
+}
+
+/*
+ * Add and remove page tables for crashkernel memory
+ *
+ * Provide empty default implementations here -- architecture
+ * code may override these weak symbols. Both defaults do nothing.
+ */
+void __weak crash_map_reserved_pages(void)
+{}
+
+void __weak crash_unmap_reserved_pages(void)
+{}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
new file mode 100644
index 000000000000..caf47e92e12e
--- /dev/null
+++ b/kernel/kexec_file.c
@@ -0,0 +1,1044 @@
+/*
+ * kexec: kexec_file_load system call
+ *
+ * Copyright (C) 2014 Red Hat Inc.
+ * Authors:
+ * Vivek Goyal <vgoyal@redhat.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <linux/capability.h>
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/kexec.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include <linux/syscalls.h>
+#include <linux/vmalloc.h>
+#include "kexec_internal.h"
+
+/*
+ * Declare these symbols weak so that if architecture provides a purgatory,
+ * these will be overridden.
+ *
+ * The defaults describe an empty blob: kexec_load_purgatory() rejects a
+ * kexec_purgatory_size of 0 with -EINVAL.
+ */
+char __weak kexec_purgatory[0];
+size_t __weak kexec_purgatory_size = 0;
+
+/* Defined further down; declared here because the syscall above it uses it. */
+static int kexec_calculate_store_digests(struct kimage *image);
+
+/*
+ * Read the complete file referenced by @fd into a newly vmalloc()ed buffer.
+ *
+ * @fd:      open file descriptor of the file to read
+ * @buf:     on success receives the buffer; the caller must vfree() it
+ * @buf_len: on success receives the number of bytes read
+ *
+ * Returns 0 on success or a negative errno: -EBADF for an invalid
+ * descriptor or a short read, -EFBIG for files larger than INT_MAX,
+ * -EINVAL for empty files, -ENOMEM when the buffer cannot be allocated, or
+ * the error reported by vfs_getattr()/kernel_read().
+ *
+ * Once the buffer has been allocated, every failure path frees it and
+ * resets *buf to NULL.  Callers run kimage_file_post_load_cleanup() on
+ * error, which vfree()s the very pointer stored through @buf; leaving a
+ * stale value behind would make that a double free.
+ */
+static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
+{
+	struct fd f = fdget(fd);
+	int ret;
+	struct kstat stat;
+	loff_t pos;
+	ssize_t bytes = 0;
+
+	if (!f.file)
+		return -EBADF;
+
+	ret = vfs_getattr(&f.file->f_path, &stat);
+	if (ret)
+		goto out;
+
+	if (stat.size > INT_MAX) {
+		ret = -EFBIG;
+		goto out;
+	}
+
+	/* Don't hand 0 to vmalloc, it whines. */
+	if (stat.size == 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	*buf = vmalloc(stat.size);
+	if (!*buf) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	pos = 0;
+	while (pos < stat.size) {
+		bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
+				    stat.size - pos);
+		if (bytes < 0) {
+			vfree(*buf);
+			*buf = NULL;
+			ret = bytes;
+			goto out;
+		}
+
+		/* end of file reached before stat.size bytes were read */
+		if (bytes == 0)
+			break;
+		pos += bytes;
+	}
+
+	if (pos != stat.size) {
+		ret = -EBADF;
+		vfree(*buf);
+		*buf = NULL;
+		goto out;
+	}
+
+	*buf_len = pos;
+out:
+	fdput(f);
+	return ret;
+}
+
+/*
+ * Architectures can provide this probe function. This weak default
+ * returns -ENOEXEC for every image.
+ */
+int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ return -ENOEXEC;
+}
+
+/*
+ * Weak default image loader: always fails with ERR_PTR(-ENOEXEC). The
+ * caller in this file stores a successful (non-error) return value in
+ * image->image_loader_data.
+ */
+void * __weak arch_kexec_kernel_image_load(struct kimage *image)
+{
+ return ERR_PTR(-ENOEXEC);
+}
+
+/*
+ * Weak default for the architecture cleanup hook called from
+ * kimage_file_post_load_cleanup(); does nothing.
+ */
+void __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
+{
+}
+
+/*
+ * Weak default signature check: rejects every image with -EKEYREJECTED.
+ * Only called when CONFIG_KEXEC_VERIFY_SIG is enabled (see
+ * kimage_file_prepare_segments()).
+ */
+int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ return -EKEYREJECTED;
+}
+
+/*
+ * Apply relocations of type RELA. This weak default logs an error and
+ * returns -ENOEXEC.
+ */
+int __weak
+arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+ unsigned int relsec)
+{
+ pr_err("RELA relocation unsupported.\n");
+ return -ENOEXEC;
+}
+
+/*
+ * Apply relocations of type REL. This weak default logs an error and
+ * returns -ENOEXEC.
+ */
+int __weak
+arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+ unsigned int relsec)
+{
+ pr_err("REL relocation unsupported.\n");
+ return -ENOEXEC;
+}
+
+/*
+ * Release the temporary buffers that hold the kernel, the initrd, the
+ * command line and the purgatory working copies.  They are only needed
+ * until their contents have been loaded into segments.  Each pointer is
+ * cleared after it is freed, so calling this function again is harmless.
+ */
+void kimage_file_post_load_cleanup(struct kimage *image)
+{
+	struct purgatory_info *purg = &image->purgatory_info;
+
+	/* file contents read by copy_file_from_fd() */
+	vfree(image->kernel_buf);
+	image->kernel_buf = NULL;
+	vfree(image->initrd_buf);
+	image->initrd_buf = NULL;
+
+	/* command line copied in from user space */
+	kfree(image->cmdline_buf);
+	image->cmdline_buf = NULL;
+
+	/* purgatory copy and its private section header table */
+	vfree(purg->purgatory_buf);
+	purg->purgatory_buf = NULL;
+	vfree(purg->sechdrs);
+	purg->sechdrs = NULL;
+
+	/* Give the architecture a chance to clean up first ... */
+	arch_kimage_file_post_load_cleanup(image);
+
+	/*
+	 * ... which should have released anything referenced from
+	 * image->image_loader_data, so the container itself can go now.
+	 */
+	kfree(image->image_loader_data);
+	image->image_loader_data = NULL;
+}
+
+/*
+ * In file mode list of segments is prepared by kernel. Copy relevant
+ * data from user space, do error checking, prepare segment list
+ *
+ * Reads the kernel (and, unless KEXEC_FILE_NO_INITRAMFS is set, the
+ * initrd) from the given descriptors, copies the command line from user
+ * space and calls the architecture probe/verify/load hooks.
+ *
+ * Returns 0 on success or a negative errno. Apart from a failure to read
+ * the kernel file, which returns directly, every error path releases the
+ * buffers through kimage_file_post_load_cleanup().
+ */
+static int
+kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
+ const char __user *cmdline_ptr,
+ unsigned long cmdline_len, unsigned flags)
+{
+ int ret = 0;
+ void *ldata;
+
+ ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
+ &image->kernel_buf_len);
+ if (ret)
+ return ret;
+
+ /* Call arch image probe handlers */
+ ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
+ image->kernel_buf_len);
+
+ if (ret)
+ goto out;
+
+#ifdef CONFIG_KEXEC_VERIFY_SIG
+ ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
+ image->kernel_buf_len);
+ if (ret) {
+ pr_debug("kernel signature verification failed.\n");
+ goto out;
+ }
+ pr_debug("kernel signature verification successful.\n");
+#endif
+ /* It is possible that no initramfs is being loaded */
+ if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
+ ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
+ &image->initrd_buf_len);
+ if (ret)
+ goto out;
+ }
+
+ if (cmdline_len) {
+ image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
+ if (!image->cmdline_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* copy_from_user() returns a count, so map any failure to -EFAULT */
+ ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
+ cmdline_len);
+ if (ret) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ image->cmdline_buf_len = cmdline_len;
+
+ /* command line should be a string with last byte null */
+ if (image->cmdline_buf[cmdline_len - 1] != '\0') {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ /* Call arch image load handlers */
+ ldata = arch_kexec_kernel_image_load(image);
+
+ if (IS_ERR(ldata)) {
+ ret = PTR_ERR(ldata);
+ goto out;
+ }
+
+ image->image_loader_data = ldata;
+out:
+ /* In case of error, free up all allocated memory in this function */
+ if (ret)
+ kimage_file_post_load_cleanup(image);
+ return ret;
+}
+
+/*
+ * Allocate a kimage for kexec_file_load, fill it from the given file
+ * descriptors and command line, and allocate its control pages.
+ *
+ * On success stores the new image in *rimage and returns 0.  On failure
+ * everything allocated here is released and a negative errno is returned.
+ */
+static int
+kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
+		       int initrd_fd, const char __user *cmdline_ptr,
+		       unsigned long cmdline_len, unsigned long flags)
+{
+	bool for_crash = flags & KEXEC_FILE_ON_CRASH;
+	struct kimage *img;
+	int err;
+
+	img = do_kimage_alloc_init();
+	if (!img)
+		return -ENOMEM;
+
+	img->file_mode = 1;
+
+	if (for_crash) {
+		/* Enable special crash kernel control page alloc policy. */
+		img->control_page = crashk_res.start;
+		img->type = KEXEC_TYPE_CRASH;
+	}
+
+	err = kimage_file_prepare_segments(img, kernel_fd, initrd_fd,
+					   cmdline_ptr, cmdline_len, flags);
+	if (err)
+		goto free_image;
+
+	err = sanity_check_segment_list(img);
+	if (err)
+		goto free_post_load_bufs;
+
+	img->control_code_page = kimage_alloc_control_pages(img,
+					get_order(KEXEC_CONTROL_PAGE_SIZE));
+	if (!img->control_code_page) {
+		pr_err("Could not allocate control_code_buffer\n");
+		err = -ENOMEM;
+		goto free_post_load_bufs;
+	}
+
+	/* a swap page is only allocated for the non-crash image */
+	if (!for_crash) {
+		img->swap_page = kimage_alloc_control_pages(img, 0);
+		if (!img->swap_page) {
+			pr_err("Could not allocate swap buffer\n");
+			err = -ENOMEM;
+			goto free_control_pages;
+		}
+	}
+
+	*rimage = img;
+	return 0;
+
+free_control_pages:
+	kimage_free_page_list(&img->control_pages);
+free_post_load_bufs:
+	kimage_file_post_load_cleanup(img);
+free_image:
+	kfree(img);
+	return err;
+}
+
+/*
+ * kexec_file_load system call: build a kexec image from the kernel and
+ * initrd file descriptors plus a user supplied command line, and install
+ * it as the normal image or, with KEXEC_FILE_ON_CRASH, as the crash image.
+ * With KEXEC_FILE_UNLOAD the currently installed image is removed instead.
+ *
+ * Returns 0 on success, -EPERM without CAP_SYS_BOOT or when loading is
+ * disabled, -EINVAL for unknown flags, -EBUSY when kexec_mutex is held,
+ * or the error of the failing load step.
+ */
+SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
+ unsigned long, cmdline_len, const char __user *, cmdline_ptr,
+ unsigned long, flags)
+{
+ int ret = 0, i;
+ struct kimage **dest_image, *image;
+
+ /* We only trust the superuser with rebooting the system. */
+ if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+ return -EPERM;
+
+ /* Make sure we have a legal set of flags */
+ if (flags != (flags & KEXEC_FILE_FLAGS))
+ return -EINVAL;
+
+ image = NULL;
+
+ if (!mutex_trylock(&kexec_mutex))
+ return -EBUSY;
+
+ dest_image = &kexec_image;
+ if (flags & KEXEC_FILE_ON_CRASH)
+ dest_image = &kexec_crash_image;
+
+ /* unload: image is still NULL, so the exchange installs "no image" */
+ if (flags & KEXEC_FILE_UNLOAD)
+ goto exchange;
+
+ /*
+ * In case of crash, new kernel gets loaded in reserved region. It is
+ * same memory where old crash kernel might be loaded. Free any
+ * current crash dump kernel before we corrupt it.
+ */
+ if (flags & KEXEC_FILE_ON_CRASH)
+ kimage_free(xchg(&kexec_crash_image, NULL));
+
+ ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
+ cmdline_len, flags);
+ if (ret)
+ goto out;
+
+ ret = machine_kexec_prepare(image);
+ if (ret)
+ goto out;
+
+ ret = kexec_calculate_store_digests(image);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < image->nr_segments; i++) {
+ struct kexec_segment *ksegment;
+
+ ksegment = &image->segment[i];
+ pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
+ i, ksegment->buf, ksegment->bufsz, ksegment->mem,
+ ksegment->memsz);
+
+ ret = kimage_load_segment(image, &image->segment[i]);
+ if (ret)
+ goto out;
+ }
+
+ kimage_terminate(image);
+
+ /*
+ * Free up any temporary buffers allocated which are not needed
+ * after image has been loaded
+ */
+ kimage_file_post_load_cleanup(image);
+exchange:
+ /* install the new image; "image" now refers to the one it replaced */
+ image = xchg(dest_image, image);
+out:
+ /*
+ * On success this frees the replaced image, on failure the partially
+ * built one (or nothing, while image is still NULL).
+ */
+ mutex_unlock(&kexec_mutex);
+ kimage_free(image);
+ return ret;
+}
+
+/*
+ * Search the range [@start, @end] from the top downwards for an aligned
+ * hole of kbuf->memsz bytes that overlaps no existing destination range of
+ * the image and stays within kbuf->buf_min/kbuf->buf_max.
+ *
+ * Returns 1 and stores the address in kbuf->mem when a hole is found,
+ * 0 when this range has no suitable hole.
+ */
+static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
+				    struct kexec_buf *kbuf)
+{
+	struct kimage *image = kbuf->image;
+	unsigned long hole_start, hole_end;
+
+	hole_end = min(end, kbuf->buf_max);
+	hole_start = hole_end - kbuf->memsz;
+
+	for (;;) {
+		/* align the candidate start downwards */
+		hole_start &= ~(kbuf->buf_align - 1);
+
+		if (hole_start < start || hole_start < kbuf->buf_min)
+			return 0;
+
+		hole_end = hole_start + kbuf->memsz - 1;
+
+		/* done as soon as no existing segment is in the way */
+		if (!kimage_is_destination_range(image, hole_start, hole_end))
+			break;
+
+		/* conflict: retry one page lower */
+		hole_start -= PAGE_SIZE;
+	}
+
+	kbuf->mem = hole_start;
+
+	/* Success, stop navigating through remaining System RAM ranges */
+	return 1;
+}
+
+/*
+ * Search the range [@start, @end] from the bottom upwards for an aligned
+ * hole of kbuf->memsz bytes that overlaps no existing destination range of
+ * the image and stays within kbuf->buf_min/kbuf->buf_max.
+ *
+ * Returns 1 and stores the address in kbuf->mem when a hole is found,
+ * 0 when this range has no suitable hole.
+ */
+static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
+				     struct kexec_buf *kbuf)
+{
+	struct kimage *image = kbuf->image;
+	unsigned long hole_start, hole_end;
+
+	hole_start = max(start, kbuf->buf_min);
+
+	for (;;) {
+		hole_start = ALIGN(hole_start, kbuf->buf_align);
+		hole_end = hole_start + kbuf->memsz - 1;
+
+		if (hole_end > end || hole_end > kbuf->buf_max)
+			return 0;
+
+		/* done as soon as no existing segment is in the way */
+		if (!kimage_is_destination_range(image, hole_start, hole_end))
+			break;
+
+		/* conflict: retry one page higher */
+		hole_start += PAGE_SIZE;
+	}
+
+	kbuf->mem = hole_start;
+
+	/* Success, stop navigating through remaining System RAM ranges */
+	return 1;
+}
+
+/*
+ * Resource walker callback: try to place the buffer described by @arg
+ * (a struct kexec_buf) inside the range [@start, @end].
+ *
+ * Returns 0 to make the walker move on to the next range, 1 once a hole
+ * has been found.
+ */
+static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
+{
+	struct kexec_buf *kbuf = arg;
+	unsigned long range_sz = end - start + 1;
+
+	/* range too small, or entirely outside the requested window */
+	if (range_sz < kbuf->memsz ||
+	    end < kbuf->buf_min || start > kbuf->buf_max)
+		return 0;
+
+	/* allocate from the top of the range if asked to, else bottom up */
+	if (!kbuf->top_down)
+		return locate_mem_hole_bottom_up(start, end, kbuf);
+
+	return locate_mem_hole_top_down(start, end, kbuf);
+}
+
+/*
+ * Helper for placing a buffer in a new kexec segment.  Assumes that
+ * kexec_mutex is held.
+ *
+ * Finds a memory hole of @memsz bytes (rounded up to a page) aligned to at
+ * least a page within [@buf_min, @buf_max], searching the "Crash kernel"
+ * resource for crash images and system RAM otherwise, and records @buffer
+ * as the next segment of @image.
+ *
+ * Returns 0 and stores the chosen address in *load_addr, -EINVAL when the
+ * image cannot take another segment this way, or -EADDRNOTAVAIL when no
+ * suitable hole exists.
+ */
+int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
+		     unsigned long memsz, unsigned long buf_align,
+		     unsigned long buf_min, unsigned long buf_max,
+		     bool top_down, unsigned long *load_addr)
+{
+	struct kexec_segment *seg;
+	struct kexec_buf kbuf;
+	int found;
+
+	/* Currently adding segment this way is allowed only in file mode */
+	if (!image->file_mode)
+		return -EINVAL;
+
+	if (image->nr_segments >= KEXEC_SEGMENT_MAX)
+		return -EINVAL;
+
+	/*
+	 * All segments must be placed before any control page is allocated,
+	 * because the control page allocator walks the segment list to
+	 * avoid destination overlaps.
+	 */
+	if (!list_empty(&image->control_pages)) {
+		WARN_ON(1);
+		return -EINVAL;
+	}
+
+	memset(&kbuf, 0, sizeof(struct kexec_buf));
+	kbuf.image = image;
+	kbuf.buffer = buffer;
+	kbuf.bufsz = bufsz;
+
+	kbuf.memsz = ALIGN(memsz, PAGE_SIZE);
+	kbuf.buf_align = max(buf_align, PAGE_SIZE);
+	kbuf.buf_min = buf_min;
+	kbuf.buf_max = buf_max;
+	kbuf.top_down = top_down;
+
+	/* Walk the RAM ranges and allocate a suitable range for the buffer */
+	if (image->type == KEXEC_TYPE_CRASH)
+		found = walk_iomem_res("Crash kernel",
+				       IORESOURCE_MEM | IORESOURCE_BUSY,
+				       crashk_res.start, crashk_res.end, &kbuf,
+				       locate_mem_hole_callback);
+	else
+		found = walk_system_ram_res(0, -1, &kbuf,
+					    locate_mem_hole_callback);
+
+	/* A suitable memory range could not be found for buffer */
+	if (found != 1)
+		return -EADDRNOTAVAIL;
+
+	/* Found a suitable memory range: record it as the next segment */
+	seg = &image->segment[image->nr_segments];
+	seg->kbuf = kbuf.buffer;
+	seg->bufsz = kbuf.bufsz;
+	seg->mem = kbuf.mem;
+	seg->memsz = kbuf.memsz;
+	image->nr_segments++;
+	*load_addr = seg->mem;
+	return 0;
+}
+
+/*
+ * Calculate and store the digest of segments.
+ *
+ * Computes one SHA-256 digest over all segments of @image except the
+ * purgatory segment.  Each segment contributes its buffer followed by
+ * zeroes up to its memory size.  The list of hashed regions and the digest
+ * are then written to the purgatory symbols "sha_regions" and
+ * "sha256_digest".
+ *
+ * Returns 0 on success or a negative errno.  Every allocation failure,
+ * including that of the region table, is reported as -ENOMEM.
+ */
+static int kexec_calculate_store_digests(struct kimage *image)
+{
+	struct crypto_shash *tfm;
+	struct shash_desc *desc;
+	int ret = 0, i, j, zero_buf_sz, sha_region_sz;
+	size_t desc_size, nullsz;
+	char *digest;
+	void *zero_buf;
+	struct kexec_sha_region *sha_regions;
+	struct purgatory_info *pi = &image->purgatory_info;
+
+	/* the zero page is the source of the zero padding fed to the hash */
+	zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
+	zero_buf_sz = PAGE_SIZE;
+
+	tfm = crypto_alloc_shash("sha256", 0, 0);
+	if (IS_ERR(tfm)) {
+		ret = PTR_ERR(tfm);
+		goto out;
+	}
+
+	desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
+	desc = kzalloc(desc_size, GFP_KERNEL);
+	if (!desc) {
+		ret = -ENOMEM;
+		goto out_free_tfm;
+	}
+
+	sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
+	sha_regions = vzalloc(sha_region_sz);
+	if (!sha_regions) {
+		/* without this the failure would be returned as success */
+		ret = -ENOMEM;
+		goto out_free_desc;
+	}
+
+	desc->tfm = tfm;
+	desc->flags = 0;
+
+	ret = crypto_shash_init(desc);
+	if (ret < 0)
+		goto out_free_sha_regions;
+
+	digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
+	if (!digest) {
+		ret = -ENOMEM;
+		goto out_free_sha_regions;
+	}
+
+	for (j = i = 0; i < image->nr_segments; i++) {
+		struct kexec_segment *ksegment;
+
+		ksegment = &image->segment[i];
+		/*
+		 * Skip purgatory as it will be modified once we put digest
+		 * info in purgatory.
+		 */
+		if (ksegment->kbuf == pi->purgatory_buf)
+			continue;
+
+		ret = crypto_shash_update(desc, ksegment->kbuf,
+					  ksegment->bufsz);
+		if (ret)
+			break;
+
+		/*
+		 * Assume rest of the buffer is filled with zero and
+		 * update digest accordingly.
+		 */
+		nullsz = ksegment->memsz - ksegment->bufsz;
+		while (nullsz) {
+			unsigned long bytes = nullsz;
+
+			if (bytes > zero_buf_sz)
+				bytes = zero_buf_sz;
+			ret = crypto_shash_update(desc, zero_buf, bytes);
+			if (ret)
+				break;
+			nullsz -= bytes;
+		}
+
+		if (ret)
+			break;
+
+		sha_regions[j].start = ksegment->mem;
+		sha_regions[j].len = ksegment->memsz;
+		j++;
+	}
+
+	if (!ret) {
+		ret = crypto_shash_final(desc, digest);
+		if (ret)
+			goto out_free_digest;
+		ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
+						sha_regions, sha_region_sz, 0);
+		if (ret)
+			goto out_free_digest;
+
+		ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
+						digest, SHA256_DIGEST_SIZE, 0);
+		if (ret)
+			goto out_free_digest;
+	}
+
+out_free_digest:
+	kfree(digest);
+out_free_sha_regions:
+	vfree(sha_regions);
+out_free_desc:
+	kfree(desc);
+out_free_tfm:
+	/* release through the crypto API rather than a bare kfree() */
+	crypto_free_shash(tfm);
+out:
+	return ret;
+}
+
+/*
+ * Actually load purgatory. Lot of code taken from kexec-tools
+ *
+ * Copies the section header table of the built-in purgatory object, lays
+ * out its SHF_ALLOC sections in a new buffer, adds that buffer as a
+ * segment via kexec_add_buffer() (@min, @max and @top_down are passed
+ * through) and sets image->start to the relocated entry point.
+ *
+ * Returns 0 on success; pi->sechdrs and pi->purgatory_buf then hold the
+ * two allocations made here. On failure both are freed locally and a
+ * negative errno is returned.
+ */
+static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
+ unsigned long max, int top_down)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+ unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
+ unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
+ unsigned char *buf_addr, *src;
+ int i, ret = 0, entry_sidx = -1;
+ const Elf_Shdr *sechdrs_c;
+ Elf_Shdr *sechdrs = NULL;
+ void *purgatory_buf = NULL;
+
+ /*
+ * sechdrs_c points to section headers in purgatory and are read
+ * only. No modifications allowed.
+ */
+ sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
+
+ /*
+ * We can not modify sechdrs_c[] and its fields. It is read only.
+ * Copy it over to a local copy where one can store some temporary
+ * data and free it at the end. We need to modify ->sh_addr and
+ * ->sh_offset fields to keep track of permanent and temporary
+ * locations of sections.
+ */
+ sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+ if (!sechdrs)
+ return -ENOMEM;
+
+ memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+
+ /*
+ * We seem to have multiple copies of sections. First copy is which
+ * is embedded in kernel in read only section. Some of these sections
+ * will be copied to a temporary buffer and relocated. And these
+ * sections will finally be copied to their final destination at
+ * segment load time.
+ *
+ * Use ->sh_offset to reflect section address in memory. It will
+ * point to original read only copy if section is not allocatable.
+ * Otherwise it will point to temporary copy which will be relocated.
+ *
+ * Use ->sh_addr to contain final address of the section where it
+ * will go during execution time.
+ */
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ if (sechdrs[i].sh_type == SHT_NOBITS)
+ continue;
+
+ sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
+ sechdrs[i].sh_offset;
+ }
+
+ /*
+ * Identify entry point section and make entry relative to section
+ * start.
+ */
+ entry = pi->ehdr->e_entry;
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ continue;
+
+ if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
+ continue;
+
+ /* Make entry section relative */
+ if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
+ ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
+ pi->ehdr->e_entry)) {
+ entry_sidx = i;
+ entry -= sechdrs[i].sh_addr;
+ break;
+ }
+ }
+
+ /* Determine how much memory is needed to load relocatable object. */
+ buf_align = 1;
+ bss_align = 1;
+ buf_sz = 0;
+ bss_sz = 0;
+
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ continue;
+
+ align = sechdrs[i].sh_addralign;
+ if (sechdrs[i].sh_type != SHT_NOBITS) {
+ if (buf_align < align)
+ buf_align = align;
+ buf_sz = ALIGN(buf_sz, align);
+ buf_sz += sechdrs[i].sh_size;
+ } else {
+ /* bss section */
+ if (bss_align < align)
+ bss_align = align;
+ bss_sz = ALIGN(bss_sz, align);
+ bss_sz += sechdrs[i].sh_size;
+ }
+ }
+
+ /* Determine the bss padding required to align bss properly */
+ bss_pad = 0;
+ if (buf_sz & (bss_align - 1))
+ bss_pad = bss_align - (buf_sz & (bss_align - 1));
+
+ memsz = buf_sz + bss_pad + bss_sz;
+
+ /*
+ * Allocate buffer for purgatory. Only the buf_sz bytes of file backed
+ * sections are allocated; the bss part is covered by memsz only.
+ */
+ purgatory_buf = vzalloc(buf_sz);
+ if (!purgatory_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (buf_align < bss_align)
+ buf_align = bss_align;
+
+ /* Add buffer to segment list */
+ ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
+ buf_align, min, max, top_down,
+ &pi->purgatory_load_addr);
+ if (ret)
+ goto out;
+
+ /* Load SHF_ALLOC sections */
+ buf_addr = purgatory_buf;
+ load_addr = curr_load_addr = pi->purgatory_load_addr;
+ bss_addr = load_addr + buf_sz + bss_pad;
+
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ continue;
+
+ align = sechdrs[i].sh_addralign;
+ if (sechdrs[i].sh_type != SHT_NOBITS) {
+ curr_load_addr = ALIGN(curr_load_addr, align);
+ offset = curr_load_addr - load_addr;
+ /* We already modified ->sh_offset to keep src addr */
+ src = (char *) sechdrs[i].sh_offset;
+ memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
+
+ /* Store load address and source address of section */
+ sechdrs[i].sh_addr = curr_load_addr;
+
+ /*
+ * This section got copied to temporary buffer. Update
+ * ->sh_offset accordingly.
+ */
+ sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
+
+ /* Advance to the next address */
+ curr_load_addr += sechdrs[i].sh_size;
+ } else {
+ /* bss sections only get a final address, no buffer space */
+ bss_addr = ALIGN(bss_addr, align);
+ sechdrs[i].sh_addr = bss_addr;
+ bss_addr += sechdrs[i].sh_size;
+ }
+ }
+
+ /* Update entry point based on load address of text section */
+ if (entry_sidx >= 0)
+ entry += sechdrs[entry_sidx].sh_addr;
+
+ /* Make kernel jump to purgatory after shutdown */
+ image->start = entry;
+
+ /* Used later to get/set symbol values */
+ pi->sechdrs = sechdrs;
+
+ /*
+ * Used later to identify which section is purgatory and skip it
+ * from checksumming.
+ */
+ pi->purgatory_buf = purgatory_buf;
+ return ret;
+out:
+ vfree(sechdrs);
+ vfree(purgatory_buf);
+ return ret;
+}
+
+/*
+ * Apply every SHT_RELA/SHT_REL section of the purgatory that targets an
+ * allocatable section, using the architecture relocation hooks.
+ *
+ * Returns 0 on success, -ENOEXEC when a relocation section refers to a
+ * section index that is out of range, or the error of the architecture
+ * hook.
+ */
+static int kexec_apply_relocations(struct kimage *image)
+{
+	struct purgatory_info *pi = &image->purgatory_info;
+	Elf_Shdr *sechdrs = pi->sechdrs;
+	int idx;
+
+	for (idx = 0; idx < pi->ehdr->e_shnum; idx++) {
+		Elf_Shdr *relsec = &sechdrs[idx];
+		Elf_Shdr *target, *symtab;
+		int err;
+
+		if (relsec->sh_type != SHT_RELA &&
+		    relsec->sh_type != SHT_REL)
+			continue;
+
+		/*
+		 * For relocation sections ->sh_link is the section header
+		 * index of the associated symbol table and ->sh_info the
+		 * index of the section the relocations apply to.
+		 */
+		if (relsec->sh_info >= pi->ehdr->e_shnum ||
+		    relsec->sh_link >= pi->ehdr->e_shnum)
+			return -ENOEXEC;
+
+		target = &sechdrs[relsec->sh_info];
+		symtab = &sechdrs[relsec->sh_link];
+
+		/* sections that are not loaded need no relocation */
+		if (!(target->sh_flags & SHF_ALLOC))
+			continue;
+
+		/*
+		 * symtab->sh_link is the section header index of the
+		 * associated string table; skip the section when it is
+		 * out of range.
+		 */
+		if (symtab->sh_link >= pi->ehdr->e_shnum)
+			continue;
+
+		/* The architecture must provide the actual relocation code. */
+		if (relsec->sh_type == SHT_RELA)
+			err = arch_kexec_apply_relocations_add(pi->ehdr,
+							       sechdrs, idx);
+		else
+			err = arch_kexec_apply_relocations(pi->ehdr,
+							   sechdrs, idx);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Load relocatable purgatory object and relocate it appropriately.
+ *
+ * @image:     image the purgatory segment is added to
+ * @min, @max: address window for the purgatory segment
+ * @top_down:  non-zero to search the window from the top
+ * @load_addr: on success receives the address the purgatory is loaded at
+ *
+ * Returns 0 on success, -EINVAL when no purgatory is built in, -ENOEXEC
+ * when the built-in object is not a usable relocatable ELF file, or the
+ * error of the load/relocation step.
+ *
+ * When relocation fails the purgatory buffers are freed here and the
+ * pointers in image->purgatory_info are cleared.  The caller's error path
+ * ends up in kimage_file_post_load_cleanup(), which vfree()s the same two
+ * pointers; leaving them set would free the memory twice.
+ */
+int kexec_load_purgatory(struct kimage *image, unsigned long min,
+			 unsigned long max, int top_down,
+			 unsigned long *load_addr)
+{
+	struct purgatory_info *pi = &image->purgatory_info;
+	int ret;
+
+	if (kexec_purgatory_size <= 0)
+		return -EINVAL;
+
+	if (kexec_purgatory_size < sizeof(Elf_Ehdr))
+		return -ENOEXEC;
+
+	pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
+
+	if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
+	    || pi->ehdr->e_type != ET_REL
+	    || !elf_check_arch(pi->ehdr)
+	    || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
+		return -ENOEXEC;
+
+	/* the section header table must lie entirely inside the blob */
+	if (pi->ehdr->e_shoff >= kexec_purgatory_size
+	    || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
+	    kexec_purgatory_size - pi->ehdr->e_shoff))
+		return -ENOEXEC;
+
+	ret = __kexec_load_purgatory(image, min, max, top_down);
+	if (ret)
+		return ret;
+
+	ret = kexec_apply_relocations(image);
+	if (ret)
+		goto out;
+
+	*load_addr = pi->purgatory_load_addr;
+	return 0;
+out:
+	vfree(pi->sechdrs);
+	pi->sechdrs = NULL;
+
+	vfree(pi->purgatory_buf);
+	pi->purgatory_buf = NULL;
+	return ret;
+}
+
+/*
+ * Look up the global symbol @name in the symbol tables of the loaded
+ * purgatory.
+ *
+ * Returns the symbol, or NULL when the purgatory is not loaded, the symbol
+ * does not exist, or the first match is undefined or has a section index
+ * that is out of range.
+ */
+static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
+					    const char *name)
+{
+	Elf_Shdr *sechdrs = pi->sechdrs;
+	Elf_Ehdr *ehdr = pi->ehdr;
+	int sec, n;
+
+	if (!sechdrs || !ehdr)
+		return NULL;
+
+	for (sec = 0; sec < ehdr->e_shnum; sec++) {
+		const char *strtab;
+		Elf_Sym *syms;
+
+		if (sechdrs[sec].sh_type != SHT_SYMTAB)
+			continue;
+
+		/* skip symbol tables with an invalid strtab section number */
+		if (sechdrs[sec].sh_link >= ehdr->e_shnum)
+			continue;
+
+		strtab = (char *)sechdrs[sechdrs[sec].sh_link].sh_offset;
+		syms = (Elf_Sym *)sechdrs[sec].sh_offset;
+
+		/* Go through symbols for a match */
+		for (n = 0; n < sechdrs[sec].sh_size/sizeof(Elf_Sym); n++) {
+			Elf_Sym *sym = &syms[n];
+
+			if (ELF_ST_BIND(sym->st_info) != STB_GLOBAL ||
+			    strcmp(strtab + sym->st_name, name) != 0)
+				continue;
+
+			if (sym->st_shndx == SHN_UNDEF ||
+			    sym->st_shndx >= ehdr->e_shnum) {
+				pr_debug("Symbol: %s has bad section index %d.\n",
+					 name, sym->st_shndx);
+				return NULL;
+			}
+
+			/* Found the symbol we are looking for */
+			return sym;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Return the address at which the purgatory symbol @name will live once
+ * the segments have been loaded, i.e. the final address of its section
+ * plus the symbol value.  Returns ERR_PTR(-EINVAL) for an unknown symbol.
+ */
+void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
+{
+	struct purgatory_info *pi = &image->purgatory_info;
+	Elf_Sym *sym = kexec_purgatory_find_symbol(pi, name);
+
+	if (!sym)
+		return ERR_PTR(-EINVAL);
+
+	/* ->sh_addr holds the final load address of the symbol's section */
+	return (void *)(pi->sechdrs[sym->st_shndx].sh_addr + sym->st_value);
+}
+
+/*
+ * Read or write the value of the purgatory symbol @name in the temporary
+ * purgatory copy.  With @get_value true, @size bytes are copied from the
+ * symbol into @buf; otherwise @size bytes are copied from @buf into the
+ * symbol.
+ *
+ * Returns 0 on success, -EINVAL when the symbol is unknown, its size
+ * differs from @size, or it lives in a bss (SHT_NOBITS) section.
+ */
+int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
+				   void *buf, unsigned int size, bool get_value)
+{
+	struct purgatory_info *pi = &image->purgatory_info;
+	Elf_Shdr *sec;
+	Elf_Sym *sym;
+	char *sym_buf;
+
+	sym = kexec_purgatory_find_symbol(pi, name);
+	if (!sym)
+		return -EINVAL;
+
+	if (sym->st_size != size) {
+		pr_err("symbol %s size mismatch: expected %lu actual %u\n",
+		       name, (unsigned long)sym->st_size, size);
+		return -EINVAL;
+	}
+
+	sec = &pi->sechdrs[sym->st_shndx];
+
+	/* bss sections have no backing bytes in the temporary copy */
+	if (sec->sh_type == SHT_NOBITS) {
+		pr_err("symbol %s is in a bss section. Cannot %s\n", name,
+		       get_value ? "get" : "set");
+		return -EINVAL;
+	}
+
+	/* ->sh_offset holds the address of the section's temporary copy */
+	sym_buf = (unsigned char *)sec->sh_offset + sym->st_value;
+
+	if (!get_value)
+		memcpy((void *)sym_buf, buf, size);
+	else
+		memcpy((void *)buf, sym_buf, size);
+
+	return 0;
+}
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
new file mode 100644
index 000000000000..e4392a698ad4
--- /dev/null
+++ b/kernel/kexec_internal.h
@@ -0,0 +1,22 @@
+#ifndef LINUX_KEXEC_INTERNAL_H
+#define LINUX_KEXEC_INTERNAL_H
+
+#include <linux/kexec.h>
+
+struct kimage *do_kimage_alloc_init(void);
+int sanity_check_segment_list(struct kimage *image);
+void kimage_free_page_list(struct list_head *list);
+void kimage_free(struct kimage *image);
+int kimage_load_segment(struct kimage *image, struct kexec_segment *segment);
+void kimage_terminate(struct kimage *image);
+int kimage_is_destination_range(struct kimage *image,
+ unsigned long start, unsigned long end);
+
+extern struct mutex kexec_mutex;
+
+#ifdef CONFIG_KEXEC_FILE
+void kimage_file_post_load_cleanup(struct kimage *image);
+#else /* CONFIG_KEXEC_FILE */
+static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
+#endif /* CONFIG_KEXEC_FILE */
+#endif /* LINUX_KEXEC_INTERNAL_H */
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 6683ccef9fff..e83b26464061 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -90,7 +90,7 @@ static ssize_t profiling_store(struct kobject *kobj,
KERNEL_ATTR_RW(profiling);
#endif
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
static ssize_t kexec_loaded_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -134,7 +134,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
}
KERNEL_ATTR_RO(vmcoreinfo);
-#endif /* CONFIG_KEXEC */
+#endif /* CONFIG_KEXEC_CORE */
/* whether file capabilities are enabled */
static ssize_t fscaps_show(struct kobject *kobj,
@@ -196,7 +196,7 @@ static struct attribute * kernel_attrs[] = {
#ifdef CONFIG_PROFILING
&profiling_attr.attr,
#endif
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
&kexec_loaded_attr.attr,
&kexec_crash_loaded_attr.attr,
&kexec_crash_size_attr.attr,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 10e489c448fe..d0435e49aca0 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -246,15 +246,16 @@ static void create_kthread(struct kthread_create_info *create)
* kthread_create_on_node - create a kthread.
* @threadfn: the function to run until signal_pending(current).
* @data: data ptr for @threadfn.
- * @node: memory node number.
+ * @node: task and thread structures for the thread are allocated on this node
* @namefmt: printf-style name for the thread.
*
* Description: This helper function creates and names a kernel
* thread. The thread will be stopped: use wake_up_process() to start
- * it. See also kthread_run().
+ * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and
+ * is affine to all CPUs.
*
* If thread is going to be bound on a particular cpu, give its node
- * in @node, to get NUMA affinity for kthread stack, or else give -1.
+ * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE.
* When woken, the thread will run @threadfn() with @data as its
* argument. @threadfn() can either call do_exit() directly if it is a
* standalone thread for which no one will call kthread_stop(), or
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index cf8c24203368..8f0324ef72ab 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -835,7 +835,7 @@ const struct file_operations kmsg_fops = {
.release = devkmsg_release,
};
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
/*
* This appends the listed symbols to /proc/vmcore
*
diff --git a/kernel/reboot.c b/kernel/reboot.c
index d20c85d9f8c0..bd30a973fe94 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -346,7 +346,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
kernel_restart(buffer);
break;
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
case LINUX_REBOOT_CMD_KEXEC:
ret = kernel_kexec();
break;
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 052e02672d12..272d9322bc5d 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -106,9 +106,10 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
}
EXPORT_SYMBOL_GPL(__wake_up_locked);
-void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr,
+ void *key)
{
- __wake_up_common(q, mode, 1, 0, key);
+ __wake_up_common(q, mode, nr, 0, key);
}
EXPORT_SYMBOL_GPL(__wake_up_locked_key);
@@ -283,7 +284,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
if (!list_empty(&wait->task_list))
list_del_init(&wait->task_list);
else if (waitqueue_active(q))
- __wake_up_locked_key(q, mode, key);
+ __wake_up_locked_key(q, mode, 1, key);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(abort_exclusive_wait);
diff --git a/kernel/signal.c b/kernel/signal.c
index 836df8dac6cc..0f6bbbe77b46 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2748,12 +2748,15 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
* Other callers might not initialize the si_lsb field,
* so check explicitly for the right codes here.
*/
- if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)
+ if (from->si_signo == SIGBUS &&
+ (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO))
err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
#endif
#ifdef SEGV_BNDERR
- err |= __put_user(from->si_lower, &to->si_lower);
- err |= __put_user(from->si_upper, &to->si_upper);
+ if (from->si_signo == SIGSEGV && from->si_code == SEGV_BNDERR) {
+ err |= __put_user(from->si_lower, &to->si_lower);
+ err |= __put_user(from->si_upper, &to->si_upper);
+ }
#endif
break;
case __SI_CHLD:
@@ -3017,7 +3020,7 @@ COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo,
int, sig,
struct compat_siginfo __user *, uinfo)
{
- siginfo_t info;
+ siginfo_t info = {};
int ret = copy_siginfo_from_user32(&info, uinfo);
if (unlikely(ret))
return ret;
@@ -3061,7 +3064,7 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo,
int, sig,
struct compat_siginfo __user *, uinfo)
{
- siginfo_t info;
+ siginfo_t info = {};
if (copy_siginfo_from_user32(&info, uinfo))
return -EFAULT;
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 7c434c39f02a..a818cbc73e14 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -113,7 +113,8 @@ static int smpboot_thread_fn(void *data)
if (kthread_should_stop()) {
__set_current_state(TASK_RUNNING);
preempt_enable();
- if (ht->cleanup)
+ /* cleanup must mirror setup */
+ if (ht->cleanup && td->status != HP_THREAD_NONE)
ht->cleanup(td->cpu, cpu_online(td->cpu));
kfree(td);
return 0;
@@ -259,15 +260,6 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
{
unsigned int cpu;
- /* Unpark any threads that were voluntarily parked. */
- for_each_cpu_not(cpu, ht->cpumask) {
- if (cpu_online(cpu)) {
- struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
- if (tsk)
- kthread_unpark(tsk);
- }
- }
-
/* We need to destroy also the parked threads of offline cpus */
for_each_possible_cpu(cpu) {
struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
@@ -281,19 +273,22 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
}
/**
- * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug
+ * smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related
+ * to hotplug
* @plug_thread: Hotplug thread descriptor
+ * @cpumask: The cpumask where threads run
*
* Creates and starts the threads on all online cpus.
*/
-int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
+int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
+ const struct cpumask *cpumask)
{
unsigned int cpu;
int ret = 0;
if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL))
return -ENOMEM;
- cpumask_copy(plug_thread->cpumask, cpu_possible_mask);
+ cpumask_copy(plug_thread->cpumask, cpumask);
get_online_cpus();
mutex_lock(&smpboot_threads_lock);
@@ -301,9 +296,11 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
ret = __smpboot_create_thread(plug_thread, cpu);
if (ret) {
smpboot_destroy_threads(plug_thread);
+ free_cpumask_var(plug_thread->cpumask);
goto out;
}
- smpboot_unpark_thread(plug_thread, cpu);
+ if (cpumask_test_cpu(cpu, cpumask))
+ smpboot_unpark_thread(plug_thread, cpu);
}
list_add(&plug_thread->list, &hotplug_threads);
out:
@@ -311,7 +308,7 @@ out:
put_online_cpus();
return ret;
}
-EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
+EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask);
/**
* smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 7995ef5868d8..4a4ba2066433 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -193,6 +193,9 @@ cond_syscall(sys_mlock);
cond_syscall(sys_munlock);
cond_syscall(sys_mlockall);
cond_syscall(sys_munlockall);
+cond_syscall(sys_mlock2);
+cond_syscall(sys_munlock2);
+cond_syscall(sys_munlockall2);
cond_syscall(sys_mincore);
cond_syscall(sys_madvise);
cond_syscall(sys_mremap);
@@ -218,6 +221,7 @@ cond_syscall(compat_sys_timerfd_gettime);
cond_syscall(sys_eventfd);
cond_syscall(sys_eventfd2);
cond_syscall(sys_memfd_create);
+cond_syscall(sys_userfaultfd);
/* performance counters: */
cond_syscall(sys_perf_event_open);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 19b62b522158..e69201d8094e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -621,7 +621,7 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
{
.procname = "kexec_load_disabled",
.data = &kexec_load_disabled,
@@ -1995,7 +1995,7 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
int val = *valp;
if (val < 0) {
*negp = true;
- *lvalp = (unsigned long)-val;
+ *lvalp = -(unsigned long)val;
} else {
*negp = false;
*lvalp = (unsigned long)val;
@@ -2201,7 +2201,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
int val = *valp;
if (val < 0) {
*negp = true;
- *lvalp = (unsigned long)-val;
+ *lvalp = -(unsigned long)val;
} else {
*negp = false;
*lvalp = (unsigned long)val;
@@ -2436,7 +2436,7 @@ static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
unsigned long lval;
if (val < 0) {
*negp = true;
- lval = (unsigned long)-val;
+ lval = -(unsigned long)val;
} else {
*negp = false;
lval = (unsigned long)val;
@@ -2459,7 +2459,7 @@ static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp
unsigned long lval;
if (val < 0) {
*negp = true;
- lval = (unsigned long)-val;
+ lval = -(unsigned long)val;
} else {
*negp = false;
lval = (unsigned long)val;
@@ -2484,7 +2484,7 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
unsigned long lval;
if (val < 0) {
*negp = true;
- lval = (unsigned long)-val;
+ lval = -(unsigned long)val;
} else {
*negp = false;
lval = (unsigned long)val;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 4109f8320684..dab0f808235a 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -39,6 +39,7 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
cred->cap_inheritable = CAP_EMPTY_SET;
cred->cap_permitted = CAP_FULL_SET;
cred->cap_effective = CAP_FULL_SET;
+ cred->cap_ambient = CAP_EMPTY_SET;
cred->cap_bset = CAP_FULL_SET;
#ifdef CONFIG_KEYS
key_put(cred->request_key_auth);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index a6ffa43f2993..d18330fa4776 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -713,15 +713,12 @@ static int watchdog_enable_all_cpus(void)
int err = 0;
if (!watchdog_running) {
- err = smpboot_register_percpu_thread(&watchdog_threads);
+ err = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
+ &watchdog_cpumask);
if (err)
pr_err("Failed to create watchdog threads, disabled\n");
- else {
- if (smpboot_update_cpumask_percpu_thread(
- &watchdog_threads, &watchdog_cpumask))
- pr_err("Failed to set cpumask for watchdog threads\n");
+ else
watchdog_running = 1;
- }
} else {
/*
* Enable/disable the lockup detectors or
@@ -932,10 +929,8 @@ void __init lockup_detector_init(void)
#ifdef CONFIG_NO_HZ_FULL
if (tick_nohz_full_enabled()) {
- if (!cpumask_empty(tick_nohz_full_mask))
- pr_info("Disabling watchdog on nohz_full cores by default\n");
- cpumask_andnot(&watchdog_cpumask, cpu_possible_mask,
- tick_nohz_full_mask);
+ pr_info("Disabling watchdog on nohz_full cores by default\n");
+ cpumask_copy(&watchdog_cpumask, housekeeping_mask);
} else
cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
#else
diff --git a/lib/Kconfig b/lib/Kconfig
index 3a2ef67db6c7..a4766fee0017 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -188,6 +188,13 @@ config CRC8
when they need to do cyclic redundancy check according CRC8
algorithm. Module will be called crc8.
+config CRC64_ECMA
+ tristate "CRC64 ECMA function"
+ help
+ This option provides CRC64 ECMA function. Drivers may select this
+ when they need to do cyclic redundancy check according to the CRC64
+ ECMA algorithm.
+
config AUDIT_GENERIC
bool
depends on AUDIT && !AUDIT_ARCH
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 715e22fd0dd8..a24275f039cf 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1682,6 +1682,9 @@ config TEST_HEXDUMP
config TEST_STRING_HELPERS
tristate "Test functions located in the string_helpers module at runtime"
+config TEST_PARSE_INTEGER
+ tristate "Test parse_integer() function at runtime"
+
config TEST_KSTRTOX
tristate "Test kstrto*() family of functions at runtime"
diff --git a/lib/Makefile b/lib/Makefile
index 6897b527581a..1356b0e974bc 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -32,9 +32,11 @@ obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
obj-y += hexdump.o
obj-$(CONFIG_TEST_HEXDUMP) += test-hexdump.o
obj-y += kstrtox.o
+obj-y += parse-integer.o
obj-$(CONFIG_TEST_BPF) += test_bpf.o
obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o
obj-$(CONFIG_TEST_KASAN) += test_kasan.o
+obj-$(CONFIG_TEST_PARSE_INTEGER) += test-parse-integer.o
obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o
obj-$(CONFIG_TEST_LKM) += test_module.o
obj-$(CONFIG_TEST_RHASHTABLE) += test_rhashtable.o
@@ -79,6 +81,7 @@ obj-$(CONFIG_CRC32) += crc32.o
obj-$(CONFIG_CRC7) += crc7.o
obj-$(CONFIG_LIBCRC32C) += libcrc32c.o
obj-$(CONFIG_CRC8) += crc8.o
+obj-$(CONFIG_CRC64_ECMA) += crc64_ecma.o
obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
obj-$(CONFIG_842_COMPRESS) += 842/
diff --git a/lib/bitmap.c b/lib/bitmap.c
index a578a0189199..814814397cce 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -367,7 +367,8 @@ int __bitmap_parse(const char *buf, unsigned int buflen,
nchunks = nbits = totaldigits = c = 0;
do {
- chunk = ndigits = 0;
+ chunk = 0;
+ ndigits = totaldigits;
/* Get the next chunk of the bitmap */
while (buflen) {
@@ -406,9 +407,9 @@ int __bitmap_parse(const char *buf, unsigned int buflen,
return -EOVERFLOW;
chunk = (chunk << 4) | hex_to_bin(c);
- ndigits++; totaldigits++;
+ totaldigits++;
}
- if (ndigits == 0)
+ if (ndigits == totaldigits)
return -EINVAL;
if (nchunks == 0 && chunk == 0)
continue;
@@ -505,7 +506,7 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
int nmaskbits)
{
unsigned a, b;
- int c, old_c, totaldigits;
+ int c, old_c, totaldigits, ndigits;
const char __user __force *ubuf = (const char __user __force *)buf;
int at_start, in_range;
@@ -515,6 +516,7 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
at_start = 1;
in_range = 0;
a = b = 0;
+ ndigits = totaldigits;
/* Get the next cpu# or a range of cpu#'s */
while (buflen) {
@@ -528,23 +530,27 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
if (isspace(c))
continue;
- /*
- * If the last character was a space and the current
- * character isn't '\0', we've got embedded whitespace.
- * This is a no-no, so throw an error.
- */
- if (totaldigits && c && isspace(old_c))
- return -EINVAL;
-
/* A '\0' or a ',' signal the end of a cpu# or range */
if (c == '\0' || c == ',')
break;
+ /*
+ * Whitespace between digits is not allowed,
+ * but leading or trailing whitespace is fine.
+ * When old_c is whitespace:
+ * if totaldigits == ndigits, the whitespace is leading;
+ * trailing whitespace never reaches this check,
+ * because c is then ',' or '\0' and
+ * the break above has already left the loop.
+ */
+ if ((totaldigits != ndigits) && isspace(old_c))
+ return -EINVAL;
if (c == '-') {
if (at_start || in_range)
return -EINVAL;
b = 0;
in_range = 1;
+ at_start = 1;
continue;
}
@@ -557,15 +563,18 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
at_start = 0;
totaldigits++;
}
+ if (ndigits == totaldigits)
+ continue;
+ /* a '-' must be followed by at least one digit */
+ if (at_start && in_range)
+ return -EINVAL;
if (!(a <= b))
return -EINVAL;
if (b >= nmaskbits)
return -ERANGE;
- if (!at_start) {
- while (a <= b) {
- set_bit(a, maskp);
- a++;
- }
+ while (a <= b) {
+ set_bit(a, maskp);
+ a++;
}
} while (buflen && c == ',');
return 0;
diff --git a/lib/cmdline.c b/lib/cmdline.c
index 8f13cf73c2ec..c248c5894557 100644
--- a/lib/cmdline.c
+++ b/lib/cmdline.c
@@ -27,7 +27,7 @@ static int get_range(char **str, int *pint)
int x, inc_counter, upper_range;
(*str)++;
- upper_range = simple_strtol((*str), NULL, 0);
+ parse_integer(*str, 0, &upper_range);
inc_counter = upper_range - *pint;
for (x = *pint; x < upper_range; x++)
*pint++ = x;
@@ -51,13 +51,14 @@ static int get_range(char **str, int *pint)
int get_option(char **str, int *pint)
{
- char *cur = *str;
+ int len;
- if (!cur || !(*cur))
+ if (!str || !*str)
return 0;
- *pint = simple_strtol(cur, str, 0);
- if (cur == *str)
+ len = parse_integer(*str, 0, pint);
+ if (len < 0)
return 0;
+ *str += len;
if (**str == ',') {
(*str)++;
return 2;
@@ -126,38 +127,41 @@ EXPORT_SYMBOL(get_options);
unsigned long long memparse(const char *ptr, char **retptr)
{
- char *endptr; /* local pointer to end of parsed string */
-
- unsigned long long ret = simple_strtoull(ptr, &endptr, 0);
-
- switch (*endptr) {
+ unsigned long long val = 0;
+ int len;
+
+ len = parse_integer(ptr, 0, &val);
+ if (len < 0)
+ goto out;
+ ptr += len;
+ switch (*ptr) {
case 'E':
case 'e':
- ret <<= 10;
+ val <<= 10;
case 'P':
case 'p':
- ret <<= 10;
+ val <<= 10;
case 'T':
case 't':
- ret <<= 10;
+ val <<= 10;
case 'G':
case 'g':
- ret <<= 10;
+ val <<= 10;
case 'M':
case 'm':
- ret <<= 10;
+ val <<= 10;
case 'K':
case 'k':
- ret <<= 10;
- endptr++;
+ val <<= 10;
+ ptr++;
default:
break;
}
-
+out:
if (retptr)
- *retptr = endptr;
+ *retptr = (char *)ptr;
- return ret;
+ return val;
}
EXPORT_SYMBOL(memparse);
diff --git a/lib/crc64_ecma.c b/lib/crc64_ecma.c
new file mode 100644
index 000000000000..41629ea5a60c
--- /dev/null
+++ b/lib/crc64_ecma.c
@@ -0,0 +1,341 @@
+/*
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <linux/crc64_ecma.h>
+
+
+#define CRC64_BYTE_MASK 0xFF
+#define CRC64_TABLE_SIZE 256
+
+
+struct crc64_table {
+ u64 seed;
+ u64 table[CRC64_TABLE_SIZE];
+};
+
+
+static struct crc64_table CRC64_ECMA_182 = {
+ CRC64_DEFAULT_INITVAL,
+ {
+ 0x0000000000000000ULL,
+ 0xb32e4cbe03a75f6fULL,
+ 0xf4843657a840a05bULL,
+ 0x47aa7ae9abe7ff34ULL,
+ 0x7bd0c384ff8f5e33ULL,
+ 0xc8fe8f3afc28015cULL,
+ 0x8f54f5d357cffe68ULL,
+ 0x3c7ab96d5468a107ULL,
+ 0xf7a18709ff1ebc66ULL,
+ 0x448fcbb7fcb9e309ULL,
+ 0x0325b15e575e1c3dULL,
+ 0xb00bfde054f94352ULL,
+ 0x8c71448d0091e255ULL,
+ 0x3f5f08330336bd3aULL,
+ 0x78f572daa8d1420eULL,
+ 0xcbdb3e64ab761d61ULL,
+ 0x7d9ba13851336649ULL,
+ 0xceb5ed8652943926ULL,
+ 0x891f976ff973c612ULL,
+ 0x3a31dbd1fad4997dULL,
+ 0x064b62bcaebc387aULL,
+ 0xb5652e02ad1b6715ULL,
+ 0xf2cf54eb06fc9821ULL,
+ 0x41e11855055bc74eULL,
+ 0x8a3a2631ae2dda2fULL,
+ 0x39146a8fad8a8540ULL,
+ 0x7ebe1066066d7a74ULL,
+ 0xcd905cd805ca251bULL,
+ 0xf1eae5b551a2841cULL,
+ 0x42c4a90b5205db73ULL,
+ 0x056ed3e2f9e22447ULL,
+ 0xb6409f5cfa457b28ULL,
+ 0xfb374270a266cc92ULL,
+ 0x48190ecea1c193fdULL,
+ 0x0fb374270a266cc9ULL,
+ 0xbc9d3899098133a6ULL,
+ 0x80e781f45de992a1ULL,
+ 0x33c9cd4a5e4ecdceULL,
+ 0x7463b7a3f5a932faULL,
+ 0xc74dfb1df60e6d95ULL,
+ 0x0c96c5795d7870f4ULL,
+ 0xbfb889c75edf2f9bULL,
+ 0xf812f32ef538d0afULL,
+ 0x4b3cbf90f69f8fc0ULL,
+ 0x774606fda2f72ec7ULL,
+ 0xc4684a43a15071a8ULL,
+ 0x83c230aa0ab78e9cULL,
+ 0x30ec7c140910d1f3ULL,
+ 0x86ace348f355aadbULL,
+ 0x3582aff6f0f2f5b4ULL,
+ 0x7228d51f5b150a80ULL,
+ 0xc10699a158b255efULL,
+ 0xfd7c20cc0cdaf4e8ULL,
+ 0x4e526c720f7dab87ULL,
+ 0x09f8169ba49a54b3ULL,
+ 0xbad65a25a73d0bdcULL,
+ 0x710d64410c4b16bdULL,
+ 0xc22328ff0fec49d2ULL,
+ 0x85895216a40bb6e6ULL,
+ 0x36a71ea8a7ace989ULL,
+ 0x0adda7c5f3c4488eULL,
+ 0xb9f3eb7bf06317e1ULL,
+ 0xfe5991925b84e8d5ULL,
+ 0x4d77dd2c5823b7baULL,
+ 0x64b62bcaebc387a1ULL,
+ 0xd7986774e864d8ceULL,
+ 0x90321d9d438327faULL,
+ 0x231c512340247895ULL,
+ 0x1f66e84e144cd992ULL,
+ 0xac48a4f017eb86fdULL,
+ 0xebe2de19bc0c79c9ULL,
+ 0x58cc92a7bfab26a6ULL,
+ 0x9317acc314dd3bc7ULL,
+ 0x2039e07d177a64a8ULL,
+ 0x67939a94bc9d9b9cULL,
+ 0xd4bdd62abf3ac4f3ULL,
+ 0xe8c76f47eb5265f4ULL,
+ 0x5be923f9e8f53a9bULL,
+ 0x1c4359104312c5afULL,
+ 0xaf6d15ae40b59ac0ULL,
+ 0x192d8af2baf0e1e8ULL,
+ 0xaa03c64cb957be87ULL,
+ 0xeda9bca512b041b3ULL,
+ 0x5e87f01b11171edcULL,
+ 0x62fd4976457fbfdbULL,
+ 0xd1d305c846d8e0b4ULL,
+ 0x96797f21ed3f1f80ULL,
+ 0x2557339fee9840efULL,
+ 0xee8c0dfb45ee5d8eULL,
+ 0x5da24145464902e1ULL,
+ 0x1a083bacedaefdd5ULL,
+ 0xa9267712ee09a2baULL,
+ 0x955cce7fba6103bdULL,
+ 0x267282c1b9c65cd2ULL,
+ 0x61d8f8281221a3e6ULL,
+ 0xd2f6b4961186fc89ULL,
+ 0x9f8169ba49a54b33ULL,
+ 0x2caf25044a02145cULL,
+ 0x6b055fede1e5eb68ULL,
+ 0xd82b1353e242b407ULL,
+ 0xe451aa3eb62a1500ULL,
+ 0x577fe680b58d4a6fULL,
+ 0x10d59c691e6ab55bULL,
+ 0xa3fbd0d71dcdea34ULL,
+ 0x6820eeb3b6bbf755ULL,
+ 0xdb0ea20db51ca83aULL,
+ 0x9ca4d8e41efb570eULL,
+ 0x2f8a945a1d5c0861ULL,
+ 0x13f02d374934a966ULL,
+ 0xa0de61894a93f609ULL,
+ 0xe7741b60e174093dULL,
+ 0x545a57dee2d35652ULL,
+ 0xe21ac88218962d7aULL,
+ 0x5134843c1b317215ULL,
+ 0x169efed5b0d68d21ULL,
+ 0xa5b0b26bb371d24eULL,
+ 0x99ca0b06e7197349ULL,
+ 0x2ae447b8e4be2c26ULL,
+ 0x6d4e3d514f59d312ULL,
+ 0xde6071ef4cfe8c7dULL,
+ 0x15bb4f8be788911cULL,
+ 0xa6950335e42fce73ULL,
+ 0xe13f79dc4fc83147ULL,
+ 0x521135624c6f6e28ULL,
+ 0x6e6b8c0f1807cf2fULL,
+ 0xdd45c0b11ba09040ULL,
+ 0x9aefba58b0476f74ULL,
+ 0x29c1f6e6b3e0301bULL,
+ 0xc96c5795d7870f42ULL,
+ 0x7a421b2bd420502dULL,
+ 0x3de861c27fc7af19ULL,
+ 0x8ec62d7c7c60f076ULL,
+ 0xb2bc941128085171ULL,
+ 0x0192d8af2baf0e1eULL,
+ 0x4638a2468048f12aULL,
+ 0xf516eef883efae45ULL,
+ 0x3ecdd09c2899b324ULL,
+ 0x8de39c222b3eec4bULL,
+ 0xca49e6cb80d9137fULL,
+ 0x7967aa75837e4c10ULL,
+ 0x451d1318d716ed17ULL,
+ 0xf6335fa6d4b1b278ULL,
+ 0xb199254f7f564d4cULL,
+ 0x02b769f17cf11223ULL,
+ 0xb4f7f6ad86b4690bULL,
+ 0x07d9ba1385133664ULL,
+ 0x4073c0fa2ef4c950ULL,
+ 0xf35d8c442d53963fULL,
+ 0xcf273529793b3738ULL,
+ 0x7c0979977a9c6857ULL,
+ 0x3ba3037ed17b9763ULL,
+ 0x888d4fc0d2dcc80cULL,
+ 0x435671a479aad56dULL,
+ 0xf0783d1a7a0d8a02ULL,
+ 0xb7d247f3d1ea7536ULL,
+ 0x04fc0b4dd24d2a59ULL,
+ 0x3886b22086258b5eULL,
+ 0x8ba8fe9e8582d431ULL,
+ 0xcc0284772e652b05ULL,
+ 0x7f2cc8c92dc2746aULL,
+ 0x325b15e575e1c3d0ULL,
+ 0x8175595b76469cbfULL,
+ 0xc6df23b2dda1638bULL,
+ 0x75f16f0cde063ce4ULL,
+ 0x498bd6618a6e9de3ULL,
+ 0xfaa59adf89c9c28cULL,
+ 0xbd0fe036222e3db8ULL,
+ 0x0e21ac88218962d7ULL,
+ 0xc5fa92ec8aff7fb6ULL,
+ 0x76d4de52895820d9ULL,
+ 0x317ea4bb22bfdfedULL,
+ 0x8250e80521188082ULL,
+ 0xbe2a516875702185ULL,
+ 0x0d041dd676d77eeaULL,
+ 0x4aae673fdd3081deULL,
+ 0xf9802b81de97deb1ULL,
+ 0x4fc0b4dd24d2a599ULL,
+ 0xfceef8632775faf6ULL,
+ 0xbb44828a8c9205c2ULL,
+ 0x086ace348f355aadULL,
+ 0x34107759db5dfbaaULL,
+ 0x873e3be7d8faa4c5ULL,
+ 0xc094410e731d5bf1ULL,
+ 0x73ba0db070ba049eULL,
+ 0xb86133d4dbcc19ffULL,
+ 0x0b4f7f6ad86b4690ULL,
+ 0x4ce50583738cb9a4ULL,
+ 0xffcb493d702be6cbULL,
+ 0xc3b1f050244347ccULL,
+ 0x709fbcee27e418a3ULL,
+ 0x3735c6078c03e797ULL,
+ 0x841b8ab98fa4b8f8ULL,
+ 0xadda7c5f3c4488e3ULL,
+ 0x1ef430e13fe3d78cULL,
+ 0x595e4a08940428b8ULL,
+ 0xea7006b697a377d7ULL,
+ 0xd60abfdbc3cbd6d0ULL,
+ 0x6524f365c06c89bfULL,
+ 0x228e898c6b8b768bULL,
+ 0x91a0c532682c29e4ULL,
+ 0x5a7bfb56c35a3485ULL,
+ 0xe955b7e8c0fd6beaULL,
+ 0xaeffcd016b1a94deULL,
+ 0x1dd181bf68bdcbb1ULL,
+ 0x21ab38d23cd56ab6ULL,
+ 0x9285746c3f7235d9ULL,
+ 0xd52f0e859495caedULL,
+ 0x6601423b97329582ULL,
+ 0xd041dd676d77eeaaULL,
+ 0x636f91d96ed0b1c5ULL,
+ 0x24c5eb30c5374ef1ULL,
+ 0x97eba78ec690119eULL,
+ 0xab911ee392f8b099ULL,
+ 0x18bf525d915feff6ULL,
+ 0x5f1528b43ab810c2ULL,
+ 0xec3b640a391f4fadULL,
+ 0x27e05a6e926952ccULL,
+ 0x94ce16d091ce0da3ULL,
+ 0xd3646c393a29f297ULL,
+ 0x604a2087398eadf8ULL,
+ 0x5c3099ea6de60cffULL,
+ 0xef1ed5546e415390ULL,
+ 0xa8b4afbdc5a6aca4ULL,
+ 0x1b9ae303c601f3cbULL,
+ 0x56ed3e2f9e224471ULL,
+ 0xe5c372919d851b1eULL,
+ 0xa26908783662e42aULL,
+ 0x114744c635c5bb45ULL,
+ 0x2d3dfdab61ad1a42ULL,
+ 0x9e13b115620a452dULL,
+ 0xd9b9cbfcc9edba19ULL,
+ 0x6a978742ca4ae576ULL,
+ 0xa14cb926613cf817ULL,
+ 0x1262f598629ba778ULL,
+ 0x55c88f71c97c584cULL,
+ 0xe6e6c3cfcadb0723ULL,
+ 0xda9c7aa29eb3a624ULL,
+ 0x69b2361c9d14f94bULL,
+ 0x2e184cf536f3067fULL,
+ 0x9d36004b35545910ULL,
+ 0x2b769f17cf112238ULL,
+ 0x9858d3a9ccb67d57ULL,
+ 0xdff2a94067518263ULL,
+ 0x6cdce5fe64f6dd0cULL,
+ 0x50a65c93309e7c0bULL,
+ 0xe388102d33392364ULL,
+ 0xa4226ac498dedc50ULL,
+ 0x170c267a9b79833fULL,
+ 0xdcd7181e300f9e5eULL,
+ 0x6ff954a033a8c131ULL,
+ 0x28532e49984f3e05ULL,
+ 0x9b7d62f79be8616aULL,
+ 0xa707db9acf80c06dULL,
+ 0x14299724cc279f02ULL,
+ 0x5383edcd67c06036ULL,
+ 0xe0ada17364673f59ULL
+ }
+};
+
+
+/*
+ * crc64_ecma_seed - Returns the default CRC64 ECMA seed (initial value).
+ */
+u64 crc64_ecma_seed(void)
+{
+ return CRC64_ECMA_182.seed;
+}
+EXPORT_SYMBOL(crc64_ecma_seed);
+
+/*
+ * crc64_ecma - Computes the 64 bit ECMA CRC.
+ *
+ * pdata: pointer to the data to compute checksum for.
+ * nbytes: number of bytes in data buffer.
+ * seed: CRC seed.
+ */
+u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed)
+{
+ unsigned int i;
+ u64 crc = seed;
+
+ for (i = 0; i < nbytes; i++)
+ crc = CRC64_ECMA_182.table[(crc ^ pdata[i]) & CRC64_BYTE_MASK] ^
+ (crc >> 8);
+
+ return crc;
+}
+EXPORT_SYMBOL(crc64_ecma);
+
+MODULE_DESCRIPTION("CRC64 ECMA function");
+MODULE_AUTHOR("Freescale Semiconductor Inc.");
+MODULE_LICENSE("GPL");
diff --git a/lib/genalloc.c b/lib/genalloc.c
index daf0afb6d979..116a166b096f 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -160,6 +160,7 @@ struct gen_pool *gen_pool_create(int min_alloc_order, int nid)
pool->min_alloc_order = min_alloc_order;
pool->algo = gen_pool_first_fit;
pool->data = NULL;
+ pool->name = NULL;
}
return pool;
}
@@ -252,8 +253,8 @@ void gen_pool_destroy(struct gen_pool *pool)
kfree(chunk);
}
+ kfree_const(pool->name);
kfree(pool);
- return;
}
EXPORT_SYMBOL(gen_pool_destroy);
@@ -570,53 +571,88 @@ static void devm_gen_pool_release(struct device *dev, void *res)
gen_pool_destroy(*(struct gen_pool **)res);
}
+static int devm_gen_pool_match(struct device *dev, void *res, void *data)
+{
+ struct gen_pool **p = res;
+
+ /* NULL data matches only a pool without an assigned name */
+ if (!data && !(*p)->name)
+ return 1;
+
+ if (!data || !(*p)->name)
+ return 0;
+
+ return !strcmp((*p)->name, data);
+}
+
+/**
+ * gen_pool_get - Obtain the gen_pool (if any) for a device
+ * @dev: device to retrieve the gen_pool from
+ * @name: name of a gen_pool or NULL, identifies a particular gen_pool on device
+ *
+ * Returns the gen_pool for the device if one is present, or NULL.
+ */
+struct gen_pool *gen_pool_get(struct device *dev, const char *name)
+{
+ struct gen_pool **p;
+
+ p = devres_find(dev, devm_gen_pool_release, devm_gen_pool_match,
+ (void *)name);
+ if (!p)
+ return NULL;
+ return *p;
+}
+EXPORT_SYMBOL_GPL(gen_pool_get);
+
/**
* devm_gen_pool_create - managed gen_pool_create
* @dev: device that provides the gen_pool
* @min_alloc_order: log base 2 of number of bytes each bitmap bit represents
- * @nid: node id of the node the pool structure should be allocated on, or -1
+ * @nid: node selector for allocated gen_pool, %NUMA_NO_NODE for all nodes
+ * @name: name of a gen_pool or NULL, identifies a particular gen_pool on device
*
* Create a new special memory pool that can be used to manage special purpose
* memory not managed by the regular kmalloc/kfree interface. The pool will be
* automatically destroyed by the device management code.
*/
struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order,
- int nid)
+ int nid, const char *name)
{
struct gen_pool **ptr, *pool;
+ const char *pool_name = NULL;
+
+ /* Check that genpool to be created is uniquely addressed on device */
+ if (gen_pool_get(dev, name))
+ return ERR_PTR(-EINVAL);
+
+ if (name) {
+ pool_name = kstrdup_const(name, GFP_KERNEL);
+ if (!pool_name)
+ return ERR_PTR(-ENOMEM);
+ }
ptr = devres_alloc(devm_gen_pool_release, sizeof(*ptr), GFP_KERNEL);
if (!ptr)
- return NULL;
+ goto free_pool_name;
pool = gen_pool_create(min_alloc_order, nid);
- if (pool) {
- *ptr = pool;
- devres_add(dev, ptr);
- } else {
- devres_free(ptr);
- }
+ if (!pool)
+ goto free_devres;
+
+ *ptr = pool;
+ pool->name = pool_name;
+ devres_add(dev, ptr);
return pool;
-}
-EXPORT_SYMBOL(devm_gen_pool_create);
-/**
- * gen_pool_get - Obtain the gen_pool (if any) for a device
- * @dev: device to retrieve the gen_pool from
- *
- * Returns the gen_pool for the device if one is present, or NULL.
- */
-struct gen_pool *gen_pool_get(struct device *dev)
-{
- struct gen_pool **p = devres_find(dev, devm_gen_pool_release, NULL,
- NULL);
+free_devres:
+ devres_free(ptr);
+free_pool_name:
+ kfree_const(pool_name);
- if (!p)
- return NULL;
- return *p;
+ return ERR_PTR(-ENOMEM);
}
-EXPORT_SYMBOL_GPL(gen_pool_get);
+EXPORT_SYMBOL(devm_gen_pool_create);
#ifdef CONFIG_OF
/**
@@ -633,16 +669,30 @@ struct gen_pool *of_gen_pool_get(struct device_node *np,
const char *propname, int index)
{
struct platform_device *pdev;
- struct device_node *np_pool;
+ struct device_node *np_pool, *parent;
+ const char *name = NULL;
+ struct gen_pool *pool = NULL;
np_pool = of_parse_phandle(np, propname, index);
if (!np_pool)
return NULL;
+
pdev = of_find_device_by_node(np_pool);
+ if (!pdev) {
+ /* Check if named gen_pool is created by parent node device */
+ parent = of_get_parent(np_pool);
+ pdev = of_find_device_by_node(parent);
+ of_node_put(parent);
+
+ of_property_read_string(np_pool, "label", &name);
+ if (!name)
+ name = np_pool->name;
+ }
+ if (pdev)
+ pool = gen_pool_get(&pdev->dev, name);
of_node_put(np_pool);
- if (!pdev)
- return NULL;
- return gen_pool_get(&pdev->dev);
+
+ return pool;
}
EXPORT_SYMBOL_GPL(of_gen_pool_get);
#endif /* CONFIG_OF */
diff --git a/lib/iommu-common.c b/lib/iommu-common.c
index df30632f0bef..ff19f66d3f7f 100644
--- a/lib/iommu-common.c
+++ b/lib/iommu-common.c
@@ -119,7 +119,7 @@ unsigned long iommu_tbl_range_alloc(struct device *dev,
unsigned long align_mask = 0;
if (align_order > 0)
- align_mask = 0xffffffffffffffffl >> (64 - align_order);
+ align_mask = ~0ul >> (BITS_PER_LONG - align_order);
/* Sanity check */
if (unlikely(npages == 0)) {
diff --git a/lib/kstrtox.c b/lib/kstrtox.c
index ec8da78df9be..1698b286d954 100644
--- a/lib/kstrtox.c
+++ b/lib/kstrtox.c
@@ -20,22 +20,6 @@
#include <asm/uaccess.h>
#include "kstrtox.h"
-const char *_parse_integer_fixup_radix(const char *s, unsigned int *base)
-{
- if (*base == 0) {
- if (s[0] == '0') {
- if (_tolower(s[1]) == 'x' && isxdigit(s[2]))
- *base = 16;
- else
- *base = 8;
- } else
- *base = 10;
- }
- if (*base == 16 && s[0] == '0' && _tolower(s[1]) == 'x')
- s += 2;
- return s;
-}
-
/*
* Convert non-negative integer string representation in explicitly given radix
* to an integer.
@@ -83,244 +67,6 @@ unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long
return rv;
}
-static int _kstrtoull(const char *s, unsigned int base, unsigned long long *res)
-{
- unsigned long long _res;
- unsigned int rv;
-
- s = _parse_integer_fixup_radix(s, &base);
- rv = _parse_integer(s, base, &_res);
- if (rv & KSTRTOX_OVERFLOW)
- return -ERANGE;
- if (rv == 0)
- return -EINVAL;
- s += rv;
- if (*s == '\n')
- s++;
- if (*s)
- return -EINVAL;
- *res = _res;
- return 0;
-}
-
-/**
- * kstrtoull - convert a string to an unsigned long long
- * @s: The start of the string. The string must be null-terminated, and may also
- * include a single newline before its terminating null. The first character
- * may also be a plus sign, but not a minus sign.
- * @base: The number base to use. The maximum supported base is 16. If base is
- * given as 0, then the base of the string is automatically detected with the
- * conventional semantics - If it begins with 0x the number will be parsed as a
- * hexadecimal (case insensitive), if it otherwise begins with 0, it will be
- * parsed as an octal number. Otherwise it will be parsed as a decimal.
- * @res: Where to write the result of the conversion on success.
- *
- * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
- * Used as a replacement for the obsolete simple_strtoull. Return code must
- * be checked.
- */
-int kstrtoull(const char *s, unsigned int base, unsigned long long *res)
-{
- if (s[0] == '+')
- s++;
- return _kstrtoull(s, base, res);
-}
-EXPORT_SYMBOL(kstrtoull);
-
-/**
- * kstrtoll - convert a string to a long long
- * @s: The start of the string. The string must be null-terminated, and may also
- * include a single newline before its terminating null. The first character
- * may also be a plus sign or a minus sign.
- * @base: The number base to use. The maximum supported base is 16. If base is
- * given as 0, then the base of the string is automatically detected with the
- * conventional semantics - If it begins with 0x the number will be parsed as a
- * hexadecimal (case insensitive), if it otherwise begins with 0, it will be
- * parsed as an octal number. Otherwise it will be parsed as a decimal.
- * @res: Where to write the result of the conversion on success.
- *
- * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
- * Used as a replacement for the obsolete simple_strtoull. Return code must
- * be checked.
- */
-int kstrtoll(const char *s, unsigned int base, long long *res)
-{
- unsigned long long tmp;
- int rv;
-
- if (s[0] == '-') {
- rv = _kstrtoull(s + 1, base, &tmp);
- if (rv < 0)
- return rv;
- if ((long long)(-tmp) >= 0)
- return -ERANGE;
- *res = -tmp;
- } else {
- rv = kstrtoull(s, base, &tmp);
- if (rv < 0)
- return rv;
- if ((long long)tmp < 0)
- return -ERANGE;
- *res = tmp;
- }
- return 0;
-}
-EXPORT_SYMBOL(kstrtoll);
-
-/* Internal, do not use. */
-int _kstrtoul(const char *s, unsigned int base, unsigned long *res)
-{
- unsigned long long tmp;
- int rv;
-
- rv = kstrtoull(s, base, &tmp);
- if (rv < 0)
- return rv;
- if (tmp != (unsigned long long)(unsigned long)tmp)
- return -ERANGE;
- *res = tmp;
- return 0;
-}
-EXPORT_SYMBOL(_kstrtoul);
-
-/* Internal, do not use. */
-int _kstrtol(const char *s, unsigned int base, long *res)
-{
- long long tmp;
- int rv;
-
- rv = kstrtoll(s, base, &tmp);
- if (rv < 0)
- return rv;
- if (tmp != (long long)(long)tmp)
- return -ERANGE;
- *res = tmp;
- return 0;
-}
-EXPORT_SYMBOL(_kstrtol);
-
-/**
- * kstrtouint - convert a string to an unsigned int
- * @s: The start of the string. The string must be null-terminated, and may also
- * include a single newline before its terminating null. The first character
- * may also be a plus sign, but not a minus sign.
- * @base: The number base to use. The maximum supported base is 16. If base is
- * given as 0, then the base of the string is automatically detected with the
- * conventional semantics - If it begins with 0x the number will be parsed as a
- * hexadecimal (case insensitive), if it otherwise begins with 0, it will be
- * parsed as an octal number. Otherwise it will be parsed as a decimal.
- * @res: Where to write the result of the conversion on success.
- *
- * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
- * Used as a replacement for the obsolete simple_strtoull. Return code must
- * be checked.
- */
-int kstrtouint(const char *s, unsigned int base, unsigned int *res)
-{
- unsigned long long tmp;
- int rv;
-
- rv = kstrtoull(s, base, &tmp);
- if (rv < 0)
- return rv;
- if (tmp != (unsigned long long)(unsigned int)tmp)
- return -ERANGE;
- *res = tmp;
- return 0;
-}
-EXPORT_SYMBOL(kstrtouint);
-
-/**
- * kstrtoint - convert a string to an int
- * @s: The start of the string. The string must be null-terminated, and may also
- * include a single newline before its terminating null. The first character
- * may also be a plus sign or a minus sign.
- * @base: The number base to use. The maximum supported base is 16. If base is
- * given as 0, then the base of the string is automatically detected with the
- * conventional semantics - If it begins with 0x the number will be parsed as a
- * hexadecimal (case insensitive), if it otherwise begins with 0, it will be
- * parsed as an octal number. Otherwise it will be parsed as a decimal.
- * @res: Where to write the result of the conversion on success.
- *
- * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
- * Used as a replacement for the obsolete simple_strtoull. Return code must
- * be checked.
- */
-int kstrtoint(const char *s, unsigned int base, int *res)
-{
- long long tmp;
- int rv;
-
- rv = kstrtoll(s, base, &tmp);
- if (rv < 0)
- return rv;
- if (tmp != (long long)(int)tmp)
- return -ERANGE;
- *res = tmp;
- return 0;
-}
-EXPORT_SYMBOL(kstrtoint);
-
-int kstrtou16(const char *s, unsigned int base, u16 *res)
-{
- unsigned long long tmp;
- int rv;
-
- rv = kstrtoull(s, base, &tmp);
- if (rv < 0)
- return rv;
- if (tmp != (unsigned long long)(u16)tmp)
- return -ERANGE;
- *res = tmp;
- return 0;
-}
-EXPORT_SYMBOL(kstrtou16);
-
-int kstrtos16(const char *s, unsigned int base, s16 *res)
-{
- long long tmp;
- int rv;
-
- rv = kstrtoll(s, base, &tmp);
- if (rv < 0)
- return rv;
- if (tmp != (long long)(s16)tmp)
- return -ERANGE;
- *res = tmp;
- return 0;
-}
-EXPORT_SYMBOL(kstrtos16);
-
-int kstrtou8(const char *s, unsigned int base, u8 *res)
-{
- unsigned long long tmp;
- int rv;
-
- rv = kstrtoull(s, base, &tmp);
- if (rv < 0)
- return rv;
- if (tmp != (unsigned long long)(u8)tmp)
- return -ERANGE;
- *res = tmp;
- return 0;
-}
-EXPORT_SYMBOL(kstrtou8);
-
-int kstrtos8(const char *s, unsigned int base, s8 *res)
-{
- long long tmp;
- int rv;
-
- rv = kstrtoll(s, base, &tmp);
- if (rv < 0)
- return rv;
- if (tmp != (long long)(s8)tmp)
- return -ERANGE;
- *res = tmp;
- return 0;
-}
-EXPORT_SYMBOL(kstrtos8);
-
#define kstrto_from_user(f, g, type) \
int f(const char __user *s, size_t count, unsigned int base, type *res) \
{ \
diff --git a/lib/kstrtox.h b/lib/kstrtox.h
index f13eeeaf441d..7b1f447cbcc1 100644
--- a/lib/kstrtox.h
+++ b/lib/kstrtox.h
@@ -2,7 +2,6 @@
#define _LIB_KSTRTOX_H
#define KSTRTOX_OVERFLOW (1U << 31)
-const char *_parse_integer_fixup_radix(const char *s, unsigned int *base);
unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long *res);
#endif
diff --git a/lib/parse-integer.c b/lib/parse-integer.c
new file mode 100644
index 000000000000..7c7f48bec328
--- /dev/null
+++ b/lib/parse-integer.c
@@ -0,0 +1,222 @@
+/*
+ * See parse_integer().
+ *
+ * Individual dispatch functions in this file aren't supposed to be used
+ * directly and thus aren't advertised and documented despite being exported.
+ *
+ * Do not use any function in this file for any reason.
+ */
+#include <linux/ctype.h>
+#include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/math64.h>
+#include <linux/parse-integer.h>
+#include <asm/bug.h>
+
+/*
+ * Work out the radix of the integer at @s and skip a hexadecimal prefix.
+ *
+ * When *@base is 0 it is replaced by the detected radix: 16 for "0x"/"0X"
+ * followed by a hex digit, 8 for any other leading '0', 10 otherwise.
+ * When the radix is 16, a leading "0x"/"0X" is stepped over.
+ * A resulting radix outside 2..16 triggers BUG_ON().
+ *
+ * Returns the position at which digit conversion should start.
+ */
+const char *_parse_integer_fixup_radix(const char *s, unsigned int *base)
+{
+	unsigned int radix = *base;
+
+	if (radix == 0) {
+		if (s[0] != '0')
+			radix = 10;
+		else if (_tolower(s[1]) == 'x' && isxdigit(s[2]))
+			radix = 16;
+		else
+			radix = 8;
+	}
+	if (radix == 16 && s[0] == '0' && _tolower(s[1]) == 'x')
+		s += 2;
+	*base = radix;
+	BUG_ON(radix < 2 || radix > 16);
+	return s;
+}
+
+/*
+ * Convert the leading run of digits at @s to an unsigned long long.
+ *
+ * @base is resolved by _parse_integer_fixup_radix(), which may also step
+ * over a "0x" prefix. Conversion stops at the terminating NUL, at the first
+ * character that is neither 0-9 nor a-f/A-F, or at the first digit whose
+ * value is not below the radix.
+ *
+ * Returns the number of characters consumed, counted from the original @s
+ * (so a skipped prefix is included), and stores the value in *@val.
+ * Returns -EINVAL if no digit was converted and -ERANGE if the value does
+ * not fit in unsigned long long; *@val is left untouched in both cases.
+ */
+static int ___parse_integer(const char *s, unsigned int base, unsigned long long *val)
+{
+ /* s0: start as passed by the caller, sd: first digit after any prefix. */
+ const char *s0 = s, *sd;
+ unsigned long long acc;
+
+ s = sd = _parse_integer_fixup_radix(s0, &base);
+ acc = 0;
+ while (*s) {
+ unsigned int d;
+
+ if ('0' <= *s && *s <= '9')
+ d = *s - '0';
+ else if ('a' <= _tolower(*s) && _tolower(*s) <= 'f')
+ d = _tolower(*s) - 'a' + 10;
+ else
+ break;
+ if (d >= base)
+ break;
+ /*
+ * The radix is at most 16, so acc * base + d cannot overflow while
+ * acc has no bits set at or above bit 60; the division is only
+ * performed once that cheap test fires.
+ */
+ if ((acc >> 60) && acc > div_u64(ULLONG_MAX - d, base))
+ return -ERANGE;
+ acc = acc * base + d;
+ s++;
+ }
+ /* At least one digit has to be converted. */
+ if (s == sd)
+ return -EINVAL;
+ *val = acc;
+ /* Radix 1 is not supported otherwise returned length can overflow. */
+ return s - s0;
+}
+
+/*
+ * Unsigned conversion with optional whole-string checking.
+ *
+ * The PARSE_INTEGER_NEWLINE bit is stripped from @base before the digits are
+ * converted. When the bit is set, the number must be followed by either the
+ * end of the string or by exactly one '\n' and then the end of the string;
+ * anything else yields -EINVAL.
+ *
+ * Returns the length reported by ___parse_integer() (the optional newline is
+ * not counted) or a negative errno; *@val is written only on success.
+ */
+static int __parse_integer(const char *s, unsigned int base, unsigned long long *val)
+{
+	unsigned long long parsed;
+	const char *tail;
+	int len;
+
+	len = ___parse_integer(s, base & ~PARSE_INTEGER_NEWLINE, &parsed);
+	if (len < 0)
+		return len;
+
+	if (base & PARSE_INTEGER_NEWLINE) {
+		/* Accept "integer\0" or "integer\n\0" */
+		tail = s + len;
+		if (tail[0] == '\n')
+			tail++;
+		if (tail[0] != '\0')
+			return -EINVAL;
+	}
+
+	*val = parsed;
+	return len;
+}
+
+/*
+ * Parse an unsigned long long. A leading '+' is accepted and counted in the
+ * returned length; a leading '-' is rejected with -EINVAL.
+ *
+ * Returns 0 on success when PARSE_INTEGER_NEWLINE is set in @base, otherwise
+ * the number of characters consumed, or a negative errno.
+ */
+int _parse_integer_ull(const char *s, unsigned int base, unsigned long long *val)
+{
+	int plus = 0;
+	int len;
+
+	if (*s == '-')
+		return -EINVAL;
+	if (*s == '+') {
+		plus = 1;
+		s++;
+	}
+
+	len = __parse_integer(s, base, val);
+	if (len < 0)
+		return len;
+	return (base & PARSE_INTEGER_NEWLINE) ? 0 : len + plus;
+}
+EXPORT_SYMBOL(_parse_integer_ull);
+
+/*
+ * Parse a long long. One leading '-' or '+' is accepted and counted in the
+ * returned length. The magnitude is converted as unsigned and then checked
+ * against the signed range: -ERANGE is returned when it does not fit.
+ *
+ * Returns 0 on success when PARSE_INTEGER_NEWLINE is set in @base, otherwise
+ * the number of characters consumed, or a negative errno.
+ */
+int _parse_integer_ll(const char *s, unsigned int base, long long *val)
+{
+	const char first = *s;
+	const int has_sign = (first == '-' || first == '+');
+	unsigned long long magnitude;
+	int len;
+
+	if (has_sign)
+		s++;
+
+	len = __parse_integer(s, base, &magnitude);
+	if (len < 0)
+		return len;
+
+	if (first == '-') {
+		/* The negated magnitude must not come out positive. */
+		if ((long long)-magnitude > 0)
+			return -ERANGE;
+		*val = -magnitude;
+	} else {
+		/* The magnitude must not reach into the sign bit. */
+		if ((long long)magnitude < 0)
+			return -ERANGE;
+		*val = magnitude;
+	}
+	return (base & PARSE_INTEGER_NEWLINE) ? 0 : len + has_sign;
+}
+EXPORT_SYMBOL(_parse_integer_ll);
+
+/*
+ * Parse an unsigned int: _parse_integer_ull() plus a range check.
+ * Returns what _parse_integer_ull() returned, or -ERANGE if the value does
+ * not fit; *@val is written only on success.
+ */
+int _parse_integer_u(const char *s, unsigned int base, unsigned int *val)
+{
+	unsigned long long wide;
+	int len = _parse_integer_ull(s, base, &wide);
+
+	if (len < 0)
+		return len;
+	/* The value must survive a round trip through the narrow type. */
+	if ((unsigned int)wide != wide)
+		return -ERANGE;
+	*val = wide;
+	return len;
+}
+EXPORT_SYMBOL(_parse_integer_u);
+
+/*
+ * Parse an int: _parse_integer_ll() plus a range check.
+ * Returns what _parse_integer_ll() returned, or -ERANGE if the value does
+ * not fit; *@val is written only on success.
+ */
+int _parse_integer_i(const char *s, unsigned int base, int *val)
+{
+	long long wide;
+	int len = _parse_integer_ll(s, base, &wide);
+
+	if (len < 0)
+		return len;
+	/* The value must survive a round trip through the narrow type. */
+	if ((int)wide != wide)
+		return -ERANGE;
+	*val = wide;
+	return len;
+}
+EXPORT_SYMBOL(_parse_integer_i);
+
+/*
+ * Parse an unsigned short: _parse_integer_ull() plus a range check.
+ * Returns what _parse_integer_ull() returned, or -ERANGE if the value does
+ * not fit; *@val is written only on success.
+ */
+int _parse_integer_us(const char *s, unsigned int base, unsigned short *val)
+{
+	unsigned long long wide;
+	int len = _parse_integer_ull(s, base, &wide);
+
+	if (len < 0)
+		return len;
+	/* The value must survive a round trip through the narrow type. */
+	if ((unsigned short)wide != wide)
+		return -ERANGE;
+	*val = wide;
+	return len;
+}
+EXPORT_SYMBOL(_parse_integer_us);
+
+/*
+ * Parse a short: _parse_integer_ll() plus a range check.
+ * Returns what _parse_integer_ll() returned, or -ERANGE if the value does
+ * not fit; *@val is written only on success.
+ */
+int _parse_integer_s(const char *s, unsigned int base, short *val)
+{
+	long long wide;
+	int len = _parse_integer_ll(s, base, &wide);
+
+	if (len < 0)
+		return len;
+	/* The value must survive a round trip through the narrow type. */
+	if ((short)wide != wide)
+		return -ERANGE;
+	*val = wide;
+	return len;
+}
+EXPORT_SYMBOL(_parse_integer_s);
+
+/*
+ * Parse an unsigned char: _parse_integer_ull() plus a range check.
+ * Returns what _parse_integer_ull() returned, or -ERANGE if the value does
+ * not fit; *@val is written only on success.
+ */
+int _parse_integer_uc(const char *s, unsigned int base, unsigned char *val)
+{
+	unsigned long long wide;
+	int len = _parse_integer_ull(s, base, &wide);
+
+	if (len < 0)
+		return len;
+	/* The value must survive a round trip through the narrow type. */
+	if ((unsigned char)wide != wide)
+		return -ERANGE;
+	*val = wide;
+	return len;
+}
+EXPORT_SYMBOL(_parse_integer_uc);
+
+/*
+ * Parse a signed char: _parse_integer_ll() plus a range check.
+ * Returns what _parse_integer_ll() returned, or -ERANGE if the value does
+ * not fit; *@val is written only on success.
+ */
+int _parse_integer_sc(const char *s, unsigned int base, signed char *val)
+{
+	long long wide;
+	int len = _parse_integer_ll(s, base, &wide);
+
+	if (len < 0)
+		return len;
+	/* The value must survive a round trip through the narrow type. */
+	if ((signed char)wide != wide)
+		return -ERANGE;
+	*val = wide;
+	return len;
+}
+EXPORT_SYMBOL(_parse_integer_sc);
diff --git a/lib/parser.c b/lib/parser.c
index b6d11631231b..f00386777278 100644
--- a/lib/parser.c
+++ b/lib/parser.c
@@ -44,7 +44,7 @@ static int match_one(char *s, const char *p, substring_t args[])
p = meta + 1;
if (isdigit(*p))
- len = simple_strtoul(p, (char **) &p, 10);
+ p += parse_integer(p, 10, (unsigned int *)&len);
else if (*p == '%') {
if (*s++ != '%')
return 0;
@@ -57,6 +57,11 @@ static int match_one(char *s, const char *p, substring_t args[])
args[argc].from = s;
switch (*p++) {
+ union {
+ int i;
+ unsigned int u;
+ } u;
+
case 's': {
size_t str_len = strlen(s);
@@ -68,19 +73,20 @@ static int match_one(char *s, const char *p, substring_t args[])
break;
}
case 'd':
- simple_strtol(s, &args[argc].to, 0);
+ len = parse_integer(s, 0, &u.i);
goto num;
case 'u':
- simple_strtoul(s, &args[argc].to, 0);
+ len = parse_integer(s, 0, &u.u);
goto num;
case 'o':
- simple_strtoul(s, &args[argc].to, 8);
+ len = parse_integer(s, 8, &u.u);
goto num;
case 'x':
- simple_strtoul(s, &args[argc].to, 16);
+ len = parse_integer(s, 16, &u.u);
num:
- if (args[argc].to == args[argc].from)
+ if (len < 0)
return 0;
+ args[argc].to = args[argc].from + len;
break;
default:
return 0;
@@ -127,10 +133,8 @@ EXPORT_SYMBOL(match_token);
*/
static int match_number(substring_t *s, int *result, int base)
{
- char *endp;
char *buf;
int ret;
- long val;
size_t len = s->to - s->from;
buf = kmalloc(len + 1, GFP_KERNEL);
@@ -139,16 +143,11 @@ static int match_number(substring_t *s, int *result, int base)
memcpy(buf, s->from, len);
buf[len] = '\0';
- ret = 0;
- val = simple_strtol(buf, &endp, base);
- if (endp == buf)
- ret = -EINVAL;
- else if (val < (long)INT_MIN || val > (long)INT_MAX)
- ret = -ERANGE;
- else
- *result = (int) val;
+ ret = parse_integer(buf, base, result);
kfree(buf);
- return ret;
+ if (ret < 0)
+ return ret;
+ return 0;
}
/**
diff --git a/lib/show_mem.c b/lib/show_mem.c
index adc98e1825ba..1feed6a2b12a 100644
--- a/lib/show_mem.c
+++ b/lib/show_mem.c
@@ -38,11 +38,9 @@ void show_mem(unsigned int filter)
printk("%lu pages RAM\n", total);
printk("%lu pages HighMem/MovableOnly\n", highmem);
+ printk("%lu pages reserved\n", reserved);
#ifdef CONFIG_CMA
- printk("%lu pages reserved\n", (reserved - totalcma_pages));
printk("%lu pages cma reserved\n", totalcma_pages);
-#else
- printk("%lu pages reserved\n", reserved);
#endif
#ifdef CONFIG_QUICKLIST
printk("%lu pages in pagetable cache\n",
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 76f29ecba8f4..caabc7151b90 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -100,7 +100,7 @@ static int __init
setup_io_tlb_npages(char *str)
{
if (isdigit(*str)) {
- io_tlb_nslabs = simple_strtoul(str, &str, 0);
+ str += parse_integer(str, 0, &io_tlb_nslabs);
/* avoid tail segment of size < IO_TLB_SEGSIZE */
io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
}
diff --git a/lib/test-kstrtox.c b/lib/test-kstrtox.c
index 4137bca5f8e8..f355f67169b6 100644
--- a/lib/test-kstrtox.c
+++ b/lib/test-kstrtox.c
@@ -260,6 +260,7 @@ static void __init test_kstrtoll_ok(void)
{"4294967297", 10, 4294967297LL},
{"9223372036854775807", 10, 9223372036854775807LL},
+ {"-0", 10, 0LL},
{"-1", 10, -1LL},
{"-2", 10, -2LL},
{"-9223372036854775808", 10, LLONG_MIN},
@@ -277,11 +278,6 @@ static void __init test_kstrtoll_fail(void)
{"-9223372036854775809", 10},
{"-18446744073709551614", 10},
{"-18446744073709551615", 10},
- /* negative zero isn't an integer in Linux */
- {"-0", 0},
- {"-0", 8},
- {"-0", 10},
- {"-0", 16},
/* sign is first character if any */
{"-+1", 0},
{"-+1", 8},
diff --git a/lib/test-parse-integer.c b/lib/test-parse-integer.c
new file mode 100644
index 000000000000..4274603f4d1a
--- /dev/null
+++ b/lib/test-parse-integer.c
@@ -0,0 +1,563 @@
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/parse-integer.h>
+#include <asm/bug.h>
+
+/*
+ * Iterate @i over every index of the array @test.
+ * The index argument is parenthesized so any lvalue expression can be passed.
+ */
+#define for_each_test(i, test) \
+	for ((i) = 0; (i) < ARRAY_SIZE(test); (i)++)
+
+/*
+ * Declare the record type @test_type (passed as "struct <tag>") for tests
+ * that are expected to succeed and open the definition of the __initconst
+ * array @test of such records; the array initializer follows the macro
+ * invocation. Each record holds the input string, the base handed to
+ * parse_integer(), the expected return value and the expected parsed value
+ * of type @type.
+ */
+#define DEFINE_TEST_OK(type, test_type, test) \
+test_type { \
+ const char *str; \
+ unsigned int base; \
+ int expected_rv; \
+ type expected_val; \
+}; \
+static const test_type test[] __initconst =
+
+/*
+ * Run every record of the table @test through parse_integer() with a
+ * destination of type @type and WARN() when the return value or the stored
+ * value differs from the expectation. @fmt is the printf conversion used to
+ * print values of @type.
+ *
+ * The destination is seeded with the bitwise complement of the expected
+ * value, so it can never start out equal to it: a parse that returns the
+ * expected length without storing a result is reported, and the WARN() never
+ * prints an indeterminate value when the parse fails.
+ */
+#define TEST_OK(type, fmt, test) \
+{ \
+	unsigned int i; \
+ \
+	for_each_test(i, test) { \
+		const typeof(test[0]) *t = &test[i]; \
+		type val = ~t->expected_val; \
+		int rv; \
+ \
+		rv = parse_integer(t->str, t->base, &val); \
+		if (rv != t->expected_rv || val != t->expected_val) { \
+			WARN(1, "str '%s', base %u, expected %d/"fmt", got %d/"fmt"\n", \
+			     t->str, t->base, t->expected_rv, t->expected_val, rv, val); \
+		} \
+	} \
+}
+
+/* One input that parse_integer() is expected to reject. */
+struct test_fail {
+ const char *str; /* string handed to parse_integer() */
+ unsigned int base; /* base handed to parse_integer() */
+};
+
+/*
+ * Open the definition of the __initconst array @test of struct test_fail
+ * records; the array initializer follows the macro invocation.
+ * The @type argument is not used by the expansion.
+ */
+#define DEFINE_TEST_FAIL(type, test) \
+static const struct test_fail test[] __initconst =
+
+/*
+ * Run every record of the table @test through parse_integer() with a
+ * destination of type @type and WARN() unless the call fails (negative
+ * return value) and leaves the destination at its preset value 113.
+ * @fmt is the printf conversion used to print values of @type.
+ */
+#define TEST_FAIL(type, fmt, test) \
+{ \
+	unsigned int i; \
+ \
+	for_each_test(i, test) { \
+		const typeof(test[0]) *t = &test[i]; \
+		type val = 113; \
+		int rv = parse_integer(t->str, t->base, &val); \
+ \
+		if (!(rv < 0 && val == 113)) { \
+			WARN(1, "str '%s', base %u, expected -E, got %d/"fmt"\n",\
+			     t->str, t->base, rv, val); \
+		} \
+	} \
+}
+
+/*
+ * Inputs that must parse as unsigned long long.
+ * Record layout: {input, base, expected return value, expected value}.
+ */
+DEFINE_TEST_OK(unsigned long long, struct test_ull, test_ull_ok)
+{
+ {"0", 10, 1, 0},
+ {"1", 10, 1, 1},
+ {"2", 10, 1, 2},
+ {"3", 10, 1, 3},
+ {"4", 10, 1, 4},
+ {"5", 10, 1, 5},
+ {"6", 10, 1, 6},
+ {"7", 10, 1, 7},
+ {"8", 10, 1, 8},
+ {"9", 10, 1, 9},
+
+ {"0", 8, 1, 0},
+ {"1", 8, 1, 1},
+ {"2", 8, 1, 2},
+ {"3", 8, 1, 3},
+ {"4", 8, 1, 4},
+ {"5", 8, 1, 5},
+ {"6", 8, 1, 6},
+ {"7", 8, 1, 7},
+
+ {"0", 16, 1, 0},
+ {"1", 16, 1, 1},
+ {"2", 16, 1, 2},
+ {"3", 16, 1, 3},
+ {"4", 16, 1, 4},
+ {"5", 16, 1, 5},
+ {"6", 16, 1, 6},
+ {"7", 16, 1, 7},
+ {"8", 16, 1, 8},
+ {"9", 16, 1, 9},
+ {"a", 16, 1, 10},
+ {"b", 16, 1, 11},
+ {"c", 16, 1, 12},
+ {"d", 16, 1, 13},
+ {"e", 16, 1, 14},
+ {"f", 16, 1, 15},
+ {"A", 16, 1, 10},
+ {"B", 16, 1, 11},
+ {"C", 16, 1, 12},
+ {"D", 16, 1, 13},
+ {"E", 16, 1, 14},
+ {"F", 16, 1, 15},
+
+ {"127", 10, 3, 127},
+ {"128", 10, 3, 128},
+ {"255", 10, 3, 255},
+ {"256", 10, 3, 256},
+ {"32767", 10, 5, 32767},
+ {"32768", 10, 5, 32768},
+ {"65535", 10, 5, 65535},
+ {"65536", 10, 5, 65536},
+ {"2147483647", 10, 10, 2147483647},
+ {"2147483648", 10, 10, 2147483648ull},
+ {"4294967295", 10, 10, 4294967295ull},
+ {"4294967296", 10, 10, 4294967296},
+ {"9223372036854775807", 10, 19, 9223372036854775807},
+ {"9223372036854775808", 10, 19, 9223372036854775808ull},
+ {"18446744073709551615", 10, 20, 18446744073709551615ull},
+
+ {"177", 8, 3, 0177},
+ {"200", 8, 3, 0200},
+ {"377", 8, 3, 0377},
+ {"400", 8, 3, 0400},
+ {"77777", 8, 5, 077777},
+ {"100000", 8, 6, 0100000},
+ {"177777", 8, 6, 0177777},
+ {"200000", 8, 6, 0200000},
+ {"17777777777", 8, 11, 017777777777},
+ {"20000000000", 8, 11, 020000000000},
+ {"37777777777", 8, 11, 037777777777},
+ {"40000000000", 8, 11, 040000000000},
+ {"777777777777777777777", 8, 21, 0777777777777777777777},
+ {"1000000000000000000000", 8, 22, 01000000000000000000000},
+ {"1777777777777777777777", 8, 22, 01777777777777777777777},
+
+ {"7f", 16, 2, 0x7f},
+ {"80", 16, 2, 0x80},
+ {"ff", 16, 2, 0xff},
+ {"100", 16, 3, 0x100},
+ {"7fff", 16, 4, 0x7fff},
+ {"8000", 16, 4, 0x8000},
+ {"ffff", 16, 4, 0xffff},
+ {"10000", 16, 5, 0x10000},
+ {"7fffffff", 16, 8, 0x7fffffff},
+ {"80000000", 16, 8, 0x80000000},
+ {"ffffffff", 16, 8, 0xffffffff},
+ {"100000000", 16, 9, 0x100000000},
+ {"7fffffffffffffff", 16, 16, 0x7fffffffffffffff},
+ {"8000000000000000", 16, 16, 0x8000000000000000},
+ {"ffffffffffffffff", 16, 16, 0xffffffffffffffff},
+ /* test sign */
+ {"+0", 10, 2, 0},
+ {"+42", 10, 3, 42},
+ /* test termination */
+ {"42/", 10, 2, 42},
+ {"42:", 10, 2, 42},
+ {"42/", 8, 2, 042},
+ {"428", 8, 2, 042},
+ {"42/", 16, 2, 0x42},
+ {"42`", 16, 2, 0x42},
+ {"42g", 16, 2, 0x42},
+ {"42@", 16, 2, 0x42},
+ {"42G", 16, 2, 0x42},
+ /* base autodetection */
+ {"010", 0, 3, 8},
+ {"0x10", 0, 4, 16},
+ {"0X10", 0, 4, 16},
+};
+
+/* Check every test_ull_ok record with an unsigned long long destination. */
+static void __init test_parse_integer_ull_ok(void)
+{
+ TEST_OK(unsigned long long, "%llu", test_ull_ok);
+}
+
+/*
+ * Inputs that must be rejected when parsed as unsigned long long.
+ * Record layout: {input, base}.
+ */
+DEFINE_TEST_FAIL(unsigned long long, test_ull_fail)
+{
+ /* type overflow */
+ {"10000000000000000000000000000000000000000000000000000000000000000", 2},
+ {"18446744073709551616", 10},
+ {"2000000000000000000000", 8},
+ {"10000000000000000", 16},
+
+ {"", 0},
+ {"", 10},
+ {"", 8},
+ {"", 16},
+ {"+", 0},
+ {"+", 10},
+ {"+", 8},
+ {"+", 16},
+ {"-", 0},
+ {"-", 10},
+ {"-", 8},
+ {"-", 16},
+ {" ", 0},
+ {" ", 10},
+ {" ", 8},
+ {" ", 16},
+ {"\n", 0},
+ {"\n", 10},
+ {"\n", 8},
+ {"\n", 16},
+ {" 0", 0},
+ {" 0", 10},
+ {" 0", 8},
+ {" 0", 16},
+ {"\n0", 0},
+ {"\n0", 10},
+ {"\n0", 8},
+ {"\n0", 16},
+ /* non-digit */
+ {"/", 10},
+ {":", 10},
+ {"/", 8},
+ {"8", 8},
+ {"/", 16},
+ {":", 16},
+ {"`", 16},
+ {"g", 16},
+ {"@", 16},
+ {"G", 16},
+ {"/0", 10},
+ {":0", 10},
+ {"/0", 8},
+ {"80", 8},
+ {"/0", 16},
+ {":0", 16},
+ {"`0", 16},
+ {"g0", 16},
+ {"@0", 16},
+ {"G0", 16},
+
+ {"-0", 0},
+ {"-0", 10},
+ {"-0", 8},
+ {"-0", 16},
+ {"-1", 0},
+ {"-1", 10},
+ {"-1", 8},
+ {"-1", 16},
+ /* accept only one sign */
+ {"--", 0},
+ {"--", 10},
+ {"--", 8},
+ {"--", 16},
+ {"-+", 0},
+ {"-+", 10},
+ {"-+", 8},
+ {"-+", 16},
+ {"+-", 0},
+ {"+-", 10},
+ {"+-", 8},
+ {"+-", 16},
+ {"++", 0},
+ {"++", 10},
+ {"++", 8},
+ {"++", 16},
+ {"--0", 0},
+ {"--0", 10},
+ {"--0", 8},
+ {"--0", 16},
+ {"-+0", 0},
+ {"-+0", 10},
+ {"-+0", 8},
+ {"-+0", 16},
+ {"+-0", 0},
+ {"+-0", 10},
+ {"+-0", 8},
+ {"+-0", 16},
+ {"++0", 0},
+ {"++0", 10},
+ {"++0", 8},
+ {"++0", 16},
+};
+
+/* Check every test_ull_fail record with an unsigned long long destination. */
+static void __init test_parse_integer_ull_fail(void)
+{
+ TEST_FAIL(unsigned long long, "%llu", test_ull_fail);
+}
+
+/*
+ * Inputs that must parse as long long.
+ * Record layout: {input, base, expected return value, expected value}.
+ */
+DEFINE_TEST_OK(long long, struct test_ll, test_ll_ok)
+{
+ {"-9223372036854775808",10, 20, LLONG_MIN},
+ {"-4294967296", 10, 11, -4294967296},
+ {"-2147483648", 10, 11, -2147483648ll},
+ {"-65536", 10, 6, -65536},
+ {"-32768", 10, 6, -32768},
+ {"-256", 10, 4, -256},
+ {"-128", 10, 4, -128},
+ {"-0", 10, 2, 0},
+ {"0", 10, 1, 0},
+ {"127", 10, 3, 127},
+ {"255", 10, 3, 255},
+ {"32767", 10, 5, 32767},
+ {"65535", 10, 5, 65535},
+ {"2147483647", 10, 10, 2147483647},
+ {"4294967295", 10, 10, 4294967295ll},
+ {"9223372036854775807", 10, 19, 9223372036854775807},
+};
+
+/* Check every test_ll_ok record with a long long destination. */
+static void __init test_parse_integer_ll_ok(void)
+{
+ TEST_OK(long long, "%lld", test_ll_ok);
+}
+
+/*
+ * Inputs just outside the long long range that must be rejected.
+ * Record layout: {input, base}.
+ */
+DEFINE_TEST_FAIL(long long, test_ll_fail)
+{
+ {"-9223372036854775809", 10},
+ {"9223372036854775808", 10},
+};
+
+/* Check every test_ll_fail record with a long long destination. */
+static void __init test_parse_integer_ll_fail(void)
+{
+ TEST_FAIL(long long, "%lld", test_ll_fail);
+}
+
+/*
+ * Inputs that must parse as unsigned int.
+ * Record layout: {input, base, expected return value, expected value}.
+ */
+DEFINE_TEST_OK(unsigned int, struct test_u, test_u_ok)
+{
+ {"0", 10, 1, 0},
+ {"127", 10, 3, 127},
+ {"128", 10, 3, 128},
+ {"255", 10, 3, 255},
+ {"256", 10, 3, 256},
+ {"32767", 10, 5, 32767},
+ {"32768", 10, 5, 32768},
+ {"65535", 10, 5, 65535},
+ {"65536", 10, 5, 65536},
+ {"2147483647", 10, 10, 2147483647},
+ {"2147483648", 10, 10, 2147483648u},
+ {"4294967295", 10, 10, 4294967295u},
+};
+
+/* Check every test_u_ok record with an unsigned int destination. */
+static void __init test_parse_integer_u_ok(void)
+{
+ TEST_OK(unsigned int, "%u", test_u_ok);
+}
+
+/*
+ * Inputs that must be rejected when parsed as unsigned int.
+ * Record layout: {input, base}.
+ */
+DEFINE_TEST_FAIL(unsigned int, test_u_fail)
+{
+ {"4294967296", 10},
+ {"9223372036854775807", 10},
+ {"9223372036854775808", 10},
+ {"18446744073709551615", 10},
+};
+
+/* Check every test_u_fail record with an unsigned int destination. */
+static void __init test_parse_integer_u_fail(void)
+{
+ TEST_FAIL(unsigned int, "%u", test_u_fail);
+}
+
+/*
+ * Inputs that must parse as int.
+ * Record layout: {input, base, expected return value, expected value}.
+ */
+DEFINE_TEST_OK(int, struct test_i, test_i_ok)
+{
+ {"-2147483648", 10, 11, INT_MIN},
+ {"-65536", 10, 6, -65536},
+ {"-32768", 10, 6, -32768},
+ {"-256", 10, 4, -256},
+ {"-128", 10, 4, -128},
+ {"-0", 10, 2, 0},
+ {"0", 10, 1, 0},
+ {"127", 10, 3, 127},
+ {"255", 10, 3, 255},
+ {"32767", 10, 5, 32767},
+ {"65535", 10, 5, 65535},
+ {"2147483647", 10, 10, 2147483647},
+};
+
+/* Check every test_i_ok record with an int destination. */
+static void __init test_parse_integer_i_ok(void)
+{
+ TEST_OK(int, "%d", test_i_ok);
+}
+
+/*
+ * Inputs that must be rejected when parsed as int.
+ * Record layout: {input, base}.
+ */
+DEFINE_TEST_FAIL(int, test_i_fail)
+{
+ {"-9223372036854775809", 10},
+ {"-9223372036854775808", 10},
+ {"-4294967296", 10},
+ {"-2147483649", 10},
+ {"2147483648", 10},
+ {"4294967295", 10},
+ {"9223372036854775807", 10},
+ {"9223372036854775808", 10},
+};
+
+/* Check every test_i_fail record with an int destination. */
+static void __init test_parse_integer_i_fail(void)
+{
+ TEST_FAIL(int, "%d", test_i_fail);
+}
+
+/*
+ * Inputs that must parse as unsigned short.
+ * Record layout: {input, base, expected return value, expected value}.
+ */
+DEFINE_TEST_OK(unsigned short, struct test_us, test_us_ok)
+{
+ {"0", 10, 1, 0},
+ {"127", 10, 3, 127},
+ {"128", 10, 3, 128},
+ {"255", 10, 3, 255},
+ {"256", 10, 3, 256},
+ {"32767", 10, 5, 32767},
+ {"32768", 10, 5, 32768},
+ {"65535", 10, 5, 65535},
+};
+
+/* Check every test_us_ok record with an unsigned short destination. */
+static void __init test_parse_integer_us_ok(void)
+{
+ TEST_OK(unsigned short, "%hu", test_us_ok);
+}
+
+/*
+ * Inputs that must be rejected when parsed as unsigned short.
+ * Record layout: {input, base}.
+ */
+DEFINE_TEST_FAIL(unsigned short, test_us_fail)
+{
+ {"65536", 10},
+ {"2147483647", 10},
+ {"2147483648", 10},
+ {"4294967295", 10},
+ {"4294967296", 10},
+ {"9223372036854775807", 10},
+ {"9223372036854775808", 10},
+ {"18446744073709551615", 10},
+};
+
+/* Check every test_us_fail record with an unsigned short destination. */
+static void __init test_parse_integer_us_fail(void)
+{
+ TEST_FAIL(unsigned short, "%hu", test_us_fail);
+}
+
+/*
+ * Inputs that must parse as short.
+ * Record layout: {input, base, expected return value, expected value}.
+ */
+DEFINE_TEST_OK(short, struct test_s, test_s_ok)
+{
+ {"-32768", 10, 6, -32768},
+ {"-256", 10, 4, -256},
+ {"-128", 10, 4, -128},
+ {"-0", 10, 2, 0},
+ {"0", 10, 1, 0},
+ {"127", 10, 3, 127},
+ {"255", 10, 3, 255},
+ {"32767", 10, 5, 32767},
+};
+
+/* Check every test_s_ok record with a short destination. */
+static void __init test_parse_integer_s_ok(void)
+{
+ TEST_OK(short, "%hd", test_s_ok);
+}
+
+/*
+ * Inputs that must be rejected when parsed as short.
+ * Record layout: {input, base}.
+ */
+DEFINE_TEST_FAIL(short, test_s_fail)
+{
+ {"-9223372036854775809", 10},
+ {"-9223372036854775808", 10},
+ {"-4294967296", 10},
+ {"-2147483649", 10},
+ {"-2147483648", 10},
+ {"-65536", 10},
+ {"-32769", 10},
+ {"32768", 10},
+ {"65535", 10},
+ {"2147483647", 10},
+ {"2147483648", 10},
+ {"4294967295", 10},
+ {"9223372036854775807", 10},
+ {"9223372036854775808", 10},
+};
+
+/* Check every test_s_fail record with a short destination. */
+static void __init test_parse_integer_s_fail(void)
+{
+ TEST_FAIL(short, "%hd", test_s_fail);
+}
+
+/*
+ * Inputs that must parse as unsigned char.
+ * Record layout: {input, base, expected return value, expected value}.
+ */
+DEFINE_TEST_OK(unsigned char, struct test_uc, test_uc_ok)
+{
+ {"0", 10, 1, 0},
+ {"127", 10, 3, 127},
+ {"128", 10, 3, 128},
+ {"255", 10, 3, 255},
+};
+
+/* Check every test_uc_ok record with an unsigned char destination. */
+static void __init test_parse_integer_uc_ok(void)
+{
+ TEST_OK(unsigned char, "%hhu", test_uc_ok);
+}
+
+/*
+ * Inputs that must be rejected when parsed as unsigned char.
+ * Record layout: {input, base}.
+ */
+DEFINE_TEST_FAIL(unsigned char, test_uc_fail)
+{
+ {"256", 10},
+ {"32767", 10},
+ {"32768", 10},
+ {"65535", 10},
+ {"65536", 10},
+ {"2147483647", 10},
+ {"2147483648", 10},
+ {"4294967295", 10},
+ {"4294967296", 10},
+ {"9223372036854775807", 10},
+ {"9223372036854775808", 10},
+ {"18446744073709551615", 10},
+};
+
+/* Check every test_uc_fail record with an unsigned char destination. */
+static void __init test_parse_integer_uc_fail(void)
+{
+ TEST_FAIL(unsigned char, "%hhu", test_uc_fail);
+}
+
+/*
+ * Inputs that must parse as signed char.
+ * Record layout: {input, base, expected return value, expected value}.
+ */
+DEFINE_TEST_OK(signed char, struct test_sc, test_sc_ok)
+{
+ {"-128", 10, 4, -128},
+ {"-0", 10, 2, 0},
+ {"0", 10, 1, 0},
+ {"127", 10, 3, 127},
+};
+
+/* Check every test_sc_ok record with a signed char destination. */
+static void __init test_parse_integer_sc_ok(void)
+{
+ TEST_OK(signed char, "%hhd", test_sc_ok);
+}
+
+/*
+ * Inputs that must be rejected when parsed as signed char.
+ * Record layout: {input, base}.
+ */
+DEFINE_TEST_FAIL(signed char, test_sc_fail)
+{
+ {"-9223372036854775809", 10},
+ {"-9223372036854775808", 10},
+ {"-4294967296", 10},
+ {"-2147483649", 10},
+ {"-2147483648", 10},
+ {"-65536", 10},
+ {"-32769", 10},
+ {"-32768", 10},
+ {"-256", 10},
+ {"-129", 10},
+ {"128", 10},
+ {"255", 10},
+ {"32767", 10},
+ {"32768", 10},
+ {"65535", 10},
+ {"2147483647", 10},
+ {"2147483648", 10},
+ {"4294967295", 10},
+ {"9223372036854775807", 10},
+ {"9223372036854775808", 10},
+};
+
+/* Check every test_sc_fail record with a signed char destination. */
+static void __init test_parse_integer_sc_fail(void)
+{
+ TEST_FAIL(signed char, "%hhd", test_sc_fail);
+}
+
+/*
+ * Module entry point: run the ok/fail test pair for each destination type,
+ * widest types first. Failures are reported through WARN() by the test
+ * macros; the return value does not depend on the test results.
+ */
+static int __init test_parse_integer_init(void)
+{
+ test_parse_integer_ull_ok();
+ test_parse_integer_ull_fail();
+ test_parse_integer_ll_ok();
+ test_parse_integer_ll_fail();
+ test_parse_integer_u_ok();
+ test_parse_integer_u_fail();
+ test_parse_integer_i_ok();
+ test_parse_integer_i_fail();
+ test_parse_integer_us_ok();
+ test_parse_integer_us_fail();
+ test_parse_integer_s_ok();
+ test_parse_integer_s_fail();
+ test_parse_integer_uc_ok();
+ test_parse_integer_uc_fail();
+ test_parse_integer_sc_ok();
+ test_parse_integer_sc_fail();
+ /*
+ * Always fails the init call. NOTE(review): presumably so the module
+ * never stays loaded once the tests have run -- confirm.
+ */
+ return -EINVAL;
+}
+module_init(test_parse_integer_init);
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 95cd63b43b99..7f0cdd2e609f 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -1361,6 +1361,21 @@ char *clock(char *buf, char *end, struct clk *clk, struct printf_spec spec,
}
}
+/*
+ * Format the comm (command name) of a task for the 'T' pointer extension.
+ * A NULL @tsk selects the current task. The name is copied into a local,
+ * explicitly NUL-terminated buffer before being handed to string().
+ * @fmt is accepted for symmetry with the other helpers and is not used.
+ */
+static noinline_for_stack
+char *comm_name(char *buf, char *end, struct task_struct *tsk,
+		struct printf_spec spec, const char *fmt)
+{
+	/* Caller can pass NULL instead of current. */
+	struct task_struct *task = tsk ? tsk : current;
+	char comm[TASK_COMM_LEN];
+
+	/* Not using get_task_comm() in case I'm in IRQ context. */
+	memcpy(comm, task->comm, TASK_COMM_LEN);
+	comm[TASK_COMM_LEN - 1] = '\0';
+	return string(buf, end, comm, spec);
+}
+
int kptr_restrict __read_mostly;
/*
@@ -1448,6 +1463,7 @@ int kptr_restrict __read_mostly;
* - 'Cn' For a clock, it prints the name (Common Clock Framework) or address
* (legacy clock framework) of the clock
* - 'Cr' For a clock, it prints the current rate of the clock
+ * - 'T' task_struct->comm
*
* Note: The difference between 'S' and 'F' is that on ia64 and ppc64
* function pointers are really function descriptors, which contain a
@@ -1459,7 +1475,7 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
{
int default_width = 2 * sizeof(void *) + (spec.flags & SPECIAL ? 2 : 0);
- if (!ptr && *fmt != 'K') {
+ if (!ptr && *fmt != 'K' && *fmt != 'T') {
/*
* Print (null) with the same width as a pointer so it makes
* tabular output look nice.
@@ -1598,6 +1614,8 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
return dentry_name(buf, end,
((const struct file *)ptr)->f_path.dentry,
spec, fmt);
+ case 'T':
+ return comm_name(buf, end, ptr, spec, fmt);
}
spec.flags |= SMALL;
if (spec.field_width == -1) {
@@ -2471,8 +2489,6 @@ EXPORT_SYMBOL_GPL(bprintf);
int vsscanf(const char *buf, const char *fmt, va_list args)
{
const char *str = buf;
- char *next;
- char digit;
int num = 0;
u8 qualifier;
unsigned int base;
@@ -2484,6 +2500,8 @@ int vsscanf(const char *buf, const char *fmt, va_list args)
bool is_sign;
while (*fmt) {
+ int len;
+
/* skip any white space in format */
/* white space in format matchs any amount of
* white space, including none, in the input.
@@ -2612,81 +2630,88 @@ int vsscanf(const char *buf, const char *fmt, va_list args)
*/
str = skip_spaces(str);
- digit = *str;
- if (is_sign && digit == '-')
- digit = *(str + 1);
-
- if (!digit
- || (base == 16 && !isxdigit(digit))
- || (base == 10 && !isdigit(digit))
- || (base == 8 && (!isdigit(digit) || digit > '7'))
- || (base == 0 && !isdigit(digit)))
- break;
-
if (is_sign)
- val.s = qualifier != 'L' ?
- simple_strtol(str, &next, base) :
- simple_strtoll(str, &next, base);
+ len = parse_integer(str, base, &val.s);
else
- val.u = qualifier != 'L' ?
- simple_strtoul(str, &next, base) :
- simple_strtoull(str, &next, base);
+ len = parse_integer(str, base, &val.u);
+ if (len < 0)
+ break;
- if (field_width > 0 && next - str > field_width) {
+ if (field_width > 0) {
if (base == 0)
_parse_integer_fixup_radix(str, &base);
- while (next - str > field_width) {
+ while (len > field_width) {
if (is_sign)
val.s = div_s64(val.s, base);
else
val.u = div_u64(val.u, base);
- --next;
+ len--;
}
}
switch (qualifier) {
case 'H': /* that's 'hh' in format */
- if (is_sign)
+ if (is_sign) {
+ if (val.s != (signed char)val.s)
+ goto out;
*va_arg(args, signed char *) = val.s;
- else
+ } else {
+ if (val.u != (unsigned char)val.u)
+ goto out;
*va_arg(args, unsigned char *) = val.u;
+ }
break;
case 'h':
- if (is_sign)
+ if (is_sign) {
+ if (val.s != (short)val.s)
+ goto out;
*va_arg(args, short *) = val.s;
- else
+ } else {
+ if (val.u != (unsigned short)val.u)
+ goto out;
*va_arg(args, unsigned short *) = val.u;
+ }
break;
case 'l':
- if (is_sign)
+ if (is_sign) {
+ if (val.s != (long)val.s)
+ goto out;
*va_arg(args, long *) = val.s;
- else
+ } else {
+ if (val.u != (unsigned long)val.u)
+ goto out;
*va_arg(args, unsigned long *) = val.u;
+ }
break;
case 'L':
- if (is_sign)
+ if (is_sign) {
*va_arg(args, long long *) = val.s;
- else
+ } else {
*va_arg(args, unsigned long long *) = val.u;
+ }
break;
case 'Z':
case 'z':
+ if (val.u != (size_t)val.u)
+ goto out;
*va_arg(args, size_t *) = val.u;
break;
default:
- if (is_sign)
+ if (is_sign) {
+ if (val.s != (int)val.s)
+ goto out;
*va_arg(args, int *) = val.s;
- else
+ } else {
+ if (val.u != (unsigned int)val.u)
+ goto out;
*va_arg(args, unsigned int *) = val.u;
+ }
break;
}
num++;
-
- if (!next)
- break;
- str = next;
+ str += len;
}
-
+out:
return num;
}
EXPORT_SYMBOL(vsscanf);
diff --git a/mm/Kconfig b/mm/Kconfig
index d4e6495a720f..7e9ccb438985 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -648,3 +648,15 @@ config DEFERRED_STRUCT_PAGE_INIT
when kswapd starts. This has a potential performance impact on
processes running early in the lifetime of the systemm until kswapd
finishes the initialisation.
+
+config IDLE_PAGE_TRACKING
+ bool "Enable idle page tracking"
+ select PROC_PAGE_MONITOR
+ select PAGE_EXTENSION if !64BIT
+ help
+ This feature allows to estimate the amount of user pages that have
+ not been touched during a given period of time. This information can
+ be useful to tune memory cgroup limits and/or for job placement
+ within a compute cluster.
+
+ See Documentation/vm/pagemap.txt for more details.
diff --git a/mm/Makefile b/mm/Makefile
index 98c4eaeabdcb..b424d5e5b6ff 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -78,3 +78,4 @@ obj-$(CONFIG_CMA) += cma.o
obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
+obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
diff --git a/mm/compaction.c b/mm/compaction.c
index 018f08da99a2..16e1b5793452 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -732,18 +732,21 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* splitting and collapsing (collapsing has already happened
* if PageLRU is set) but the lock is not necessarily taken
* here and it is wasteful to take it just to check transhuge.
- * Check TransHuge without lock and skip the whole pageblock if
- * it's either a transhuge or hugetlbfs page, as calling
- * compound_order() without preventing THP from splitting the
- * page underneath us may return surprising results.
+ * Check PageCompound without lock and skip the whole pageblock
+ * if it's a transhuge page, as calling compound_order()
+ * without preventing THP from splitting the page underneath us
+ * may return surprising results.
+ * If we happen to check a THP tail page, compound_order()
+ * returns 0. It should be rare enough to not bother with
+ * using compound_head() in that case.
*/
- if (PageTransHuge(page)) {
- if (!locked)
- low_pfn = ALIGN(low_pfn + 1,
- pageblock_nr_pages) - 1;
+ if (PageCompound(page)) {
+ int nr;
+ if (locked)
+ nr = 1 << compound_order(page);
else
- low_pfn += (1 << compound_order(page)) - 1;
-
+ nr = pageblock_nr_pages;
+ low_pfn += nr - 1;
continue;
}
@@ -763,11 +766,12 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (!locked)
break;
- /* Recheck PageLRU and PageTransHuge under lock */
+ /* Recheck PageLRU and PageCompound under lock */
if (!PageLRU(page))
continue;
- if (PageTransHuge(page)) {
- low_pfn += (1 << compound_order(page)) - 1;
+ if (PageCompound(page)) {
+ int nr = 1 << compound_order(page);
+ low_pfn += nr - 1;
continue;
}
}
@@ -778,7 +782,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (__isolate_lru_page(page, isolate_mode) != 0)
continue;
- VM_BUG_ON_PAGE(PageTransCompound(page), page);
+ VM_BUG_ON_PAGE(PageCompound(page), page);
/* Successfully isolated */
del_page_from_lru_list(page, lruvec, page_lru(page));
diff --git a/mm/debug.c b/mm/debug.c
index 76089ddf99ea..e784110fb51d 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -48,6 +48,10 @@ static const struct trace_print_flags pageflag_names[] = {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
{1UL << PG_compound_lock, "compound_lock" },
#endif
+#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
+ {1UL << PG_young, "young" },
+ {1UL << PG_idle, "idle" },
+#endif
};
static void dump_flags(unsigned long flags,
@@ -121,6 +125,7 @@ static const struct trace_print_flags vmaflags_names[] = {
{VM_GROWSDOWN, "growsdown" },
{VM_PFNMAP, "pfnmap" },
{VM_DENYWRITE, "denywrite" },
+ {VM_LOCKONFAULT, "lockonfault" },
{VM_LOCKED, "locked" },
{VM_IO, "io" },
{VM_SEQ_READ, "seqread" },
diff --git a/mm/dmapool.c b/mm/dmapool.c
index fd5fe4342e93..4b657099111f 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -242,7 +242,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
return page;
}
-static inline int is_page_busy(struct dma_page *page)
+static inline bool is_page_busy(struct dma_page *page)
{
return page->in_use != 0;
}
@@ -271,6 +271,9 @@ void dma_pool_destroy(struct dma_pool *pool)
{
bool empty = false;
+ if (unlikely(!pool))
+ return;
+
mutex_lock(&pools_reg_lock);
mutex_lock(&pools_lock);
list_del(&pool->pools);
diff --git a/mm/filemap.c b/mm/filemap.c
index 1283fc825458..204fd1c7c813 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -641,11 +641,11 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
void *shadow = NULL;
int ret;
- __set_page_locked(page);
+ __SetPageLocked(page);
ret = __add_to_page_cache_locked(page, mapping, offset,
gfp_mask, &shadow);
if (unlikely(ret))
- __clear_page_locked(page);
+ __ClearPageLocked(page);
else {
/*
* The page might have been evicted from cache only
@@ -768,6 +768,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue);
*/
void unlock_page(struct page *page)
{
+ page = compound_head(page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
clear_bit_unlock(PG_locked, &page->flags);
smp_mb__after_atomic();
@@ -832,18 +833,20 @@ EXPORT_SYMBOL_GPL(page_endio);
*/
void __lock_page(struct page *page)
{
- DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+ struct page *page_head = compound_head(page);
+ DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
- __wait_on_bit_lock(page_waitqueue(page), &wait, bit_wait_io,
+ __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io,
TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_page);
int __lock_page_killable(struct page *page)
{
- DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+ struct page *page_head = compound_head(page);
+ DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
- return __wait_on_bit_lock(page_waitqueue(page), &wait,
+ return __wait_on_bit_lock(page_waitqueue(page_head), &wait,
bit_wait_io, TASK_KILLABLE);
}
EXPORT_SYMBOL_GPL(__lock_page_killable);
@@ -2473,21 +2476,6 @@ ssize_t generic_perform_write(struct file *file,
iov_iter_count(i));
again:
- /*
- * Bring in the user page that we will copy from _first_.
- * Otherwise there's a nasty deadlock on copying from the
- * same page as we're writing to, without it being marked
- * up-to-date.
- *
- * Not only is this an optimisation, but it is also required
- * to check that the address is actually valid, when atomic
- * usercopies are used, below.
- */
- if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
- status = -EFAULT;
- break;
- }
-
status = a_ops->write_begin(file, mapping, pos, bytes, flags,
&page, &fsdata);
if (unlikely(status < 0))
@@ -2495,8 +2483,17 @@ again:
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
-
+ /*
+ * 'page' is now locked. If we are trying to copy from a
+ * mapping of 'page' in userspace, the copy might fault and
+ * would need PageUptodate() to complete. But, page can not be
+ * made Uptodate without acquiring the page lock, which we hold.
+ * Deadlock. Avoid with pagefault_disable(). Fix up below with
+ * iov_iter_fault_in_readable().
+ */
+ pagefault_disable();
copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+ pagefault_enable();
flush_dcache_page(page);
status = a_ops->write_end(file, mapping, pos, bytes, copied,
@@ -2519,6 +2516,14 @@ again:
*/
bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
iov_iter_single_seg_count(i));
+ /*
+ * This is the fallback to recover if the copy from
+ * userspace above faults.
+ */
+ if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+ status = -EFAULT;
+ break;
+ }
goto again;
}
pos += copied;
diff --git a/mm/gup.c b/mm/gup.c
index 6297f6bccfb1..63b72f939981 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -12,7 +12,9 @@
#include <linux/sched.h>
#include <linux/rwsem.h>
#include <linux/hugetlb.h>
+
#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
#include "internal.h"
@@ -32,6 +34,30 @@ static struct page *no_page_table(struct vm_area_struct *vma,
return NULL;
}
+static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
+ pte_t *pte, unsigned int flags)
+{
+ /* There is no struct page, so a FOLL_GET reference cannot be taken */
+ if (flags & FOLL_GET)
+ return -EFAULT;
+
+ if (flags & FOLL_TOUCH) { /* mark the pte young, and dirty for writes */
+ pte_t entry = *pte;
+
+ if (flags & FOLL_WRITE)
+ entry = pte_mkdirty(entry);
+ entry = pte_mkyoung(entry);
+
+ if (!pte_same(*pte, entry)) { /* rewrite only if a bit changed */
+ set_pte_at(vma->vm_mm, address, pte, entry);
+ update_mmu_cache(vma, address, pte);
+ }
+ }
+
+ /* Proper page table entry exists, but no corresponding struct page */
+ return -EEXIST; /* always returned unless FOLL_GET was requested */
+}
+
static struct page *follow_page_pte(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd, unsigned int flags)
{
@@ -73,10 +99,21 @@ retry:
page = vm_normal_page(vma, address, pte);
if (unlikely(!page)) {
- if ((flags & FOLL_DUMP) ||
- !is_zero_pfn(pte_pfn(pte)))
- goto bad_page;
- page = pte_page(pte);
+ if (flags & FOLL_DUMP) {
+ /* Avoid special (like zero) pages in core dumps */
+ page = ERR_PTR(-EFAULT);
+ goto out;
+ }
+
+ if (is_zero_pfn(pte_pfn(pte))) {
+ page = pte_page(pte);
+ } else {
+ int ret;
+
+ ret = follow_pfn_pte(vma, address, ptep, flags);
+ page = ERR_PTR(ret);
+ goto out;
+ }
}
if (flags & FOLL_GET)
@@ -92,7 +129,8 @@ retry:
*/
mark_page_accessed(page);
}
- if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
+ if ((flags & FOLL_POPULATE) &&
+ (vma->vm_flags & (VM_LOCKED | VM_LOCKONFAULT))) {
/*
* The preliminary mapping check is mainly to avoid the
* pointless overhead of lock_page on the ZERO_PAGE
@@ -114,12 +152,9 @@ retry:
unlock_page(page);
}
}
+out:
pte_unmap_unlock(ptep, ptl);
return page;
-bad_page:
- pte_unmap_unlock(ptep, ptl);
- return ERR_PTR(-EFAULT);
-
no_page:
pte_unmap_unlock(ptep, ptl);
if (!pte_none(pte))
@@ -489,9 +524,15 @@ retry:
goto next_page;
}
BUG();
- }
- if (IS_ERR(page))
+ } else if (PTR_ERR(page) == -EEXIST) {
+ /*
+ * Proper page table entry exists, but no corresponding
+ * struct page.
+ */
+ goto next_page;
+ } else if (IS_ERR(page)) {
return i ? i : PTR_ERR(page);
+ }
if (pages) {
pages[i] = page;
flush_anon_page(vma, page, start);
@@ -818,6 +859,30 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
}
EXPORT_SYMBOL(get_user_pages);
+/*
+ * Map vm_flags to FOLL_* flags for populate_vma_page_range()/pin_user_pages().
+ */
+static int get_gup_flags(vm_flags_t vm_flags)
+{
+ int gup_flags = FOLL_TOUCH | FOLL_POPULATE; /* always-on baseline */
+ /*
+ * We want to touch writable mappings with a write fault in order
+ * to break COW, except for shared mappings because these don't COW
+ * and we would not want to dirty them for nothing.
+ */
+ if ((vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
+ gup_flags |= FOLL_WRITE; /* writable and not shared */
+
+ /*
+ * We want mlock to succeed for regions that have any permissions
+ * other than PROT_NONE.
+ */
+ if (vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
+ gup_flags |= FOLL_FORCE; /* at least one access bit is set */
+
+ return gup_flags;
+}
+
/**
* populate_vma_page_range() - populate a range of pages in the vma.
* @vma: target vma
@@ -850,21 +915,7 @@ long populate_vma_page_range(struct vm_area_struct *vma,
VM_BUG_ON_VMA(end > vma->vm_end, vma);
VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
- gup_flags = FOLL_TOUCH | FOLL_POPULATE;
- /*
- * We want to touch writable mappings with a write fault in order
- * to break COW, except for shared mappings because these don't COW
- * and we would not want to dirty them for nothing.
- */
- if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
- gup_flags |= FOLL_WRITE;
-
- /*
- * We want mlock to succeed for regions that have any permissions
- * other than PROT_NONE.
- */
- if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
- gup_flags |= FOLL_FORCE;
+ gup_flags = get_gup_flags(vma->vm_flags);
/*
* We made sure addr is within a VMA, so the following will
@@ -874,6 +925,139 @@ long populate_vma_page_range(struct vm_area_struct *vma,
NULL, NULL, nonblocking);
}
+static long pin_user_pages(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, int *nonblocking)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long nr_pages = (end - start) / PAGE_SIZE;
+ int gup_flags;
+ long i = 0; /* number of pages walked so far; also the return value */
+ unsigned int page_mask;
+
+ VM_BUG_ON(start & ~PAGE_MASK);
+ VM_BUG_ON(end & ~PAGE_MASK);
+ VM_BUG_ON_VMA(start < vma->vm_start, vma);
+ VM_BUG_ON_VMA(end > vma->vm_end, vma);
+ VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
+
+ if (!nr_pages)
+ return 0;
+
+ gup_flags = get_gup_flags(vma->vm_flags);
+
+ /*
+ * If FOLL_FORCE is set then do not force a full fault as the hinting
+ * fault information is unrelated to the reference behaviour of a task
+ * using the address space
+ */
+ if (!(gup_flags & FOLL_FORCE))
+ gup_flags |= FOLL_NUMA;
+
+ vma = NULL; /* makes the first loop iteration look the VMA up again */
+
+ do {
+ struct page *page;
+ unsigned int foll_flags = gup_flags;
+ unsigned int page_increm;
+
+ /* first iteration, or start has moved past the end of this vma */
+ if (!vma || start >= vma->vm_end) {
+ vma = find_extend_vma(mm, start);
+ if (!vma && in_gate_area(mm, start)) {
+ int ret;
+
+ ret = get_gate_page(mm, start & PAGE_MASK,
+ gup_flags, &vma, NULL);
+ if (ret)
+ return i ? : ret;
+ page_mask = 0; /* step over exactly one page */
+ goto next_page;
+ }
+
+ if (!vma)
+ return i ? : -EFAULT;
+ if (is_vm_hugetlb_page(vma)) {
+ i = follow_hugetlb_page(mm, vma, NULL, NULL,
+ &start, &nr_pages, i,
+ gup_flags);
+ continue; /* start/nr_pages were passed by address above */
+ }
+ }
+
+ /*
+ * If we have a pending fatal signal, stop walking the range
+ */
+ if (unlikely(fatal_signal_pending(current)))
+ return i ? i : -ERESTARTSYS;
+ cond_resched();
+ page = follow_page_mask(vma, start, foll_flags, &page_mask);
+ if (!page)
+ goto next_page; /* nothing returned here: count it and move on */
+ if (IS_ERR(page))
+ return i ? i : PTR_ERR(page);
+next_page:
+ page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
+ if (page_increm > nr_pages)
+ page_increm = nr_pages; /* never step past the requested range */
+ i += page_increm;
+ start += page_increm * PAGE_SIZE;
+ nr_pages -= page_increm;
+ } while (nr_pages);
+ return i;
+}
+
+/*
+ * mm_lock_present - lock present pages within a range of address space.
+ *
+ * This is used to implement mlock2(MLOCK_LOCKONFAULT). VMAs must be already
+ * marked with the desired vm_flags, and mmap_sem must not be held.
+ */
+int mm_lock_present(unsigned long start, unsigned long len)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long end, nstart, nend;
+ struct vm_area_struct *vma = NULL;
+ int locked = 0; /* non-zero once mmap_sem has been taken for reading */
+ long ret = 0;
+
+ VM_BUG_ON(start & ~PAGE_MASK);
+ VM_BUG_ON(len != PAGE_ALIGN(len));
+ end = start + len;
+
+ for (nstart = start; nstart < end; nstart = nend) {
+ /*
+ * We want to lock present pages in [nstart; end) address range.
+ * Find first corresponding VMA.
+ */
+ if (!locked) {
+ locked = 1;
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, nstart);
+ } else if (nstart >= vma->vm_end)
+ vma = vma->vm_next;
+ if (!vma || vma->vm_start >= end)
+ break;
+ /*
+ * Set [nstart; nend) to intersection of desired address
+ * range with the first VMA. Also, skip undesirable VMA types.
+ */
+ nend = min(end, vma->vm_end);
+ if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+ continue; /* the loop step then advances nstart to nend */
+ if (nstart < vma->vm_start)
+ nstart = vma->vm_start;
+
+ ret = pin_user_pages(vma, nstart, nend, &locked);
+ if (ret < 0)
+ break;
+ nend = nstart + ret * PAGE_SIZE; /* resume after the pages handled */
+ ret = 0;
+ }
+ if (locked)
+ up_read(&mm->mmap_sem);
+ return ret; /* 0 or negative error code */
+}
+
/*
* __mm_populate - populate and/or mlock pages within a range of address space.
*
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c107094f79ba..1221fbdc9b14 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -16,6 +16,7 @@
#include <linux/swap.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
+#include <linux/dax.h>
#include <linux/kthread.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
@@ -23,6 +24,7 @@
#include <linux/pagemap.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>
+#include <linux/userfaultfd_k.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -104,7 +106,7 @@ static struct khugepaged_scan khugepaged_scan = {
};
-static int set_recommended_min_free_kbytes(void)
+static void set_recommended_min_free_kbytes(void)
{
struct zone *zone;
int nr_zones = 0;
@@ -139,7 +141,6 @@ static int set_recommended_min_free_kbytes(void)
min_free_kbytes = recommended_min;
}
setup_per_zone_wmarks();
- return 0;
}
static int start_stop_khugepaged(void)
@@ -171,12 +172,7 @@ fail:
static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
-static inline bool is_huge_zero_pmd(pmd_t pmd)
-{
- return is_huge_zero_page(pmd_page(pmd));
-}
-
-static struct page *get_huge_zero_page(void)
+struct page *get_huge_zero_page(void)
{
struct page *zero_page;
retry:
@@ -716,21 +712,27 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
struct vm_area_struct *vma,
- unsigned long haddr, pmd_t *pmd,
- struct page *page, gfp_t gfp)
+ unsigned long address, pmd_t *pmd,
+ struct page *page, gfp_t gfp,
+ unsigned int flags)
{
struct mem_cgroup *memcg;
pgtable_t pgtable;
spinlock_t *ptl;
+ unsigned long haddr = address & HPAGE_PMD_MASK;
VM_BUG_ON_PAGE(!PageCompound(page), page);
- if (mem_cgroup_try_charge(page, mm, gfp, &memcg))
- return VM_FAULT_OOM;
+ if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) {
+ put_page(page);
+ count_vm_event(THP_FAULT_FALLBACK);
+ return VM_FAULT_FALLBACK;
+ }
pgtable = pte_alloc_one(mm, haddr);
if (unlikely(!pgtable)) {
mem_cgroup_cancel_charge(page, memcg);
+ put_page(page);
return VM_FAULT_OOM;
}
@@ -750,6 +752,21 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
pte_free(mm, pgtable);
} else {
pmd_t entry;
+
+ /* Deliver the page fault to userland */
+ if (userfaultfd_missing(vma)) {
+ int ret;
+
+ spin_unlock(ptl);
+ mem_cgroup_cancel_charge(page, memcg);
+ put_page(page);
+ pte_free(mm, pgtable);
+ ret = handle_userfault(vma, address, flags,
+ VM_UFFD_MISSING);
+ VM_BUG_ON(ret & VM_FAULT_FALLBACK);
+ return ret;
+ }
+
entry = mk_huge_pmd(page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
page_add_new_anon_rmap(page, vma, haddr);
@@ -760,6 +777,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
atomic_long_inc(&mm->nr_ptes);
spin_unlock(ptl);
+ count_vm_event(THP_FAULT_ALLOC);
}
return 0;
@@ -771,7 +789,7 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
}
/* Caller must hold page table lock. */
-static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
+bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
struct page *zero_page)
{
@@ -806,6 +824,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
pgtable_t pgtable;
struct page *zero_page;
bool set;
+ int ret;
pgtable = pte_alloc_one(mm, haddr);
if (unlikely(!pgtable))
return VM_FAULT_OOM;
@@ -816,14 +835,28 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_FALLBACK;
}
ptl = pmd_lock(mm, pmd);
- set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
- zero_page);
- spin_unlock(ptl);
+ ret = 0;
+ set = false;
+ if (pmd_none(*pmd)) {
+ if (userfaultfd_missing(vma)) {
+ spin_unlock(ptl);
+ ret = handle_userfault(vma, address, flags,
+ VM_UFFD_MISSING);
+ VM_BUG_ON(ret & VM_FAULT_FALLBACK);
+ } else {
+ set_huge_zero_page(pgtable, mm, vma,
+ haddr, pmd,
+ zero_page);
+ spin_unlock(ptl);
+ set = true;
+ }
+ } else
+ spin_unlock(ptl);
if (!set) {
pte_free(mm, pgtable);
put_huge_zero_page();
}
- return 0;
+ return ret;
}
gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
@@ -831,14 +864,51 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
- if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) {
- put_page(page);
- count_vm_event(THP_FAULT_FALLBACK);
- return VM_FAULT_FALLBACK;
+ return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp,
+ flags);
+}
+
+static int insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, unsigned long pfn, pgprot_t prot, bool write)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pmd_t entry;
+ spinlock_t *ptl;
+
+ ptl = pmd_lock(mm, pmd);
+ if (pmd_none(*pmd)) { /* an already-populated pmd is left untouched */
+ entry = pmd_mkhuge(pfn_pmd(pfn, prot));
+ if (write) { /* write access: pre-mark young and dirty */
+ entry = pmd_mkyoung(pmd_mkdirty(entry));
+ entry = maybe_pmd_mkwrite(entry, vma);
+ }
+ set_pmd_at(mm, addr, pmd, entry);
+ update_mmu_cache_pmd(vma, addr, pmd);
}
+ spin_unlock(ptl);
+ return VM_FAULT_NOPAGE; /* returned whether or not an entry was installed */
+}
- count_vm_event(THP_FAULT_ALLOC);
- return 0;
+int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, unsigned long pfn, bool write)
+{
+ pgprot_t pgprot = vma->vm_page_prot; /* passed by address to track_pfn_insert() */
+ /*
+ * If we had pmd_special, we could avoid all these restrictions,
+ * but we need to be consistent with PTEs and architectures that
+ * can't support a 'special' bit.
+ */
+ BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); /* at least one set */
+ BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
+ (VM_PFNMAP|VM_MIXEDMAP)); /* ...but never both */
+ BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
+ BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
+
+ if (addr < vma->vm_start || addr >= vma->vm_end) /* addr outside the vma */
+ return VM_FAULT_SIGBUS;
+ if (track_pfn_insert(vma, &pgprot, pfn))
+ return VM_FAULT_SIGBUS;
+ return insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write);
}
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -873,16 +943,14 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
*/
if (is_huge_zero_pmd(pmd)) {
struct page *zero_page;
- bool set;
/*
* get_huge_zero_page() will never allocate a new page here,
* since we already have a zero page to copy. It just takes a
* reference.
*/
zero_page = get_huge_zero_page();
- set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
+ set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
zero_page);
- BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */
ret = 0;
goto out_unlock;
}
@@ -1238,7 +1306,8 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
pmd, _pmd, 1))
update_mmu_cache_pmd(vma, addr, pmd);
}
- if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
+ if ((flags & FOLL_POPULATE) &&
+ (vma->vm_flags & (VM_LOCKED | VM_LOCKONFAULT))) {
if (page->mapping && trylock_page(page)) {
lru_add_drain();
if (page->mapping)
@@ -1384,6 +1453,36 @@ out:
return 0;
}
+int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ pmd_t *pmd, unsigned long addr)
+
+{
+ spinlock_t *ptl;
+ struct mm_struct *mm = tlb->mm;
+ int ret = 1; /* stays 1 unless the huge pmd is locked and handled below */
+
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ struct page *page;
+ pmd_t orig_pmd;
+
+ orig_pmd = pmdp_huge_get_and_clear(mm, addr, pmd);
+
+ /* A huge page is not expected to be in the swap cache */
+ page = pmd_page(orig_pmd);
+ VM_BUG_ON_PAGE(PageSwapCache(page), page);
+
+ orig_pmd = pmd_mkold(orig_pmd); /* drop the young bit ... */
+ orig_pmd = pmd_mkclean(orig_pmd); /* ... and the dirty bit */
+
+ set_pmd_at(mm, addr, pmd, orig_pmd); /* reinstall the old, clean entry */
+ tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+ spin_unlock(ptl);
+ ret = 0;
+ }
+
+ return ret;
+}
+
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr)
{
@@ -1391,7 +1490,6 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
int ret = 0;
if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
- struct page *page;
pgtable_t pgtable;
pmd_t orig_pmd;
/*
@@ -1403,13 +1501,22 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
tlb->fullmm);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
- pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
+ if (vma_is_dax(vma)) {
+ if (is_huge_zero_pmd(orig_pmd)) {
+ pgtable = NULL;
+ } else {
+ spin_unlock(ptl);
+ return 1;
+ }
+ } else {
+ pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
+ }
if (is_huge_zero_pmd(orig_pmd)) {
atomic_long_dec(&tlb->mm->nr_ptes);
spin_unlock(ptl);
put_huge_zero_page();
} else {
- page = pmd_page(orig_pmd);
+ struct page *page = pmd_page(orig_pmd);
page_remove_rmap(page);
VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
@@ -1418,7 +1525,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
spin_unlock(ptl);
tlb_remove_page(tlb, page);
}
- pte_free(tlb->mm, pgtable);
+ if (pgtable)
+ pte_free(tlb->mm, pgtable);
ret = 1;
}
return ret;
@@ -1599,6 +1707,11 @@ unlock:
return NULL;
}
+int pmd_freeable(pmd_t pmd)
+{
+ return !pmd_dirty(pmd); /* non-zero exactly when the pmd is not dirty */
+}
+
static int __split_huge_page_splitting(struct page *page,
struct vm_area_struct *vma,
unsigned long address)
@@ -1676,12 +1789,7 @@ static void __split_huge_page_refcount(struct page *page,
/* after clearing PageTail the gup refcount can be released */
smp_mb__after_atomic();
- /*
- * retain hwpoison flag of the poisoned tail page:
- * fix for the unsuitable process killed on Guest Machine(KVM)
- * by the memory-failure.
- */
- page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
+ page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
page_tail->flags |= (page->flags &
((1L << PG_referenced) |
(1L << PG_swapbacked) |
@@ -1694,6 +1802,11 @@ static void __split_huge_page_refcount(struct page *page,
/* clear PageTail before overwriting first_page */
smp_wmb();
+ if (page_is_young(page))
+ set_page_young(page_tail);
+ if (page_is_idle(page))
+ set_page_idle(page_tail);
+
/*
* __split_huge_page_splitting() already set the
* splitting bit in all pmd that could map this
@@ -1710,7 +1823,7 @@ static void __split_huge_page_refcount(struct page *page,
*/
page_tail->_mapcount = page->_mapcount;
- BUG_ON(page_tail->mapping);
+ BUG_ON(page_tail->mapping != TAIL_MAPPING);
page_tail->mapping = page->mapping;
page_tail->index = page->index + i;
@@ -2138,7 +2251,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
_pte++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
- if (++none_or_zero <= khugepaged_max_ptes_none)
+ if (!userfaultfd_armed(vma) &&
+ ++none_or_zero <= khugepaged_max_ptes_none)
continue;
else
goto out;
@@ -2198,7 +2312,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
VM_BUG_ON_PAGE(PageLRU(page), page);
/* If there is no mapped pte young don't collapse the page */
- if (pte_young(pteval) || PageReferenced(page) ||
+ if (pte_young(pteval) ||
+ page_is_young(page) || PageReferenced(page) ||
mmu_notifier_test_young(vma->vm_mm, address))
referenced = true;
}
@@ -2591,7 +2706,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
_pte++, _address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
- if (++none_or_zero <= khugepaged_max_ptes_none)
+ if (!userfaultfd_armed(vma) &&
+ ++none_or_zero <= khugepaged_max_ptes_none)
continue;
else
goto out_unmap;
@@ -2624,7 +2740,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
*/
if (page_count(page) != 1 + !!PageSwapCache(page))
goto out_unmap;
- if (pte_young(pteval) || PageReferenced(page) ||
+ if (pte_young(pteval) ||
+ page_is_young(page) || PageReferenced(page) ||
mmu_notifier_test_young(vma->vm_mm, address))
referenced = true;
}
@@ -2887,7 +3004,7 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmd)
{
spinlock_t *ptl;
- struct page *page;
+ struct page *page = NULL;
struct mm_struct *mm = vma->vm_mm;
unsigned long haddr = address & HPAGE_PMD_MASK;
unsigned long mmun_start; /* For mmu_notifiers */
@@ -2900,25 +3017,25 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
again:
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
ptl = pmd_lock(mm, pmd);
- if (unlikely(!pmd_trans_huge(*pmd))) {
- spin_unlock(ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
- return;
- }
- if (is_huge_zero_pmd(*pmd)) {
+ if (unlikely(!pmd_trans_huge(*pmd)))
+ goto unlock;
+ if (vma_is_dax(vma)) {
+ pmdp_huge_clear_flush(vma, haddr, pmd);
+ } else if (is_huge_zero_pmd(*pmd)) {
__split_huge_zero_page_pmd(vma, haddr, pmd);
- spin_unlock(ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
- return;
+ } else {
+ page = pmd_page(*pmd);
+ VM_BUG_ON_PAGE(!page_count(page), page);
+ get_page(page);
}
- page = pmd_page(*pmd);
- VM_BUG_ON_PAGE(!page_count(page), page);
- get_page(page);
+ unlock:
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
- split_huge_page(page);
+ if (!page)
+ return;
+ split_huge_page(page);
put_page(page);
/*
@@ -2967,7 +3084,7 @@ static void split_huge_page_address(struct mm_struct *mm,
split_huge_page_pmd_mm(mm, address, pmd);
}
-void __vma_adjust_trans_huge(struct vm_area_struct *vma,
+void vma_adjust_trans_huge(struct vm_area_struct *vma,
unsigned long start,
unsigned long end,
long adjust_next)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a8c3087089d8..098173f67aa9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -64,7 +64,7 @@ DEFINE_SPINLOCK(hugetlb_lock);
* prevent spurious OOMs when the hugepage pool is fully utilized.
*/
static int num_fault_mutexes;
-static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;
+struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);
@@ -240,11 +240,14 @@ struct file_region {
/*
* Add the huge page range represented by [f, t) to the reserve
- * map. Existing regions will be expanded to accommodate the
- * specified range. We know only existing regions need to be
- * expanded, because region_add is only called after region_chg
- * with the same range. If a new file_region structure must
- * be allocated, it is done in region_chg.
+ * map. In the normal case, existing regions will be expanded
+ * to accommodate the specified range. Sufficient regions should
+ * exist for expansion due to the previous call to region_chg
+ * with the same range. However, it is possible that region_del
+ * could have been called after region_chg and modified the map
+ * in such a way that no region exists to be expanded. In this
+ * case, pull a region descriptor from the cache associated with
+ * the map and use that for the new range.
*
* Return the number of new huge pages added to the map. This
* number is greater than or equal to zero.
@@ -261,6 +264,28 @@ static long region_add(struct resv_map *resv, long f, long t)
if (f <= rg->to)
break;
+ /*
+ * If no region exists which can be expanded to include the
+ * specified range, the list must have been modified by an
+ * interleaving call to region_del(). Pull a region descriptor
+ * from the cache and use it for this range.
+ */
+ if (&rg->link == head || t < rg->from) {
+ VM_BUG_ON(resv->region_cache_count <= 0);
+
+ resv->region_cache_count--;
+ nrg = list_first_entry(&resv->region_cache, struct file_region,
+ link);
+ list_del(&nrg->link);
+
+ nrg->from = f;
+ nrg->to = t;
+ list_add(&nrg->link, rg->link.prev);
+
+ add += t - f;
+ goto out_locked;
+ }
+
/* Round our left edge to the current segment if it encloses us. */
if (f > rg->from)
f = rg->from;
@@ -294,6 +319,8 @@ static long region_add(struct resv_map *resv, long f, long t)
add += t - nrg->to; /* Added to end of region */
nrg->to = t;
+out_locked:
+ resv->adds_in_progress--;
spin_unlock(&resv->lock);
VM_BUG_ON(add < 0);
return add;
@@ -312,11 +339,14 @@ static long region_add(struct resv_map *resv, long f, long t)
* so that the subsequent region_add call will have all the
* regions it needs and will not fail.
*
- * Returns the number of huge pages that need to be added
- * to the existing reservation map for the range [f, t).
- * This number is greater or equal to zero. -ENOMEM is
- * returned if a new file_region structure is needed and can
- * not be allocated.
+ * Upon entry, region_chg will also examine the cache of region descriptors
+ * associated with the map. If there are not enough descriptors cached, one
+ * will be allocated for the in progress add operation.
+ *
+ * Returns the number of huge pages that need to be added to the existing
+ * reservation map for the range [f, t). This number is greater or equal to
+ * zero. -ENOMEM is returned if a new file_region structure or cache entry
+ * is needed and can not be allocated.
*/
static long region_chg(struct resv_map *resv, long f, long t)
{
@@ -326,6 +356,31 @@ static long region_chg(struct resv_map *resv, long f, long t)
retry:
spin_lock(&resv->lock);
+retry_locked:
+ resv->adds_in_progress++;
+
+ /*
+ * Check for sufficient descriptors in the cache to accommodate
+ * the number of in progress add operations.
+ */
+ if (resv->adds_in_progress > resv->region_cache_count) {
+ struct file_region *trg;
+
+ VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1);
+ /* Must drop lock to allocate a new descriptor. */
+ resv->adds_in_progress--;
+ spin_unlock(&resv->lock);
+
+ trg = kmalloc(sizeof(*trg), GFP_KERNEL);
+ if (!trg)
+ return -ENOMEM;
+
+ spin_lock(&resv->lock);
+ list_add(&trg->link, &resv->region_cache);
+ resv->region_cache_count++;
+ goto retry_locked;
+ }
+
/* Locate the region we are before or in. */
list_for_each_entry(rg, head, link)
if (f <= rg->to)
@@ -336,6 +391,7 @@ retry:
* size such that we can guarantee to record the reservation. */
if (&rg->link == head || t < rg->from) {
if (!nrg) {
+ resv->adds_in_progress--;
spin_unlock(&resv->lock);
nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
if (!nrg)
@@ -385,43 +441,131 @@ out_nrg:
}
/*
- * Truncate the reserve map at index 'end'. Modify/truncate any
- * region which contains end. Delete any regions past end.
- * Return the number of huge pages removed from the map.
+ * Abort the in progress add operation. The adds_in_progress field
+ * of the resv_map keeps track of the operations in progress between
+ * calls to region_chg and region_add. Operations are sometimes
+ * aborted after the call to region_chg. In such cases, region_abort
+ * is called to decrement the adds_in_progress counter.
+ *
+ * NOTE: The range arguments [f, t) are not needed or used in this
+ * routine. They are kept to make reading the calling code easier as
+ * arguments will match the associated region_chg call.
+ */
+static void region_abort(struct resv_map *resv, long f, long t)
+{
+ spin_lock(&resv->lock);
+ VM_BUG_ON(!resv->region_cache_count);
+ resv->adds_in_progress--;
+ spin_unlock(&resv->lock);
+}
+
+/*
+ * Delete the specified range [f, t) from the reserve map. If the
+ * t parameter is LONG_MAX, this indicates that ALL regions after f
+ * should be deleted. Locate the regions which intersect [f, t)
+ * and either trim, delete or split the existing regions.
+ *
+ * Returns the number of huge pages deleted from the reserve map.
+ * In the normal case, the return value is zero or more. In the
+ * case where a region must be split, a new region descriptor must
+ * be allocated. If the allocation fails, -ENOMEM will be returned.
+ * NOTE: If the parameter t == LONG_MAX, then we will never split
+ * a region and possibly return -ENOMEM. Callers specifying
+ * t == LONG_MAX do not need to check for -ENOMEM error.
*/
-static long region_truncate(struct resv_map *resv, long end)
+static long region_del(struct resv_map *resv, long f, long t)
{
struct list_head *head = &resv->regions;
struct file_region *rg, *trg;
- long chg = 0;
+ struct file_region *nrg = NULL;
+ long del = 0;
+retry:
spin_lock(&resv->lock);
- /* Locate the region we are either in or before. */
- list_for_each_entry(rg, head, link)
- if (end <= rg->to)
+ list_for_each_entry_safe(rg, trg, head, link) {
+ if (rg->to <= f)
+ continue;
+ if (rg->from >= t)
break;
- if (&rg->link == head)
- goto out;
- /* If we are in the middle of a region then adjust it. */
- if (end > rg->from) {
- chg = rg->to - end;
- rg->to = end;
- rg = list_entry(rg->link.next, typeof(*rg), link);
- }
+ if (f > rg->from && t < rg->to) { /* Must split region */
+ /*
+ * Check for an entry in the cache before dropping
+ * lock and attempting allocation.
+ */
+ if (!nrg &&
+ resv->region_cache_count > resv->adds_in_progress) {
+ nrg = list_first_entry(&resv->region_cache,
+ struct file_region,
+ link);
+ list_del(&nrg->link);
+ resv->region_cache_count--;
+ }
- /* Drop any remaining regions. */
- list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
- if (&rg->link == head)
+ if (!nrg) {
+ spin_unlock(&resv->lock);
+ nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
+ if (!nrg)
+ return -ENOMEM;
+ goto retry;
+ }
+
+ del += t - f;
+
+ /* New entry for end of split region */
+ nrg->from = t;
+ nrg->to = rg->to;
+ INIT_LIST_HEAD(&nrg->link);
+
+ /* Original entry is trimmed */
+ rg->to = f;
+
+ list_add(&nrg->link, &rg->link);
+ nrg = NULL;
break;
- chg += rg->to - rg->from;
- list_del(&rg->link);
- kfree(rg);
+ }
+
+ if (f <= rg->from && t >= rg->to) { /* Remove entire region */
+ del += rg->to - rg->from;
+ list_del(&rg->link);
+ kfree(rg);
+ continue;
+ }
+
+ if (f <= rg->from) { /* Trim beginning of region */
+ del += t - rg->from;
+ rg->from = t;
+ } else { /* Trim end of region */
+ del += rg->to - f;
+ rg->to = f;
+ }
}
-out:
spin_unlock(&resv->lock);
- return chg;
+ kfree(nrg);
+ return del;
+}
+
+/*
+ * A rare out of memory error was encountered which prevented removal of
+ * the reserve map region for a page. The huge page itself was freed
+ * and removed from the page cache. This routine will adjust the subpool
+ * usage count, and the global reserve count if needed. By incrementing
+ * these counts, the reserve map entry which could not be deleted will
+ * appear as a "reserved" entry instead of simply dangling with incorrect
+ * counts.
+ */
+void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve)
+{
+ struct hugepage_subpool *spool = subpool_inode(inode);
+ long rsv_adjust;
+
+ rsv_adjust = hugepage_subpool_get_pages(spool, 1);
+ if (restore_reserve && rsv_adjust) {
+ struct hstate *h = hstate_inode(inode);
+
+ hugetlb_acct_memory(h, 1);
+ }
}
/*
@@ -544,22 +688,44 @@ static void set_vma_private_data(struct vm_area_struct *vma,
struct resv_map *resv_map_alloc(void)
{
struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
- if (!resv_map)
+ struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);
+
+ if (!resv_map || !rg) {
+ kfree(resv_map);
+ kfree(rg);
return NULL;
+ }
kref_init(&resv_map->refs);
spin_lock_init(&resv_map->lock);
INIT_LIST_HEAD(&resv_map->regions);
+ resv_map->adds_in_progress = 0;
+
+ INIT_LIST_HEAD(&resv_map->region_cache);
+ list_add(&rg->link, &resv_map->region_cache);
+ resv_map->region_cache_count = 1;
+
return resv_map;
}
void resv_map_release(struct kref *ref)
{
struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
+ struct list_head *head = &resv_map->region_cache;
+ struct file_region *rg, *trg;
/* Clear out any active regions before we release the map. */
- region_truncate(resv_map, 0);
+ region_del(resv_map, 0, LONG_MAX);
+
+ /* ... and any entries left in the cache */
+ list_for_each_entry_safe(rg, trg, head, link) {
+ list_del(&rg->link);
+ kfree(rg);
+ }
+
+ VM_BUG_ON(resv_map->adds_in_progress);
+
kfree(resv_map);
}
@@ -616,7 +782,7 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
}
/* Returns true if the VMA has associated reserve pages */
-static int vma_has_reserves(struct vm_area_struct *vma, long chg)
+static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
{
if (vma->vm_flags & VM_NORESERVE) {
/*
@@ -629,23 +795,34 @@ static int vma_has_reserves(struct vm_area_struct *vma, long chg)
* properly, so add work-around here.
*/
if (vma->vm_flags & VM_MAYSHARE && chg == 0)
- return 1;
+ return true;
else
- return 0;
+ return false;
}
/* Shared mappings always use reserves */
- if (vma->vm_flags & VM_MAYSHARE)
- return 1;
+ if (vma->vm_flags & VM_MAYSHARE) {
+ /*
+ * We know VM_NORESERVE is not set. Therefore, there SHOULD
+ * be a region map for all pages. The only situation where
+ * there is no region map is if a hole was punched via
+ * fallocate. In this case, there really are no reserves to
+ * use. This situation is indicated if chg != 0.
+ */
+ if (chg)
+ return false;
+ else
+ return true;
+ }
/*
* Only the process that called mmap() has reserves for
* private mappings.
*/
if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
- return 1;
+ return true;
- return 0;
+ return false;
}
static void enqueue_huge_page(struct hstate *h, struct page *page)
@@ -1473,16 +1650,19 @@ static void return_unused_surplus_pages(struct hstate *h,
}
}
+
/*
- * vma_needs_reservation and vma_commit_reservation are used by the huge
- * page allocation routines to manage reservations.
+ * vma_needs_reservation, vma_commit_reservation and vma_end_reservation
+ * are used by the huge page allocation routines to manage reservations.
*
* vma_needs_reservation is called to determine if the huge page at addr
* within the vma has an associated reservation. If a reservation is
* needed, the value 1 is returned. The caller is then responsible for
* managing the global reservation and subpool usage counts. After
* the huge page has been allocated, vma_commit_reservation is called
- * to add the page to the reservation map.
+ * to add the page to the reservation map. If the page allocation fails,
+ * the reservation must be ended instead of committed. vma_end_reservation
+ * is called in such cases.
*
* In the normal case, vma_commit_reservation returns the same value
* as the preceding vma_needs_reservation call. The only time this
@@ -1490,9 +1670,14 @@ static void return_unused_surplus_pages(struct hstate *h,
* is the responsibility of the caller to notice the difference and
* take appropriate action.
*/
+enum vma_resv_mode {
+ VMA_NEEDS_RESV,
+ VMA_COMMIT_RESV,
+ VMA_END_RESV,
+};
static long __vma_reservation_common(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr,
- bool commit)
+ enum vma_resv_mode mode)
{
struct resv_map *resv;
pgoff_t idx;
@@ -1503,10 +1688,20 @@ static long __vma_reservation_common(struct hstate *h,
return 1;
idx = vma_hugecache_offset(h, vma, addr);
- if (commit)
- ret = region_add(resv, idx, idx + 1);
- else
+ switch (mode) {
+ case VMA_NEEDS_RESV:
ret = region_chg(resv, idx, idx + 1);
+ break;
+ case VMA_COMMIT_RESV:
+ ret = region_add(resv, idx, idx + 1);
+ break;
+ case VMA_END_RESV:
+ region_abort(resv, idx, idx + 1);
+ ret = 0;
+ break;
+ default:
+ BUG();
+ }
if (vma->vm_flags & VM_MAYSHARE)
return ret;
@@ -1517,47 +1712,79 @@ static long __vma_reservation_common(struct hstate *h,
static long vma_needs_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
- return __vma_reservation_common(h, vma, addr, false);
+ return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
}
static long vma_commit_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
- return __vma_reservation_common(h, vma, addr, true);
+ return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
}
-static struct page *alloc_huge_page(struct vm_area_struct *vma,
+static void vma_end_reservation(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
+}
+
+struct page *alloc_huge_page(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve)
{
struct hugepage_subpool *spool = subpool_vma(vma);
struct hstate *h = hstate_vma(vma);
struct page *page;
- long chg, commit;
+ long map_chg, map_commit;
+ long gbl_chg;
int ret, idx;
struct hugetlb_cgroup *h_cg;
idx = hstate_index(h);
/*
- * Processes that did not create the mapping will have no
- * reserves and will not have accounted against subpool
- * limit. Check that the subpool limit can be made before
- * satisfying the allocation MAP_NORESERVE mappings may also
- * need pages and subpool limit allocated allocated if no reserve
- * mapping overlaps.
+ * Examine the region/reserve map to determine if the process
+ * has a reservation for the page to be allocated. A return
+ * code of zero indicates a reservation exists (no change).
*/
- chg = vma_needs_reservation(h, vma, addr);
- if (chg < 0)
+ map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
+ if (map_chg < 0)
return ERR_PTR(-ENOMEM);
- if (chg || avoid_reserve)
- if (hugepage_subpool_get_pages(spool, 1) < 0)
+
+ /*
+ * Processes that did not create the mapping will have no
+ * reserves as indicated by the region/reserve map. Check
+ * that the allocation will not exceed the subpool limit.
+ * Allocations for MAP_NORESERVE mappings also need to be
+ * checked against any subpool limit.
+ */
+ if (map_chg || avoid_reserve) {
+ gbl_chg = hugepage_subpool_get_pages(spool, 1);
+ if (gbl_chg < 0) {
+ vma_end_reservation(h, vma, addr);
return ERR_PTR(-ENOSPC);
+ }
+
+ /*
+ * Even though there was no reservation in the region/reserve
+ * map, there could be reservations associated with the
+ * subpool that can be used. This would be indicated if the
+ * return value of hugepage_subpool_get_pages() is zero.
+ * However, if avoid_reserve is specified we still avoid even
+ * the subpool reservations.
+ */
+ if (avoid_reserve)
+ gbl_chg = 1;
+ }
ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
if (ret)
goto out_subpool_put;
spin_lock(&hugetlb_lock);
- page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg);
+ /*
+ * gbl_chg is passed to indicate whether or not a page must be taken
+ * from the global free pool (global change). gbl_chg == 0 indicates
+ * a reservation exists for the allocation.
+ */
+ page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
if (!page) {
spin_unlock(&hugetlb_lock);
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
@@ -1573,8 +1800,8 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
set_page_private(page, (unsigned long)spool);
- commit = vma_commit_reservation(h, vma, addr);
- if (unlikely(chg > commit)) {
+ map_commit = vma_commit_reservation(h, vma, addr);
+ if (unlikely(map_chg > map_commit)) {
/*
* The page was added to the reservation map between
* vma_needs_reservation and vma_commit_reservation.
@@ -1594,8 +1821,9 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
out_uncharge_cgroup:
hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
out_subpool_put:
- if (chg || avoid_reserve)
+ if (map_chg || avoid_reserve)
hugepage_subpool_put_pages(spool, 1);
+ vma_end_reservation(h, vma, addr);
return ERR_PTR(-ENOSPC);
}
@@ -2311,7 +2539,7 @@ static void __exit hugetlb_exit(void)
}
kobject_put(hugepages_kobj);
- kfree(htlb_fault_mutex_table);
+ kfree(hugetlb_fault_mutex_table);
}
module_exit(hugetlb_exit);
@@ -2344,12 +2572,12 @@ static int __init hugetlb_init(void)
#else
num_fault_mutexes = 1;
#endif
- htlb_fault_mutex_table =
+ hugetlb_fault_mutex_table =
kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL);
- BUG_ON(!htlb_fault_mutex_table);
+ BUG_ON(!hugetlb_fault_mutex_table);
for (i = 0; i < num_fault_mutexes; i++)
- mutex_init(&htlb_fault_mutex_table[i]);
+ mutex_init(&hugetlb_fault_mutex_table[i]);
return 0;
}
module_init(hugetlb_init);
@@ -3147,6 +3375,23 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
return page != NULL;
}
+int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
+ pgoff_t idx)
+{
+ struct inode *inode = mapping->host;
+ struct hstate *h = hstate_inode(inode);
+ int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+
+ if (err)
+ return err;
+ ClearPagePrivate(page);
+
+ spin_lock(&inode->i_lock);
+ inode->i_blocks += blocks_per_huge_page(h);
+ spin_unlock(&inode->i_lock);
+ return 0;
+}
+
static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct address_space *mapping, pgoff_t idx,
unsigned long address, pte_t *ptep, unsigned int flags)
@@ -3194,21 +3439,13 @@ retry:
set_page_huge_active(page);
if (vma->vm_flags & VM_MAYSHARE) {
- int err;
- struct inode *inode = mapping->host;
-
- err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+ int err = huge_add_to_page_cache(page, mapping, idx);
if (err) {
put_page(page);
if (err == -EEXIST)
goto retry;
goto out;
}
- ClearPagePrivate(page);
-
- spin_lock(&inode->i_lock);
- inode->i_blocks += blocks_per_huge_page(h);
- spin_unlock(&inode->i_lock);
} else {
lock_page(page);
if (unlikely(anon_vma_prepare(vma))) {
@@ -3236,11 +3473,14 @@ retry:
* any allocations necessary to record that reservation occur outside
* the spinlock.
*/
- if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
+ if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
if (vma_needs_reservation(h, vma, address) < 0) {
ret = VM_FAULT_OOM;
goto backout_unlocked;
}
+ /* Just decrements count, does not deallocate */
+ vma_end_reservation(h, vma, address);
+ }
ptl = huge_pte_lockptr(h, mm, ptep);
spin_lock(ptl);
@@ -3280,7 +3520,7 @@ backout_unlocked:
}
#ifdef CONFIG_SMP
-static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
struct vm_area_struct *vma,
struct address_space *mapping,
pgoff_t idx, unsigned long address)
@@ -3305,7 +3545,7 @@ static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
* For uniprocesor systems we always use a single mutex, so just
* return 0 and avoid the hashing overhead.
*/
-static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
struct vm_area_struct *vma,
struct address_space *mapping,
pgoff_t idx, unsigned long address)
@@ -3353,8 +3593,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
- hash = fault_mutex_hash(h, mm, vma, mapping, idx, address);
- mutex_lock(&htlb_fault_mutex_table[hash]);
+ hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
entry = huge_ptep_get(ptep);
if (huge_pte_none(entry)) {
@@ -3387,6 +3627,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
ret = VM_FAULT_OOM;
goto out_mutex;
}
+ /* Just decrements count, does not deallocate */
+ vma_end_reservation(h, vma, address);
if (!(vma->vm_flags & VM_MAYSHARE))
pagecache_page = hugetlbfs_pagecache_page(h,
@@ -3437,7 +3679,7 @@ out_ptl:
put_page(pagecache_page);
}
out_mutex:
- mutex_unlock(&htlb_fault_mutex_table[hash]);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
/*
* Generally it's safe to hold refcount during waiting page lock. But
* here we just wait to defer the next page fault to avoid busy loop and
@@ -3726,12 +3968,15 @@ int hugetlb_reserve_pages(struct inode *inode,
}
return 0;
out_err:
+ if (!vma || vma->vm_flags & VM_MAYSHARE)
+ region_abort(resv_map, from, to);
if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
kref_put(&resv_map->refs, resv_map_release);
return ret;
}
-void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
+long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
+ long freed)
{
struct hstate *h = hstate_inode(inode);
struct resv_map *resv_map = inode_resv_map(inode);
@@ -3739,8 +3984,17 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
struct hugepage_subpool *spool = subpool_inode(inode);
long gbl_reserve;
- if (resv_map)
- chg = region_truncate(resv_map, offset);
+ if (resv_map) {
+ chg = region_del(resv_map, start, end);
+ /*
+ * region_del() can fail in the rare case where a region
+ * must be split and another region descriptor can not be
+ * allocated. If end == LONG_MAX, it will not fail.
+ */
+ if (chg < 0)
+ return chg;
+ }
+
spin_lock(&inode->i_lock);
inode->i_blocks -= (blocks_per_huge_page(h) * freed);
spin_unlock(&inode->i_lock);
@@ -3751,6 +4005,8 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
*/
gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
hugetlb_acct_memory(h, -gbl_reserve);
+
+ return 0;
}
#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
@@ -3762,10 +4018,12 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
svma->vm_start;
unsigned long sbase = saddr & PUD_MASK;
unsigned long s_end = sbase + PUD_SIZE;
+ unsigned long vm_flags;
+ unsigned long svm_flags;
/* Allow segments to share if only one is marked locked */
- unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
- unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
+ vm_flags = vma->vm_flags & ~(VM_LOCKED | VM_LOCKONFAULT);
+ svm_flags = svma->vm_flags & ~(VM_LOCKED | VM_LOCKONFAULT);
/*
* match the virtual addresses, permission and the alignment of the
@@ -3779,7 +4037,7 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
return saddr;
}
-static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
+static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
{
unsigned long base = addr & PUD_MASK;
unsigned long end = base + PUD_SIZE;
@@ -3789,8 +4047,8 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
*/
if (vma->vm_flags & VM_MAYSHARE &&
vma->vm_start <= base && end <= vma->vm_end)
- return 1;
- return 0;
+ return true;
+ return false;
}
/*
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index bf73ac17dad4..5015679014c1 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -45,12 +45,9 @@ static int hwpoison_inject(void *data, u64 val)
/*
* do a racy check with elevated page count, to make sure PG_hwpoison
* will only be set for the targeted owner (or on a free page).
- * We temporarily take page lock for try_get_mem_cgroup_from_page().
* memory_failure() will redo the check reliably inside page lock.
*/
- lock_page(hpage);
err = hwpoison_filter(hpage);
- unlock_page(hpage);
if (err)
goto put_out;
@@ -126,7 +123,7 @@ static int pfn_inject_init(void)
if (!dentry)
goto fail;
-#ifdef CONFIG_MEMCG_SWAP
+#ifdef CONFIG_MEMCG
dentry = debugfs_create_u64("corrupt-filter-memcg", 0600,
hwpoison_dir, &hwpoison_filter_memcg);
if (!dentry)
diff --git a/mm/internal.h b/mm/internal.h
index 36b23f1e2ca6..7c96e4723fba 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -246,10 +246,11 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
extern long populate_vma_page_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end, int *nonblocking);
extern void munlock_vma_pages_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end);
+ unsigned long start, unsigned long end, vm_flags_t to_drop);
static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
{
- munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
+ munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end,
+ VM_LOCKED | VM_LOCKONFAULT);
}
/*
@@ -426,4 +427,19 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
#define ALLOC_FAIR 0x100 /* fair zone allocation */
+enum ttu_flags;
+struct tlbflush_unmap_batch;
+
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+void try_to_unmap_flush(void);
+void try_to_unmap_flush_dirty(void);
+#else
+static inline void try_to_unmap_flush(void)
+{
+}
+static inline void try_to_unmap_flush_dirty(void)
+{
+}
+
+#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
#endif /* __MM_INTERNAL_H */
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index cf79f110157c..b034c620957f 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -302,23 +302,14 @@ static void hex_dump_object(struct seq_file *seq,
struct kmemleak_object *object)
{
const u8 *ptr = (const u8 *)object->pointer;
- int i, len, remaining;
- unsigned char linebuf[HEX_ROW_SIZE * 5];
+ size_t len;
/* limit the number of lines to HEX_MAX_LINES */
- remaining = len =
- min(object->size, (size_t)(HEX_MAX_LINES * HEX_ROW_SIZE));
-
- seq_printf(seq, " hex dump (first %d bytes):\n", len);
- for (i = 0; i < len; i += HEX_ROW_SIZE) {
- int linelen = min(remaining, HEX_ROW_SIZE);
-
- remaining -= HEX_ROW_SIZE;
- hex_dump_to_buffer(ptr + i, linelen, HEX_ROW_SIZE,
- HEX_GROUP_SIZE, linebuf, sizeof(linebuf),
- HEX_ASCII);
- seq_printf(seq, " %s\n", linebuf);
- }
+ len = min_t(size_t, object->size, HEX_MAX_LINES * HEX_ROW_SIZE);
+
+ seq_printf(seq, " hex dump (first %zu bytes):\n", len);
+ seq_hex_dump(seq, " ", DUMP_PREFIX_NONE, HEX_ROW_SIZE,
+ HEX_GROUP_SIZE, ptr, len, HEX_ASCII);
}
/*
diff --git a/mm/ksm.c b/mm/ksm.c
index 7ee101eaacdf..fc6dbdf4e571 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1058,7 +1058,7 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
err = replace_page(vma, page, kpage, orig_pte);
}
- if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
+ if ((vma->vm_flags & (VM_LOCKED | VM_LOCKONFAULT)) && kpage && !err) {
munlock_vma_page(page);
if (!PageMlocked(kpage)) {
unlock_page(page);
@@ -1884,7 +1884,7 @@ struct page *ksm_might_need_to_copy(struct page *page,
SetPageDirty(new_page);
__SetPageUptodate(new_page);
- __set_page_locked(new_page);
+ __SetPageLocked(new_page);
}
return new_page;
diff --git a/mm/madvise.c b/mm/madvise.c
index 64bb8a22110c..411a61509adf 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -20,6 +20,14 @@
#include <linux/backing-dev.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/mmu_notifier.h>
+
+#include <asm/tlb.h>
+
+struct madvise_free_private {
+ struct vm_area_struct *vma;
+ struct mmu_gather *tlb;
+};
/*
* Any behaviour which results in changes to the vma->vm_flags needs to
@@ -32,6 +40,7 @@ static int madvise_need_mmap_write(int behavior)
case MADV_REMOVE:
case MADV_WILLNEED:
case MADV_DONTNEED:
+ case MADV_FREE:
return 0;
default:
/* be safe, default to 1. list exceptions explicitly */
@@ -103,7 +112,8 @@ static long madvise_behavior(struct vm_area_struct *vma,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
- vma->vm_file, pgoff, vma_policy(vma));
+ vma->vm_file, pgoff, vma_policy(vma),
+ vma->vm_userfaultfd_ctx);
if (*prev) {
vma = *prev;
goto success;
@@ -255,6 +265,164 @@ static long madvise_willneed(struct vm_area_struct *vma,
return 0;
}
+static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+
+{
+ struct madvise_free_private *fp = walk->private;
+ struct mmu_gather *tlb = fp->tlb;
+ struct mm_struct *mm = tlb->mm;
+ struct vm_area_struct *vma = fp->vma;
+ spinlock_t *ptl;
+ pte_t *pte, ptent;
+ struct page *page;
+ swp_entry_t entry;
+ unsigned long next;
+ int nr_swap = 0;
+
+ next = pmd_addr_end(addr, end);
+ if (pmd_trans_huge(*pmd)) {
+ if (next - addr != HPAGE_PMD_SIZE)
+ split_huge_page_pmd(vma, addr, pmd);
+ else if (!madvise_free_huge_pmd(tlb, vma, pmd, addr))
+ goto next;
+ /* fall through */
+ }
+
+ if (pmd_trans_unstable(pmd))
+ return 0;
+
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ arch_enter_lazy_mmu_mode();
+ for (; addr != end; pte++, addr += PAGE_SIZE) {
+ ptent = *pte;
+
+ if (pte_none(ptent))
+ continue;
+ /*
+ * If the pte has a swp_entry, just clear the page table entry to
+ * prevent swap-in, which is more expensive than
+ * (page allocation + zeroing).
+ */
+ if (!pte_present(ptent)) {
+ entry = pte_to_swp_entry(ptent);
+ if (non_swap_entry(entry))
+ continue;
+ nr_swap--;
+ free_swap_and_cache(entry);
+ pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ continue;
+ }
+
+ page = vm_normal_page(vma, addr, ptent);
+ if (!page)
+ continue;
+
+ if (PageSwapCache(page)) {
+ if (!trylock_page(page))
+ continue;
+
+ if (!try_to_free_swap(page)) {
+ unlock_page(page);
+ continue;
+ }
+
+ ClearPageDirty(page);
+ unlock_page(page);
+ }
+
+ /*
+ * Some architectures (e.g. PPC) don't update the TLB
+ * with set_pte_at and tlb_remove_tlb_entry, so for
+ * portability, remap the pte as old|clean
+ * after clearing it.
+ */
+ ptent = ptep_get_and_clear_full(mm, addr, pte,
+ tlb->fullmm);
+ ptent = pte_mkold(ptent);
+ ptent = pte_mkclean(ptent);
+ set_pte_at(mm, addr, pte, ptent);
+ if (PageActive(page))
+ deactivate_page(page);
+ tlb_remove_tlb_entry(tlb, pte, addr);
+ }
+
+ if (nr_swap) {
+ if (current->mm == mm)
+ sync_mm_rss(mm);
+
+ add_mm_counter(mm, MM_SWAPENTS, nr_swap);
+ }
+
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(pte - 1, ptl);
+next:
+ cond_resched();
+ return 0;
+}
+
+static void madvise_free_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ struct madvise_free_private fp = {
+ .vma = vma,
+ .tlb = tlb,
+ };
+
+ struct mm_walk free_walk = {
+ .pmd_entry = madvise_free_pte_range,
+ .mm = vma->vm_mm,
+ .private = &fp,
+ };
+
+ BUG_ON(addr >= end);
+ tlb_start_vma(tlb, vma);
+ walk_page_range(addr, end, &free_walk);
+ tlb_end_vma(tlb, vma);
+}
+
+static int madvise_free_single_vma(struct vm_area_struct *vma,
+ unsigned long start_addr, unsigned long end_addr)
+{
+ unsigned long start, end;
+ struct mm_struct *mm = vma->vm_mm;
+ struct mmu_gather tlb;
+
+ if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
+ return -EINVAL;
+
+ /* MADV_FREE works for only anon vma at the moment */
+ if (vma->vm_file)
+ return -EINVAL;
+
+ start = max(vma->vm_start, start_addr);
+ if (start >= vma->vm_end)
+ return -EINVAL;
+ end = min(vma->vm_end, end_addr);
+ if (end <= vma->vm_start)
+ return -EINVAL;
+
+ lru_add_drain();
+ tlb_gather_mmu(&tlb, mm, start, end);
+ update_hiwater_rss(mm);
+
+ mmu_notifier_invalidate_range_start(mm, start, end);
+ madvise_free_page_range(&tlb, vma, start, end);
+ mmu_notifier_invalidate_range_end(mm, start, end);
+ tlb_finish_mmu(&tlb, start, end);
+
+ return 0;
+}
+
+static long madvise_free(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end)
+{
+ *prev = vma;
+ return madvise_free_single_vma(vma, start, end);
+}
+
/*
* Application no longer needs these pages. If the pages are dirty,
* it's OK to just throw them away. The app will be more careful about
@@ -279,7 +447,7 @@ static long madvise_dontneed(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
*prev = vma;
- if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
+ if (vma->vm_flags & (VM_LOCKED|VM_LOCKONFAULT|VM_HUGETLB|VM_PFNMAP))
return -EINVAL;
zap_page_range(vma, start, end - start, NULL);
@@ -300,7 +468,7 @@ static long madvise_remove(struct vm_area_struct *vma,
*prev = NULL; /* tell sys_madvise we drop mmap_sem */
- if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB))
+ if (vma->vm_flags & (VM_LOCKED | VM_LOCKONFAULT | VM_HUGETLB))
return -EINVAL;
f = vma->vm_file;
@@ -378,6 +546,14 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
return madvise_remove(vma, prev, start, end);
case MADV_WILLNEED:
return madvise_willneed(vma, prev, start, end);
+ case MADV_FREE:
+ /*
+ * XXX: In this implementation, MADV_FREE works like
+ * MADV_DONTNEED on a swapless system or when swap is full.
+ */
+ if (get_nr_swap_pages() > 0)
+ return madvise_free(vma, prev, start, end);
+ /* fall through */
case MADV_DONTNEED:
return madvise_dontneed(vma, prev, start, end);
default:
@@ -385,7 +561,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
}
}
-static int
+static bool
madvise_behavior_valid(int behavior)
{
switch (behavior) {
@@ -397,6 +573,7 @@ madvise_behavior_valid(int behavior)
case MADV_REMOVE:
case MADV_WILLNEED:
case MADV_DONTNEED:
+ case MADV_FREE:
#ifdef CONFIG_KSM
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
@@ -407,10 +584,10 @@ madvise_behavior_valid(int behavior)
#endif
case MADV_DONTDUMP:
case MADV_DODUMP:
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
}
diff --git a/mm/memblock.c b/mm/memblock.c
index 87108e77e476..509255223688 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -91,7 +91,7 @@ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, p
return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
}
-static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
+bool __init_memblock memblock_overlaps_region(struct memblock_type *type,
phys_addr_t base, phys_addr_t size)
{
unsigned long i;
@@ -103,7 +103,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
break;
}
- return (i < type->cnt) ? i : -1;
+ return i < type->cnt;
}
/*
@@ -566,6 +566,10 @@ repeat:
* area, insert that portion.
*/
if (rbase > base) {
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+ WARN_ON(nid != memblock_get_region_node(rgn));
+#endif
+ WARN_ON(flags != rgn->flags);
nr_new++;
if (insert)
memblock_insert_region(type, i++, base,
@@ -1562,12 +1566,12 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size
* Check if the region [@base, @base+@size) intersects a reserved memory block.
*
* RETURNS:
- * 0 if false, non-zero if true
+ * True if they intersect, false if not.
*/
-int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
+bool __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
{
memblock_cap_size(base, &size);
- return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
+ return memblock_overlaps_region(&memblock.reserved, base, size);
}
void __init_memblock memblock_trim_memory(phys_addr_t align)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index acb93c554f6e..65aadf9fd967 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -111,56 +111,10 @@ static const char * const mem_cgroup_lru_names[] = {
"unevictable",
};
-/*
- * Per memcg event counter is incremented at every pagein/pageout. With THP,
- * it will be incremated by the number of pages. This counter is used for
- * for trigger some periodic events. This is straightforward and better
- * than using jiffies etc. to handle periodic memcg event.
- */
-enum mem_cgroup_events_target {
- MEM_CGROUP_TARGET_THRESH,
- MEM_CGROUP_TARGET_SOFTLIMIT,
- MEM_CGROUP_TARGET_NUMAINFO,
- MEM_CGROUP_NTARGETS,
-};
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET 1024
-struct mem_cgroup_stat_cpu {
- long count[MEM_CGROUP_STAT_NSTATS];
- unsigned long events[MEMCG_NR_EVENTS];
- unsigned long nr_page_events;
- unsigned long targets[MEM_CGROUP_NTARGETS];
-};
-
-struct reclaim_iter {
- struct mem_cgroup *position;
- /* scan generation, increased every round-trip */
- unsigned int generation;
-};
-
-/*
- * per-zone information in memory controller.
- */
-struct mem_cgroup_per_zone {
- struct lruvec lruvec;
- unsigned long lru_size[NR_LRU_LISTS];
-
- struct reclaim_iter iter[DEF_PRIORITY + 1];
-
- struct rb_node tree_node; /* RB tree node */
- unsigned long usage_in_excess;/* Set to the value by which */
- /* the soft limit is exceeded*/
- bool on_tree;
- struct mem_cgroup *memcg; /* Back pointer, we cannot */
- /* use container_of */
-};
-
-struct mem_cgroup_per_node {
- struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
-};
-
/*
* Cgroups above their limits are maintained in a RB-Tree, independent of
* their hierarchy representation
@@ -181,32 +135,6 @@ struct mem_cgroup_tree {
static struct mem_cgroup_tree soft_limit_tree __read_mostly;
-struct mem_cgroup_threshold {
- struct eventfd_ctx *eventfd;
- unsigned long threshold;
-};
-
-/* For threshold */
-struct mem_cgroup_threshold_ary {
- /* An array index points to threshold just below or equal to usage. */
- int current_threshold;
- /* Size of entries[] */
- unsigned int size;
- /* Array of thresholds */
- struct mem_cgroup_threshold entries[0];
-};
-
-struct mem_cgroup_thresholds {
- /* Primary thresholds array */
- struct mem_cgroup_threshold_ary *primary;
- /*
- * Spare threshold array.
- * This is needed to make mem_cgroup_unregister_event() "never fail".
- * It must be able to store at least primary->size - 1 entries.
- */
- struct mem_cgroup_threshold_ary *spare;
-};
-
/* for OOM */
struct mem_cgroup_eventfd_list {
struct list_head list;
@@ -256,113 +184,6 @@ struct mem_cgroup_event {
static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
-/*
- * The memory controller data structure. The memory controller controls both
- * page cache and RSS per cgroup. We would eventually like to provide
- * statistics based on the statistics developed by Rik Van Riel for clock-pro,
- * to help the administrator determine what knobs to tune.
- */
-struct mem_cgroup {
- struct cgroup_subsys_state css;
-
- /* Accounted resources */
- struct page_counter memory;
- struct page_counter memsw;
- struct page_counter kmem;
-
- /* Normal memory consumption range */
- unsigned long low;
- unsigned long high;
-
- unsigned long soft_limit;
-
- /* vmpressure notifications */
- struct vmpressure vmpressure;
-
- /* css_online() has been completed */
- int initialized;
-
- /*
- * Should the accounting and control be hierarchical, per subtree?
- */
- bool use_hierarchy;
-
- /* protected by memcg_oom_lock */
- bool oom_lock;
- int under_oom;
-
- int swappiness;
- /* OOM-Killer disable */
- int oom_kill_disable;
-
- /* protect arrays of thresholds */
- struct mutex thresholds_lock;
-
- /* thresholds for memory usage. RCU-protected */
- struct mem_cgroup_thresholds thresholds;
-
- /* thresholds for mem+swap usage. RCU-protected */
- struct mem_cgroup_thresholds memsw_thresholds;
-
- /* For oom notifier event fd */
- struct list_head oom_notify;
-
- /*
- * Should we move charges of a task when a task is moved into this
- * mem_cgroup ? And what type of charges should we move ?
- */
- unsigned long move_charge_at_immigrate;
- /*
- * set > 0 if pages under this cgroup are moving to other cgroup.
- */
- atomic_t moving_account;
- /* taken only while moving_account > 0 */
- spinlock_t move_lock;
- struct task_struct *move_lock_task;
- unsigned long move_lock_flags;
- /*
- * percpu counter.
- */
- struct mem_cgroup_stat_cpu __percpu *stat;
- spinlock_t pcp_counter_lock;
-
-#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
- struct cg_proto tcp_mem;
-#endif
-#if defined(CONFIG_MEMCG_KMEM)
- /* Index in the kmem_cache->memcg_params.memcg_caches array */
- int kmemcg_id;
- bool kmem_acct_activated;
- bool kmem_acct_active;
-#endif
-
- int last_scanned_node;
-#if MAX_NUMNODES > 1
- nodemask_t scan_nodes;
- atomic_t numainfo_events;
- atomic_t numainfo_updating;
-#endif
-
-#ifdef CONFIG_CGROUP_WRITEBACK
- struct list_head cgwb_list;
- struct wb_domain cgwb_domain;
-#endif
-
- /* List of events which userspace want to receive */
- struct list_head event_list;
- spinlock_t event_list_lock;
-
- struct mem_cgroup_per_node *nodeinfo[0];
- /* WARNING: nodeinfo must be the last member here */
-};
-
-#ifdef CONFIG_MEMCG_KMEM
-bool memcg_kmem_is_active(struct mem_cgroup *memcg)
-{
- return memcg->kmem_acct_active;
-}
-#endif
-
/* Stuffs for move charges at task migration. */
/*
* Types of charges to be moved.
@@ -423,11 +244,6 @@ enum res_type {
*/
static DEFINE_MUTEX(memcg_create_mutex);
-struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
-{
- return s ? container_of(s, struct mem_cgroup, css) : NULL;
-}
-
/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
@@ -499,8 +315,7 @@ void sock_update_memcg(struct sock *sk)
rcu_read_lock();
memcg = mem_cgroup_from_task(current);
cg_proto = sk->sk_prot->proto_cgroup(memcg);
- if (!mem_cgroup_is_root(memcg) &&
- memcg_proto_active(cg_proto) &&
+ if (cg_proto && memcg_proto_active(cg_proto) &&
css_tryget_online(&memcg->css)) {
sk->sk_cgrp = cg_proto;
}
@@ -593,11 +408,6 @@ mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
return &memcg->nodeinfo[nid]->zoneinfo[zid];
}
-struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
-{
- return &memcg->css;
-}
-
/**
* mem_cgroup_css_from_page - css of the memcg associated with a page
* @page: page of interest
@@ -631,6 +441,34 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
return &memcg->css;
}
+/**
+ * page_cgroup_ino - return inode number of the memcg a page is charged to
+ * @page: the page
+ *
+ * Look up the closest online ancestor of the memory cgroup @page is charged to
+ * and return its inode number or 0 if @page is not charged to any cgroup. It
+ * is safe to call this function without holding a reference to @page.
+ *
+ * Note, this function is inherently racy, because there is nothing to prevent
+ * the cgroup inode from getting torn down and potentially reallocated a moment
+ * after page_cgroup_ino() returns, so it only should be used by callers that
+ * do not care (such as procfs interfaces).
+ */
+ino_t page_cgroup_ino(struct page *page)
+{
+ struct mem_cgroup *memcg;
+ unsigned long ino = 0;
+
+ rcu_read_lock();
+ memcg = READ_ONCE(page->mem_cgroup);
+ while (memcg && !(memcg->css.flags & CSS_ONLINE))
+ memcg = parent_mem_cgroup(memcg);
+ if (memcg)
+ ino = cgroup_ino(memcg->css.cgroup);
+ rcu_read_unlock();
+ return ino;
+}
+
static struct mem_cgroup_per_zone *
mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
{
@@ -876,14 +714,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
}
-unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
-{
- struct mem_cgroup_per_zone *mz;
-
- mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
- return mz->lru_size[lru];
-}
-
static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
int nid,
unsigned int lru_mask)
@@ -986,6 +816,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
+EXPORT_SYMBOL(mem_cgroup_from_task);
static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
@@ -1031,7 +862,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
struct mem_cgroup_reclaim_cookie *reclaim)
{
- struct reclaim_iter *uninitialized_var(iter);
+ struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
struct cgroup_subsys_state *css = NULL;
struct mem_cgroup *memcg = NULL;
struct mem_cgroup *pos = NULL;
@@ -1173,30 +1004,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
-void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
-{
- struct mem_cgroup *memcg;
-
- rcu_read_lock();
- memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
- if (unlikely(!memcg))
- goto out;
-
- switch (idx) {
- case PGFAULT:
- this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
- break;
- case PGMAJFAULT:
- this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
- break;
- default:
- BUG();
- }
-out:
- rcu_read_unlock();
-}
-EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
-
/**
* mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
* @zone: zone of the wanted lruvec
@@ -1295,15 +1102,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
VM_BUG_ON((long)(*lru_size) < 0);
}
-bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root)
-{
- if (root == memcg)
- return true;
- if (!root->use_hierarchy)
- return false;
- return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
-}
-
bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
{
struct mem_cgroup *task_memcg;
@@ -1330,39 +1128,6 @@ bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
return ret;
}
-int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
-{
- unsigned long inactive_ratio;
- unsigned long inactive;
- unsigned long active;
- unsigned long gb;
-
- inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
- active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
-
- gb = (inactive + active) >> (30 - PAGE_SHIFT);
- if (gb)
- inactive_ratio = int_sqrt(10 * gb);
- else
- inactive_ratio = 1;
-
- return inactive * inactive_ratio < active;
-}
-
-bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
-{
- struct mem_cgroup_per_zone *mz;
- struct mem_cgroup *memcg;
-
- if (mem_cgroup_disabled())
- return true;
-
- mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
- memcg = mz->memcg;
-
- return !!(memcg->css.flags & CSS_ONLINE);
-}
-
#define mem_cgroup_from_counter(counter, member) \
container_of(counter, struct mem_cgroup, member)
@@ -1394,15 +1159,6 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
return margin;
}
-int mem_cgroup_swappiness(struct mem_cgroup *memcg)
-{
- /* root ? */
- if (mem_cgroup_disabled() || !memcg->css.parent)
- return vm_swappiness;
-
- return memcg->swappiness;
-}
-
/*
* A routine for checking "mem" is under move_account() or not.
*
@@ -1545,6 +1301,12 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
int order)
{
+ struct oom_control oc = {
+ .zonelist = NULL,
+ .nodemask = NULL,
+ .gfp_mask = gfp_mask,
+ .order = order,
+ };
struct mem_cgroup *iter;
unsigned long chosen_points = 0;
unsigned long totalpages;
@@ -1563,7 +1325,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
goto unlock;
}
- check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
+ check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg);
totalpages = mem_cgroup_get_limit(memcg) ? : 1;
for_each_mem_cgroup_tree(iter, memcg) {
struct css_task_iter it;
@@ -1571,8 +1333,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
css_task_iter_start(&iter->css, &it);
while ((task = css_task_iter_next(&it))) {
- switch (oom_scan_process_thread(task, totalpages, NULL,
- false)) {
+ switch (oom_scan_process_thread(&oc, task, totalpages)) {
case OOM_SCAN_SELECT:
if (chosen)
put_task_struct(chosen);
@@ -1610,8 +1371,8 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
if (chosen) {
points = chosen_points * 1000 / totalpages;
- oom_kill_process(chosen, gfp_mask, order, points, totalpages,
- memcg, NULL, "Memory cgroup out of memory");
+ oom_kill_process(&oc, chosen, points, totalpages, memcg,
+ "Memory cgroup out of memory");
}
unlock:
mutex_unlock(&oom_lock);
@@ -2062,23 +1823,6 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
}
EXPORT_SYMBOL(mem_cgroup_end_page_stat);
-/**
- * mem_cgroup_update_page_stat - update page state statistics
- * @memcg: memcg to account against
- * @idx: page state item to account
- * @val: number of pages (positive or negative)
- *
- * See mem_cgroup_begin_page_stat() for locking requirements.
- */
-void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
- enum mem_cgroup_stat_index idx, int val)
-{
- VM_BUG_ON(!rcu_read_lock_held());
-
- if (memcg)
- this_cpu_add(memcg->stat->count[idx], val);
-}
-
/*
* size of first charge trial. "32" comes from vmscan.c's magic value.
* TODO: maybe necessary to use big numbers in big irons.
@@ -2355,40 +2099,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
css_put_many(&memcg->css, nr_pages);
}
-/*
- * try_get_mem_cgroup_from_page - look up page's memcg association
- * @page: the page
- *
- * Look up, get a css reference, and return the memcg that owns @page.
- *
- * The page must be locked to prevent racing with swap-in and page
- * cache charges. If coming from an unlocked page table, the caller
- * must ensure the page is on the LRU or this can race with charging.
- */
-struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
-{
- struct mem_cgroup *memcg;
- unsigned short id;
- swp_entry_t ent;
-
- VM_BUG_ON_PAGE(!PageLocked(page), page);
-
- memcg = page->mem_cgroup;
- if (memcg) {
- if (!css_tryget_online(&memcg->css))
- memcg = NULL;
- } else if (PageSwapCache(page)) {
- ent.val = page_private(page);
- id = lookup_swap_cgroup_id(ent);
- rcu_read_lock();
- memcg = mem_cgroup_from_id(id);
- if (memcg && !css_tryget_online(&memcg->css))
- memcg = NULL;
- rcu_read_unlock();
- }
- return memcg;
-}
-
static void lock_page_lru(struct page *page, int *isolated)
{
struct zone *zone = page_zone(page);
@@ -2504,16 +2214,6 @@ void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
css_put_many(&memcg->css, nr_pages);
}
-/*
- * helper for acessing a memcg's index. It will be used as an index in the
- * child cache array in kmem_cache, and also to derive its name. This function
- * will return -1 when this is not a kmem-limited memcg.
- */
-int memcg_cache_id(struct mem_cgroup *memcg)
-{
- return memcg ? memcg->kmemcg_id : -1;
-}
-
static int memcg_alloc_cache_id(void)
{
int id, size;
@@ -4194,20 +3894,23 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
struct fd efile;
struct fd cfile;
const char *name;
- char *endp;
int ret;
buf = strstrip(buf);
- efd = simple_strtoul(buf, &endp, 10);
- if (*endp != ' ')
+ ret = parse_integer(buf, 10, &efd);
+ if (ret < 0)
+ return ret;
+ buf += ret;
+ if (*buf++ != ' ')
return -EINVAL;
- buf = endp + 1;
-
- cfd = simple_strtoul(buf, &endp, 10);
- if ((*endp != ' ') && (*endp != '\0'))
+ ret = parse_integer(buf, 10, &cfd);
+ if (ret < 0)
+ return ret;
+ buf += ret;
+ if (*buf != ' ' && *buf != '\0')
return -EINVAL;
- buf = endp + 1;
+ buf++;
event = kzalloc(sizeof(*event), GFP_KERNEL);
if (!event)
@@ -5127,10 +4830,12 @@ static void mem_cgroup_clear_mc(void)
static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
{
- struct task_struct *p = cgroup_taskset_first(tset);
- int ret = 0;
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *from;
+ struct task_struct *p;
+ struct mm_struct *mm;
unsigned long move_flags;
+ int ret = 0;
/*
* We are now commited to this value whatever it is. Changes in this
@@ -5138,36 +4843,37 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
* So we need to save it, and keep it going.
*/
move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
- if (move_flags) {
- struct mm_struct *mm;
- struct mem_cgroup *from = mem_cgroup_from_task(p);
+ if (!move_flags)
+ return 0;
- VM_BUG_ON(from == memcg);
+ p = cgroup_taskset_first(tset);
+ from = mem_cgroup_from_task(p);
- mm = get_task_mm(p);
- if (!mm)
- return 0;
- /* We move charges only when we move a owner of the mm */
- if (mm->owner == p) {
- VM_BUG_ON(mc.from);
- VM_BUG_ON(mc.to);
- VM_BUG_ON(mc.precharge);
- VM_BUG_ON(mc.moved_charge);
- VM_BUG_ON(mc.moved_swap);
-
- spin_lock(&mc.lock);
- mc.from = from;
- mc.to = memcg;
- mc.flags = move_flags;
- spin_unlock(&mc.lock);
- /* We set mc.moving_task later */
-
- ret = mem_cgroup_precharge_mc(mm);
- if (ret)
- mem_cgroup_clear_mc();
- }
- mmput(mm);
+ VM_BUG_ON(from == memcg);
+
+ mm = get_task_mm(p);
+ if (!mm)
+ return 0;
+ /* We move charges only when we move a owner of the mm */
+ if (mm->owner == p) {
+ VM_BUG_ON(mc.from);
+ VM_BUG_ON(mc.to);
+ VM_BUG_ON(mc.precharge);
+ VM_BUG_ON(mc.moved_charge);
+ VM_BUG_ON(mc.moved_swap);
+
+ spin_lock(&mc.lock);
+ mc.from = from;
+ mc.to = memcg;
+ mc.flags = move_flags;
+ spin_unlock(&mc.lock);
+ /* We set mc.moving_task later */
+
+ ret = mem_cgroup_precharge_mc(mm);
+ if (ret)
+ mem_cgroup_clear_mc();
}
+ mmput(mm);
return ret;
}
@@ -5521,19 +5227,6 @@ struct cgroup_subsys memory_cgrp_subsys = {
};
/**
- * mem_cgroup_events - count memory events against a cgroup
- * @memcg: the memory cgroup
- * @idx: the event index
- * @nr: the number of events to account for
- */
-void mem_cgroup_events(struct mem_cgroup *memcg,
- enum mem_cgroup_events_index idx,
- unsigned int nr)
-{
- this_cpu_add(memcg->stat->events[idx], nr);
-}
-
-/**
* mem_cgroup_low - check if memory consumption is below the normal range
* @root: the highest ancestor to consider
* @memcg: the memory cgroup to check
@@ -5605,8 +5298,20 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
* the page lock, which serializes swap cache removal, which
* in turn serializes uncharging.
*/
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
if (page->mem_cgroup)
goto out;
+
+ if (do_swap_account) {
+ swp_entry_t ent = { .val = page_private(page), };
+ unsigned short id = lookup_swap_cgroup_id(ent);
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_id(id);
+ if (memcg && !css_tryget_online(&memcg->css))
+ memcg = NULL;
+ rcu_read_unlock();
+ }
}
if (PageTransHuge(page)) {
@@ -5614,8 +5319,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
}
- if (do_swap_account && PageSwapCache(page))
- memcg = try_get_mem_cgroup_from_page(page);
if (!memcg)
memcg = get_mem_cgroup_from_mm(mm);
@@ -5965,7 +5668,13 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
if (!mem_cgroup_is_root(memcg))
page_counter_uncharge(&memcg->memory, 1);
- /* Caller disabled preemption with mapping->tree_lock */
+ /*
+ * Interrupts should be disabled here because the caller holds the
+ * mapping->tree_lock lock which is taken with interrupts-off. It is
+ * important here to have the interrupts disabled because it is the
+ * only synchronisation we have for updating the per-CPU variables.
+ */
+ VM_BUG_ON(!irqs_disabled());
mem_cgroup_charge_statistics(memcg, page, -1);
memcg_check_events(memcg, page);
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index c53543d89282..0a274359d8eb 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -130,27 +130,15 @@ static int hwpoison_filter_flags(struct page *p)
* can only guarantee that the page either belongs to the memcg tasks, or is
* a freed page.
*/
-#ifdef CONFIG_MEMCG_SWAP
+#ifdef CONFIG_MEMCG
u64 hwpoison_filter_memcg;
EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
static int hwpoison_filter_task(struct page *p)
{
- struct mem_cgroup *mem;
- struct cgroup_subsys_state *css;
- unsigned long ino;
-
if (!hwpoison_filter_memcg)
return 0;
- mem = try_get_mem_cgroup_from_page(p);
- if (!mem)
- return -EINVAL;
-
- css = mem_cgroup_css(mem);
- ino = cgroup_ino(css->cgroup);
- css_put(css);
-
- if (ino != hwpoison_filter_memcg)
+ if (page_cgroup_ino(p) != hwpoison_filter_memcg)
return -EINVAL;
return 0;
@@ -909,6 +897,15 @@ int get_hwpoison_page(struct page *page)
* directly for tail pages.
*/
if (PageTransHuge(head)) {
+ /*
+ * Non-anonymous THPs exist only at allocation/free time. We
+ * can't handle such a case correctly, so let's give it up.
+ * This should be better than triggering BUG_ON when kernel
+ * tries to touch a "partially handled" page.
+ */
+ if (!PageAnon(head))
+ return 0;
+
if (get_page_unless_zero(head)) {
if (PageTail(page))
get_page(page);
@@ -1134,15 +1131,6 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
}
if (!PageHuge(p) && PageTransHuge(hpage)) {
- if (!PageAnon(hpage)) {
- pr_err("MCE: %#lx: non anonymous thp\n", pfn);
- if (TestClearPageHWPoison(p))
- atomic_long_sub(nr_pages, &num_poisoned_pages);
- put_page(p);
- if (p != hpage)
- put_page(hpage);
- return -EBUSY;
- }
if (unlikely(split_huge_page(hpage))) {
pr_err("MCE: %#lx: thp split failed\n", pfn);
if (TestClearPageHWPoison(p))
@@ -1159,7 +1147,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
/*
* We ignore non-LRU pages for good reasons.
* - PG_locked is only well defined for LRU pages and a few others
- * - to avoid races with __set_page_locked()
+ * - to avoid races with __SetPageLocked()
* - to avoid races with __SetPageSlab*() (and more non-atomic ops)
* The check (unnecessarily) ignores LRU pages being isolated and
* walked by the page reclaim code, however that's not a big loss.
@@ -1209,9 +1197,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
if (!PageHWPoison(p)) {
printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
atomic_long_sub(nr_pages, &num_poisoned_pages);
+ unlock_page(hpage);
put_page(hpage);
- res = 0;
- goto out;
+ return 0;
}
if (hwpoison_filter(p)) {
if (TestClearPageHWPoison(p))
@@ -1671,8 +1659,8 @@ static int __soft_offline_page(struct page *page, int flags)
if (ret > 0)
ret = -EIO;
} else {
- SetPageHWPoison(page);
- atomic_long_inc(&num_poisoned_pages);
+ if (!TestSetPageHWPoison(page))
+ atomic_long_inc(&num_poisoned_pages);
}
} else {
pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
@@ -1723,6 +1711,9 @@ int soft_offline_page(struct page *page, int flags)
get_online_mems();
+ if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
+ set_migratetype_isolate(page, true);
+
ret = get_any_page(page, pfn, flags);
put_online_mems();
if (ret > 0) { /* for in-use pages */
@@ -1730,7 +1721,7 @@ int soft_offline_page(struct page *page, int flags)
ret = soft_offline_huge_page(page, flags);
else
ret = __soft_offline_page(page, flags);
- } else if (ret == 0) { /* for free pages */
+ } else if (ret == 0) {
if (PageHuge(page)) {
set_page_hwpoison_huge_page(hpage);
if (!dequeue_hwpoisoned_huge_page(hpage))
@@ -1741,5 +1732,6 @@ int soft_offline_page(struct page *page, int flags)
atomic_long_inc(&num_poisoned_pages);
}
}
+ unset_migratetype_isolate(page, MIGRATE_MOVABLE);
return ret;
}
diff --git a/mm/memory.c b/mm/memory.c
index 388dcf9aa283..6ff2fb10f37e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -61,6 +61,7 @@
#include <linux/string.h>
#include <linux/dma-debug.h>
#include <linux/debugfs.h>
+#include <linux/userfaultfd_k.h>
#include <asm/io.h>
#include <asm/pgalloc.h>
@@ -180,22 +181,22 @@ static void check_sync_rss_stat(struct task_struct *task)
#ifdef HAVE_GENERIC_MMU_GATHER
-static int tlb_next_batch(struct mmu_gather *tlb)
+static bool tlb_next_batch(struct mmu_gather *tlb)
{
struct mmu_gather_batch *batch;
batch = tlb->active;
if (batch->next) {
tlb->active = batch->next;
- return 1;
+ return true;
}
if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
- return 0;
+ return false;
batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
if (!batch)
- return 0;
+ return false;
tlb->batch_count++;
batch->next = NULL;
@@ -205,7 +206,7 @@ static int tlb_next_batch(struct mmu_gather *tlb)
tlb->active->next = batch;
tlb->active = batch;
- return 1;
+ return true;
}
/* tlb_gather_mmu
@@ -2165,7 +2166,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
* Don't let another task, with possibly unlocked vma,
* keep the mlocked page.
*/
- if (page_copied && (vma->vm_flags & VM_LOCKED)) {
+ if (page_copied && (vma->vm_flags &
+ (VM_LOCKED | VM_LOCKONFAULT))) {
lock_page(old_page); /* LRU manipulation */
munlock_vma_page(old_page);
unlock_page(old_page);
@@ -2577,7 +2579,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
swap_free(entry);
- if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
+ if (vm_swap_full() || (vma->vm_flags & (VM_LOCKED | VM_LOCKONFAULT)) ||
+ PageMlocked(page))
try_to_free_swap(page);
unlock_page(page);
if (page != swapcache) {
@@ -2685,6 +2688,12 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!pte_none(*page_table))
goto unlock;
+ /* Deliver the page fault to userland, check inside PT lock */
+ if (userfaultfd_missing(vma)) {
+ pte_unmap_unlock(page_table, ptl);
+ return handle_userfault(vma, address, flags,
+ VM_UFFD_MISSING);
+ }
goto setpte;
}
@@ -2713,6 +2722,15 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (!pte_none(*page_table))
goto release;
+ /* Deliver the page fault to userland, check inside PT lock */
+ if (userfaultfd_missing(vma)) {
+ pte_unmap_unlock(page_table, ptl);
+ mem_cgroup_cancel_charge(page, memcg);
+ page_cache_release(page);
+ return handle_userfault(vma, address, flags,
+ VM_UFFD_MISSING);
+ }
+
inc_mm_counter_fast(mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, address);
mem_cgroup_commit_charge(page, memcg, false);
@@ -3073,7 +3091,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* pinned by vma->vm_file's reference. We rely on unlock_page()'s
* release semantics to prevent the compiler from undoing this copying.
*/
- mapping = fault_page->mapping;
+ mapping = page_rmapping(fault_page);
unlock_page(fault_page);
if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
/*
@@ -3216,6 +3234,27 @@ out:
return 0;
}
+static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd, unsigned int flags)
+{
+ if (!vma->vm_ops)
+ return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags);
+ if (vma->vm_ops->pmd_fault)
+ return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
+ return VM_FAULT_FALLBACK;
+}
+
+static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd, pmd_t orig_pmd,
+ unsigned int flags)
+{
+ if (!vma->vm_ops)
+ return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd);
+ if (vma->vm_ops->pmd_fault)
+ return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
+ return VM_FAULT_FALLBACK;
+}
+
/*
* These routines also need to handle stuff like marking pages dirty
* and/or accessed for architectures that don't do it in hardware (most
@@ -3251,12 +3290,12 @@ static int handle_pte_fault(struct mm_struct *mm,
barrier();
if (!pte_present(entry)) {
if (pte_none(entry)) {
- if (vma->vm_ops)
+ if (vma_is_anonymous(vma))
+ return do_anonymous_page(mm, vma, address,
+ pte, pmd, flags);
+ else
return do_fault(mm, vma, address, pte, pmd,
flags, entry);
-
- return do_anonymous_page(mm, vma, address, pte, pmd,
- flags);
}
return do_swap_page(mm, vma, address,
pte, pmd, flags, entry);
@@ -3318,10 +3357,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (!pmd)
return VM_FAULT_OOM;
if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
- int ret = VM_FAULT_FALLBACK;
- if (!vma->vm_ops)
- ret = do_huge_pmd_anonymous_page(mm, vma, address,
- pmd, flags);
+ int ret = create_huge_pmd(mm, vma, address, pmd, flags);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
@@ -3345,8 +3381,8 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
orig_pmd, pmd);
if (dirty && !pmd_write(orig_pmd)) {
- ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
- orig_pmd);
+ ret = wp_huge_pmd(mm, vma, address, pmd,
+ orig_pmd, flags);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 26fbba7d888f..1cbd70ab3e76 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1333,7 +1333,7 @@ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
}
/*
- * Confirm all pages in a range [start, end) is belongs to the same zone.
+ * Confirm all pages in a range [start, end) belong to the same zone.
*/
int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
{
@@ -1344,10 +1344,11 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
for (pfn = start_pfn;
pfn < end_pfn;
pfn += MAX_ORDER_NR_PAGES) {
- i = 0;
- /* This is just a CONFIG_HOLES_IN_ZONE check.*/
- while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i))
- i++;
+ /* Find the first valid pfn in this pageblock */
+ for (i = 0; i < MAX_ORDER_NR_PAGES; i++) {
+ if (pfn_valid(pfn + i))
+ break;
+ }
if (i == MAX_ORDER_NR_PAGES)
continue;
page = pfn_to_page(pfn + i);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 99d4c1d0b858..d6f2caee28c0 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -608,9 +608,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
qp->prev = vma;
- if (vma->vm_flags & VM_PFNMAP)
- return 1;
-
if (flags & MPOL_MF_LAZY) {
/* Similar to task_numa_work, skip inaccessible VMAs */
if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
@@ -722,8 +719,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
pgoff = vma->vm_pgoff +
((vmstart - vma->vm_start) >> PAGE_SHIFT);
prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
- vma->anon_vma, vma->vm_file, pgoff,
- new_pol);
+ vma->anon_vma, vma->vm_file, pgoff,
+ new_pol, vma->vm_userfaultfd_ctx);
if (prev) {
vma = prev;
next = vma->vm_next;
diff --git a/mm/mempool.c b/mm/mempool.c
index 2cc08de8b1db..4c533bc51d73 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -150,6 +150,9 @@ static void *remove_element(mempool_t *pool)
*/
void mempool_destroy(mempool_t *pool)
{
+ if (unlikely(!pool))
+ return;
+
while (pool->curr_nr) {
void *element = remove_element(pool);
pool->free(element, pool->pool_data);
diff --git a/mm/memtest.c b/mm/memtest.c
index 0a1cc133f6d7..4b4f36b46371 100644
--- a/mm/memtest.c
+++ b/mm/memtest.c
@@ -1,11 +1,6 @@
#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
#include <linux/init.h>
-#include <linux/pfn.h>
#include <linux/memblock.h>
static u64 patterns[] __initdata = {
@@ -31,10 +26,8 @@ static u64 patterns[] __initdata = {
static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr_t end_bad)
{
- printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n",
- (unsigned long long) pattern,
- (unsigned long long) start_bad,
- (unsigned long long) end_bad);
+ pr_info(" %016llx bad mem addr %pa - %pa reserved\n",
+ cpu_to_be64(pattern), &start_bad, &end_bad);
memblock_reserve(start_bad, end_bad - start_bad);
}
@@ -79,22 +72,20 @@ static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end)
this_start = clamp(this_start, start, end);
this_end = clamp(this_end, start, end);
if (this_start < this_end) {
- printk(KERN_INFO " %010llx - %010llx pattern %016llx\n",
- (unsigned long long)this_start,
- (unsigned long long)this_end,
- (unsigned long long)cpu_to_be64(pattern));
+ pr_info(" %pa - %pa pattern %016llx\n",
+ &this_start, &this_end, cpu_to_be64(pattern));
memtest(pattern, this_start, this_end - this_start);
}
}
}
/* default is disabled */
-static int memtest_pattern __initdata;
+static unsigned int memtest_pattern __initdata;
static int __init parse_memtest(char *arg)
{
if (arg)
- memtest_pattern = simple_strtoul(arg, NULL, 0);
+ parse_integer(arg, 0, (unsigned int *)&memtest_pattern);
else
memtest_pattern = ARRAY_SIZE(patterns);
@@ -111,7 +102,7 @@ void __init early_memtest(phys_addr_t start, phys_addr_t end)
if (!memtest_pattern)
return;
- printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern);
+ pr_info("early_memtest: # of tests: %u\n", memtest_pattern);
for (i = memtest_pattern-1; i < UINT_MAX; --i) {
idx = i % ARRAY_SIZE(patterns);
do_one_pass(patterns[idx], start, end);
diff --git a/mm/migrate.c b/mm/migrate.c
index ee401e4e5ef1..947f36a41f30 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -524,6 +524,11 @@ void migrate_page_copy(struct page *newpage, struct page *page)
__set_page_dirty_nobuffers(newpage);
}
+ if (page_is_young(page))
+ set_page_young(newpage);
+ if (page_is_idle(page))
+ set_page_idle(newpage);
+
/*
* Copy NUMA information to the new page, to prevent over-eager
* future migrations of this same page.
@@ -918,8 +923,7 @@ out:
static ICE_noinline int unmap_and_move(new_page_t get_new_page,
free_page_t put_new_page,
unsigned long private, struct page *page,
- int force, enum migrate_mode mode,
- enum migrate_reason reason)
+ int force, enum migrate_mode mode)
{
int rc = 0;
int *result = NULL;
@@ -950,8 +954,7 @@ out:
list_del(&page->lru);
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
- if (reason != MR_MEMORY_FAILURE)
- putback_lru_page(page);
+ putback_lru_page(page);
}
/*
@@ -1124,8 +1127,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
pass > 2, mode);
else
rc = unmap_and_move(get_new_page, put_new_page,
- private, page, pass > 2, mode,
- reason);
+ private, page, pass > 2, mode);
switch(rc) {
case -ENOMEM:
@@ -1222,7 +1224,9 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
goto set_status;
- page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT);
+ /* FOLL_DUMP to ignore special (like zero) pages */
+ page = follow_page(vma, pp->addr,
+ FOLL_GET | FOLL_SPLIT | FOLL_DUMP);
err = PTR_ERR(page);
if (IS_ERR(page))
@@ -1232,10 +1236,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
if (!page)
goto set_status;
- /* Use PageReserved to check for zero page */
- if (PageReserved(page))
- goto put_and_set;
-
pp->page = page;
err = page_to_nid(page);
@@ -1392,18 +1392,14 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
if (!vma || addr < vma->vm_start)
goto set_status;
- page = follow_page(vma, addr, 0);
+ /* FOLL_DUMP to ignore special (like zero) pages */
+ page = follow_page(vma, addr, FOLL_DUMP);
err = PTR_ERR(page);
if (IS_ERR(page))
goto set_status;
- err = -ENOENT;
- /* Use PageReserved to check for zero page */
- if (!page || PageReserved(page))
- goto set_status;
-
- err = page_to_nid(page);
+ err = page ? page_to_nid(page) : -ENOENT;
set_status:
*status = err;
@@ -1749,7 +1745,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
flush_tlb_range(vma, mmun_start, mmun_end);
/* Prepare a page as a migration target */
- __set_page_locked(new_page);
+ __SetPageLocked(new_page);
SetPageSwapBacked(new_page);
/* anon mapping, we can simply copy page->mapping to the new page: */
diff --git a/mm/mlock.c b/mm/mlock.c
index 6fd2cf15e868..e5590c336413 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -406,23 +406,22 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
* @vma - vma containing range to be munlock()ed.
* @start - start address in @vma of the range
* @end - end of range in @vma.
+ * @to_drop - the VMA flags we want to drop from the specified range
*
- * For mremap(), munmap() and exit().
+ * For mremap(), munmap(), munlock(), and exit().
*
- * Called with @vma VM_LOCKED.
- *
- * Returns with VM_LOCKED cleared. Callers must be prepared to
+ * Returns with specified flags cleared. Callers must be prepared to
* deal with this.
*
- * We don't save and restore VM_LOCKED here because pages are
+ * We don't save and restore specified flags here because pages are
* still on lru. In unmap path, pages might be scanned by reclaim
* and re-mlocked by try_to_{munlock|unmap} before we unmap and
* free them. This will result in freeing mlocked pages.
*/
-void munlock_vma_pages_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
+void munlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, vm_flags_t to_drop)
{
- vma->vm_flags &= ~VM_LOCKED;
+ vma->vm_flags &= ~to_drop;
while (start < end) {
struct page *page = NULL;
@@ -502,15 +501,17 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
pgoff_t pgoff;
int nr_pages;
int ret = 0;
- int lock = !!(newflags & VM_LOCKED);
+ int lock = !!(newflags & (VM_LOCKED | VM_LOCKONFAULT));
if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
- goto out; /* don't set VM_LOCKED, don't count */
+ /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
+ goto out;
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
- vma->vm_file, pgoff, vma_policy(vma));
+ vma->vm_file, pgoff, vma_policy(vma),
+ vma->vm_userfaultfd_ctx);
if (*prev) {
vma = *prev;
goto success;
@@ -546,14 +547,19 @@ success:
if (lock)
vma->vm_flags = newflags;
else
- munlock_vma_pages_range(vma, start, end);
+ /*
+ * We need to tell which VM_LOCK* flag(s) we are clearing here
+ */
+ munlock_vma_pages_range(vma, start, end,
+ (vma->vm_flags & ~(newflags)));
out:
*prev = vma;
return ret;
}
-static int do_mlock(unsigned long start, size_t len, int on)
+static int apply_vma_flags(unsigned long start, size_t len,
+ vm_flags_t flags, bool add_flags)
{
unsigned long nstart, end, tmp;
struct vm_area_struct * vma, * prev;
@@ -579,9 +585,13 @@ static int do_mlock(unsigned long start, size_t len, int on)
/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
- newflags = vma->vm_flags & ~VM_LOCKED;
- if (on)
- newflags |= VM_LOCKED;
+ newflags = vma->vm_flags;
+ if (add_flags) {
+ newflags &= ~(VM_LOCKED | VM_LOCKONFAULT);
+ newflags |= flags;
+ } else {
+ newflags &= ~flags;
+ }
tmp = vma->vm_end;
if (tmp > end)
@@ -604,7 +614,7 @@ static int do_mlock(unsigned long start, size_t len, int on)
return error;
}
-SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
+static int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
{
unsigned long locked;
unsigned long lock_limit;
@@ -628,19 +638,42 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
/* check against resource limits */
if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
- error = do_mlock(start, len, 1);
+ error = apply_vma_flags(start, len, flags, true);
up_write(&current->mm->mmap_sem);
if (error)
return error;
- error = __mm_populate(start, len, 0);
- if (error)
- return __mlock_posix_error_return(error);
+ if (flags & (VM_LOCKED | VM_LOCKONFAULT)) {
+ if (flags & VM_LOCKED)
+ error = __mm_populate(start, len, 0);
+ else
+ error = mm_lock_present(start, len);
+ if (error)
+ return __mlock_posix_error_return(error);
+ }
+
return 0;
}
-SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
+/* mlock(): apply VM_LOCKED to [start, start + len) via do_mlock(). */
+SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
+{
+ return do_mlock(start, len, VM_LOCKED);
+}
+
+/*
+ * mlock2(): mlock() with a flags argument.
+ *
+ * @flags must be non-zero, may only contain MLOCK_LOCKED and MLOCK_ONFAULT,
+ * and must not be the combination of both; anything else yields -EINVAL.
+ * MLOCK_LOCKED applies VM_LOCKED to the range, otherwise VM_LOCKONFAULT is
+ * applied. The result of do_mlock() is returned unchanged.
+ */
+SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags)
+{
+ if (!flags || (flags & ~(MLOCK_LOCKED | MLOCK_ONFAULT)) ||
+ flags == (MLOCK_LOCKED | MLOCK_ONFAULT))
+ return -EINVAL;
+
+ if (flags & MLOCK_LOCKED)
+ return do_mlock(start, len, VM_LOCKED);
+
+ /* Only MLOCK_ONFAULT can be left at this point */
+ return do_mlock(start, len, VM_LOCKONFAULT);
+}
+
+static int do_munlock(unsigned long start, size_t len, vm_flags_t flags)
{
int ret;
@@ -648,29 +681,54 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
start &= PAGE_MASK;
down_write(&current->mm->mmap_sem);
- ret = do_mlock(start, len, 0);
+ ret = apply_vma_flags(start, len, flags, false);
up_write(&current->mm->mmap_sem);
return ret;
}
+/* munlock(): clear both VM_LOCKED and VM_LOCKONFAULT from the range. */
+SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
+{
+ return do_munlock(start, len, VM_LOCKED | VM_LOCKONFAULT);
+}
+
+/*
+ * munlock2(): munlock() with a flags argument selecting what to clear.
+ *
+ * MLOCK_LOCKED selects VM_LOCKED and MLOCK_ONFAULT selects VM_LOCKONFAULT;
+ * both may be passed together. A zero @flags or any other bit yields
+ * -EINVAL. The result of do_munlock() is returned unchanged.
+ */
+SYSCALL_DEFINE3(munlock2, unsigned long, start, size_t, len, int, flags)
+{
+ vm_flags_t to_clear = 0;
+
+ if (!flags || flags & ~(MLOCK_LOCKED | MLOCK_ONFAULT))
+ return -EINVAL;
+
+ /* Translate the user-visible MLOCK_* bits into VM_* flags */
+ if (flags & MLOCK_LOCKED)
+ to_clear |= VM_LOCKED;
+ if (flags & MLOCK_ONFAULT)
+ to_clear |= VM_LOCKONFAULT;
+
+ return do_munlock(start, len, to_clear);
+}
+
static int do_mlockall(int flags)
{
struct vm_area_struct * vma, * prev = NULL;
+ vm_flags_t to_add;
if (flags & MCL_FUTURE)
current->mm->def_flags |= VM_LOCKED;
- else
- current->mm->def_flags &= ~VM_LOCKED;
if (flags == MCL_FUTURE)
goto out;
+ if (flags & MCL_ONFAULT) {
+ current->mm->def_flags |= VM_LOCKONFAULT;
+ to_add = VM_LOCKONFAULT;
+ } else {
+ to_add = VM_LOCKED;
+ }
+
for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
vm_flags_t newflags;
- newflags = vma->vm_flags & ~VM_LOCKED;
- if (flags & MCL_CURRENT)
- newflags |= VM_LOCKED;
+ newflags = vma->vm_flags & ~(VM_LOCKED | VM_LOCKONFAULT);
+ newflags |= to_add;
/* Ignore errors */
mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
@@ -685,7 +743,8 @@ SYSCALL_DEFINE1(mlockall, int, flags)
unsigned long lock_limit;
int ret = -EINVAL;
- if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE)))
+ if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) ||
+ (flags & (MCL_FUTURE | MCL_ONFAULT)) == (MCL_FUTURE | MCL_ONFAULT))
goto out;
ret = -EPERM;
@@ -711,12 +770,55 @@ out:
return ret;
}
+/*
+ * Drop mlockall()-style state from the current mm according to @flags:
+ * MCL_FUTURE clears VM_LOCKED in mm->def_flags, MCL_ONFAULT clears
+ * VM_LOCKONFAULT in mm->def_flags and on every VMA, and MCL_CURRENT clears
+ * VM_LOCKED on every VMA.
+ *
+ * Both callers visible in this patch hold mmap_sem for writing across the
+ * call. Always returns 0; mlock_fixup() failures are ignored.
+ */
+static int do_munlockall(int flags)
+{
+ struct vm_area_struct *vma, *prev = NULL;
+ vm_flags_t to_clear = 0;
+
+ if (flags & MCL_FUTURE)
+ current->mm->def_flags &= ~VM_LOCKED;
+ if (flags & MCL_ONFAULT)
+ current->mm->def_flags &= ~VM_LOCKONFAULT;
+ /* Only the defaults for future mappings were requested: skip the VMA walk */
+ if (flags == MCL_FUTURE)
+ goto out;
+
+ if (flags & MCL_CURRENT)
+ to_clear |= VM_LOCKED;
+ if (flags & MCL_ONFAULT)
+ to_clear |= VM_LOCKONFAULT;
+
+ /* mlock_fixup() stores the VMA it ended up with in prev, which drives the walk */
+ for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
+ vm_flags_t newflags;
+
+ newflags = vma->vm_flags & ~to_clear;
+
+ /* Ignore errors */
+ mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
+ cond_resched_rcu_qs();
+ }
+out:
+ return 0;
+}
+
SYSCALL_DEFINE0(munlockall)
{
int ret;
down_write(&current->mm->mmap_sem);
- ret = do_mlockall(0);
+ ret = do_munlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT);
+ up_write(&current->mm->mmap_sem);
+ return ret;
+}
+
+SYSCALL_DEFINE1(munlockall2, int, flags)
+{
+ int ret = -EINVAL;
+
+ if (!flags || flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT))
+ return ret;
+
+ down_write(&current->mm->mmap_sem);
+ ret = do_munlockall(flags);
up_write(&current->mm->mmap_sem);
return ret;
}
diff --git a/mm/mmap.c b/mm/mmap.c
index f126923ce683..a10cdf0747be 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -41,6 +41,7 @@
#include <linux/notifier.h>
#include <linux/memory.h>
#include <linux/printk.h>
+#include <linux/userfaultfd_k.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -919,7 +920,8 @@ again: remove_next = 1 + (end > next->vm_end);
* per-vma resources, so we don't attempt to merge those.
*/
static inline int is_mergeable_vma(struct vm_area_struct *vma,
- struct file *file, unsigned long vm_flags)
+ struct file *file, unsigned long vm_flags,
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
{
/*
* VM_SOFTDIRTY should not prevent from VMA merging, if we
@@ -935,6 +937,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
return 0;
if (vma->vm_ops && vma->vm_ops->close)
return 0;
+ if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
+ return 0;
return 1;
}
@@ -965,9 +969,11 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
*/
static int
can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
- struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+ struct anon_vma *anon_vma, struct file *file,
+ pgoff_t vm_pgoff,
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
{
- if (is_mergeable_vma(vma, file, vm_flags) &&
+ if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
if (vma->vm_pgoff == vm_pgoff)
return 1;
@@ -984,9 +990,11 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
*/
static int
can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
- struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+ struct anon_vma *anon_vma, struct file *file,
+ pgoff_t vm_pgoff,
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
{
- if (is_mergeable_vma(vma, file, vm_flags) &&
+ if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
pgoff_t vm_pglen;
vm_pglen = vma_pages(vma);
@@ -1029,7 +1037,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
struct vm_area_struct *prev, unsigned long addr,
unsigned long end, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
- pgoff_t pgoff, struct mempolicy *policy)
+ pgoff_t pgoff, struct mempolicy *policy,
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
struct vm_area_struct *area, *next;
@@ -1056,14 +1065,17 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
if (prev && prev->vm_end == addr &&
mpol_equal(vma_policy(prev), policy) &&
can_vma_merge_after(prev, vm_flags,
- anon_vma, file, pgoff)) {
+ anon_vma, file, pgoff,
+ vm_userfaultfd_ctx)) {
/*
* OK, it can. Can we now merge in the successor as well?
*/
if (next && end == next->vm_start &&
mpol_equal(policy, vma_policy(next)) &&
can_vma_merge_before(next, vm_flags,
- anon_vma, file, pgoff+pglen) &&
+ anon_vma, file,
+ pgoff+pglen,
+ vm_userfaultfd_ctx) &&
is_mergeable_anon_vma(prev->anon_vma,
next->anon_vma, NULL)) {
/* cases 1, 6 */
@@ -1084,7 +1096,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
if (next && end == next->vm_start &&
mpol_equal(policy, vma_policy(next)) &&
can_vma_merge_before(next, vm_flags,
- anon_vma, file, pgoff+pglen)) {
+ anon_vma, file, pgoff+pglen,
+ vm_userfaultfd_ctx)) {
if (prev && addr < prev->vm_end) /* case 4 */
err = vma_adjust(prev, prev->vm_start,
addr, prev->vm_pgoff, NULL);
@@ -1232,8 +1245,8 @@ static inline int mlock_future_check(struct mm_struct *mm,
{
unsigned long locked, lock_limit;
- /* mlock MCL_FUTURE? */
- if (flags & VM_LOCKED) {
+ /* mlock MCL_FUTURE or MCL_ONFAULT? */
+ if (flags & (VM_LOCKED | VM_LOCKONFAULT)) {
locked = len >> PAGE_SHIFT;
locked += mm->locked_vm;
lock_limit = rlimit(RLIMIT_MEMLOCK);
@@ -1301,7 +1314,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
- if (flags & MAP_LOCKED)
+ if (flags & (MAP_LOCKED | MAP_LOCKONFAULT))
if (!can_do_mlock())
return -EPERM;
@@ -1570,8 +1583,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
/*
* Can we just expand an old mapping?
*/
- vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff,
- NULL);
+ vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
+ NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
if (vma)
goto out;
@@ -1646,12 +1659,12 @@ out:
perf_event_mmap(vma);
vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
- if (vm_flags & VM_LOCKED) {
+ if (vm_flags & (VM_LOCKED | VM_LOCKONFAULT)) {
if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current->mm)))
mm->locked_vm += (len >> PAGE_SHIFT);
else
- vma->vm_flags &= ~VM_LOCKED;
+ vma->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
}
if (file)
@@ -2104,7 +2117,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
return -ENOMEM;
/* mlock limit tests */
- if (vma->vm_flags & VM_LOCKED) {
+ if (vma->vm_flags & (VM_LOCKED | VM_LOCKONFAULT)) {
unsigned long locked;
unsigned long limit;
locked = mm->locked_vm + grow;
@@ -2128,7 +2141,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
return -ENOMEM;
/* Ok, everything looks good - let it rip */
- if (vma->vm_flags & VM_LOCKED)
+ if (vma->vm_flags & (VM_LOCKED | VM_LOCKONFAULT))
mm->locked_vm += grow;
vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
return 0;
@@ -2583,7 +2596,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
if (mm->locked_vm) {
struct vm_area_struct *tmp = vma;
while (tmp && tmp->vm_start < end) {
- if (tmp->vm_flags & VM_LOCKED) {
+ if (tmp->vm_flags & (VM_LOCKED | VM_LOCKONFAULT)) {
mm->locked_vm -= vma_pages(tmp);
munlock_vma_pages_all(tmp);
}
@@ -2636,6 +2649,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
unsigned long populate = 0;
unsigned long ret = -EINVAL;
struct file *file;
+ vm_flags_t drop_lock_flag = 0;
pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. "
"See Documentation/vm/remap_file_pages.txt.\n",
@@ -2675,10 +2689,18 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
if (vma->vm_flags & VM_LOCKED) {
flags |= MAP_LOCKED;
- /* drop PG_Mlocked flag for over-mapped range */
- munlock_vma_pages_range(vma, start, start + size);
+ drop_lock_flag = VM_LOCKED;
+ } else if (vma->vm_flags & VM_LOCKONFAULT) {
+ flags |= MAP_LOCKONFAULT;
+ drop_lock_flag = VM_LOCKONFAULT;
}
+
+ if (drop_lock_flag)
+ /* drop PG_Mlocked flag for over-mapped range */
+ munlock_vma_pages_range(vma, start, start + size,
+ drop_lock_flag);
+
file = get_file(vma->vm_file);
ret = do_mmap_pgoff(vma->vm_file, start, size,
prot, flags, pgoff, &populate);
@@ -2757,7 +2779,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
/* Can we just expand an old private anonymous mapping? */
vma = vma_merge(mm, prev, addr, addr + len, flags,
- NULL, NULL, pgoff, NULL);
+ NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
if (vma)
goto out;
@@ -2781,7 +2803,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
out:
perf_event_mmap(vma);
mm->total_vm += len >> PAGE_SHIFT;
- if (flags & VM_LOCKED)
+ if (flags & (VM_LOCKED | VM_LOCKONFAULT))
mm->locked_vm += (len >> PAGE_SHIFT);
vma->vm_flags |= VM_SOFTDIRTY;
return addr;
@@ -2816,7 +2838,7 @@ void exit_mmap(struct mm_struct *mm)
if (mm->locked_vm) {
vma = mm->mmap;
while (vma) {
- if (vma->vm_flags & VM_LOCKED)
+ if (vma->vm_flags & (VM_LOCKED | VM_LOCKONFAULT))
munlock_vma_pages_all(vma);
vma = vma->vm_next;
}
@@ -2871,7 +2893,7 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
* using the existing file pgoff checks and manipulations.
* Similarly in do_mmap_pgoff and in do_brk.
*/
- if (!vma->vm_file) {
+ if (vma_is_anonymous(vma)) {
BUG_ON(vma->anon_vma);
vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
}
@@ -2905,7 +2927,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
* If anonymous vma has not yet been faulted, update new pgoff
* to match new location, to increase its chance of merging.
*/
- if (unlikely(!vma->vm_file && !vma->anon_vma)) {
+ if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
pgoff = addr >> PAGE_SHIFT;
faulted_in_anon_vma = false;
}
@@ -2913,7 +2935,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
return NULL; /* should never get here */
new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
- vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
+ vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
+ vma->vm_userfaultfd_ctx);
if (new_vma) {
/*
* Source vma may have been merged into new_vma
@@ -3013,21 +3036,13 @@ static int special_mapping_fault(struct vm_area_struct *vma,
pgoff_t pgoff;
struct page **pages;
- /*
- * special mappings have no vm_file, and in that case, the mm
- * uses vm_pgoff internally. So we have to subtract it from here.
- * We are allowed to do this because we are the mm; do not copy
- * this code into drivers!
- */
- pgoff = vmf->pgoff - vma->vm_pgoff;
-
if (vma->vm_ops == &legacy_special_mapping_vmops)
pages = vma->vm_private_data;
else
pages = ((struct vm_special_mapping *)vma->vm_private_data)->
pages;
- for (; pgoff && *pages; ++pages)
+ for (pgoff = vmf->pgoff; pgoff && *pages; ++pages)
pgoff--;
if (*pages) {
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 3b9b3d0741b2..5fbdd367bbed 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -123,6 +123,23 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
return young;
}
+/*
+ * Invoke ->clear_young() for the range @start..@end on every notifier
+ * registered on @mm that implements the callback. The notifier list is
+ * walked inside an SRCU read-side section.
+ *
+ * Returns the bitwise OR of the callbacks' return values (0 if no notifier
+ * implements ->clear_young()).
+ */
+int __mmu_notifier_clear_young(struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ struct mmu_notifier *mn;
+ int young = 0, id;
+
+ id = srcu_read_lock(&srcu);
+ hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+ /* ->clear_young is optional; skip notifiers that lack it */
+ if (mn->ops->clear_young)
+ young |= mn->ops->clear_young(mn, mm, start, end);
+ }
+ srcu_read_unlock(&srcu, id);
+
+ return young;
+}
+
int __mmu_notifier_test_young(struct mm_struct *mm,
unsigned long address)
{
diff --git a/mm/mprotect.c b/mm/mprotect.c
index e7d6f1171ecb..ef5be8eaab00 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -292,7 +292,8 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
*/
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*pprev = vma_merge(mm, *pprev, start, end, newflags,
- vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
+ vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
+ vma->vm_userfaultfd_ctx);
if (*pprev) {
vma = *pprev;
goto success;
diff --git a/mm/mremap.c b/mm/mremap.c
index a7c93eceb1c8..6769cb228953 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -276,6 +276,12 @@ static unsigned long move_vma(struct vm_area_struct *vma,
moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
need_rmap_locks);
if (moved_len < old_len) {
+ err = -ENOMEM;
+ } else if (vma->vm_ops && vma->vm_ops->mremap) {
+ err = vma->vm_ops->mremap(new_vma);
+ }
+
+ if (unlikely(err)) {
/*
* On error, move entries back from new area to old,
* which will succeed since page tables still there,
@@ -286,16 +292,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
vma = new_vma;
old_len = new_len;
old_addr = new_addr;
- new_addr = -ENOMEM;
+ new_addr = err;
} else {
- if (vma->vm_file && vma->vm_file->f_op->mremap) {
- err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma);
- if (err < 0) {
- move_page_tables(new_vma, new_addr, vma,
- old_addr, moved_len, true);
- return err;
- }
- }
arch_remap(mm, old_addr, old_addr + old_len,
new_addr, new_addr + new_len);
}
@@ -335,7 +333,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
vma->vm_next->vm_flags |= VM_ACCOUNT;
}
- if (vm_flags & VM_LOCKED) {
+ if (vm_flags & (VM_LOCKED | VM_LOCKONFAULT)) {
mm->locked_vm += new_len >> PAGE_SHIFT;
*locked = true;
}
@@ -348,6 +346,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = find_vma(mm, addr);
+ unsigned long pgoff;
if (!vma || vma->vm_start > addr)
return ERR_PTR(-EFAULT);
@@ -359,19 +358,19 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
if (old_len > vma->vm_end - addr)
return ERR_PTR(-EFAULT);
+ if (new_len == old_len)
+ return vma;
+
/* Need to be careful about a growing mapping */
- if (new_len > old_len) {
- unsigned long pgoff;
-
- if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
- return ERR_PTR(-EFAULT);
- pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
- pgoff += vma->vm_pgoff;
- if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
- return ERR_PTR(-EINVAL);
- }
+ pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
+ pgoff += vma->vm_pgoff;
+ if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
+ return ERR_PTR(-EINVAL);
- if (vma->vm_flags & VM_LOCKED) {
+ if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
+ return ERR_PTR(-EFAULT);
+
+ if (vma->vm_flags & (VM_LOCKED | VM_LOCKONFAULT)) {
unsigned long locked, lock_limit;
locked = mm->locked_vm << PAGE_SHIFT;
lock_limit = rlimit(RLIMIT_MEMLOCK);
@@ -408,13 +407,8 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
goto out;
- /* Check if the location we're moving into overlaps the
- * old location at all, and fail if it does.
- */
- if ((new_addr <= addr) && (new_addr+new_len) > addr)
- goto out;
-
- if ((addr <= new_addr) && (addr+old_len) > new_addr)
+ /* Ensure the old/new locations do not overlap */
+ if (addr + old_len > new_addr && new_addr + new_len > addr)
goto out;
ret = do_munmap(mm, new_addr, new_len);
@@ -548,7 +542,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
}
vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
- if (vma->vm_flags & VM_LOCKED) {
+ if (vma->vm_flags & (VM_LOCKED | VM_LOCKONFAULT)) {
mm->locked_vm += pages;
locked = true;
new_addr = addr;
@@ -580,8 +574,10 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked);
}
out:
- if (ret & ~PAGE_MASK)
+ if (ret & ~PAGE_MASK) {
vm_unacct_memory(charged);
+ locked = 0;
+ }
up_write(&current->mm->mmap_sem);
if (locked && new_len > old_len)
mm_populate(new_addr + old_len, new_len - old_len);
diff --git a/mm/msync.c b/mm/msync.c
index bb04d53ae852..fd585e93a423 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -73,7 +73,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
}
/* Here vma->vm_start <= start < vma->vm_end. */
if ((flags & MS_INVALIDATE) &&
- (vma->vm_flags & VM_LOCKED)) {
+ (vma->vm_flags & (VM_LOCKED | VM_LOCKONFAULT))) {
error = -EBUSY;
goto out_unlock;
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index dff991e0681e..1ecc0bcaecc5 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -196,27 +196,26 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
* Determine the type of allocation constraint.
*/
#ifdef CONFIG_NUMA
-static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
- gfp_t gfp_mask, nodemask_t *nodemask,
- unsigned long *totalpages)
+static enum oom_constraint constrained_alloc(struct oom_control *oc,
+ unsigned long *totalpages)
{
struct zone *zone;
struct zoneref *z;
- enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+ enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask);
bool cpuset_limited = false;
int nid;
/* Default to all available memory */
*totalpages = totalram_pages + total_swap_pages;
- if (!zonelist)
+ if (!oc->zonelist)
return CONSTRAINT_NONE;
/*
* Reach here only when __GFP_NOFAIL is used. So, we should avoid
* to kill current.We have to random task kill in this case.
* Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
*/
- if (gfp_mask & __GFP_THISNODE)
+ if (oc->gfp_mask & __GFP_THISNODE)
return CONSTRAINT_NONE;
/*
@@ -224,17 +223,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
* the page allocator means a mempolicy is in effect. Cpuset policy
* is enforced in get_page_from_freelist().
*/
- if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) {
+ if (oc->nodemask &&
+ !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
*totalpages = total_swap_pages;
- for_each_node_mask(nid, *nodemask)
+ for_each_node_mask(nid, *oc->nodemask)
*totalpages += node_spanned_pages(nid);
return CONSTRAINT_MEMORY_POLICY;
}
/* Check this allocation failure is caused by cpuset's wall function */
- for_each_zone_zonelist_nodemask(zone, z, zonelist,
- high_zoneidx, nodemask)
- if (!cpuset_zone_allowed(zone, gfp_mask))
+ for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
+ high_zoneidx, oc->nodemask)
+ if (!cpuset_zone_allowed(zone, oc->gfp_mask))
cpuset_limited = true;
if (cpuset_limited) {
@@ -246,20 +246,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
return CONSTRAINT_NONE;
}
#else
-static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
- gfp_t gfp_mask, nodemask_t *nodemask,
- unsigned long *totalpages)
+static enum oom_constraint constrained_alloc(struct oom_control *oc,
+ unsigned long *totalpages)
{
*totalpages = totalram_pages + total_swap_pages;
return CONSTRAINT_NONE;
}
#endif
-enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
- unsigned long totalpages, const nodemask_t *nodemask,
- bool force_kill)
+enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
+ struct task_struct *task, unsigned long totalpages)
{
- if (oom_unkillable_task(task, NULL, nodemask))
+ if (oom_unkillable_task(task, NULL, oc->nodemask))
return OOM_SCAN_CONTINUE;
/*
@@ -267,7 +265,7 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
* Don't allow any other task to have access to the reserves.
*/
if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
- if (!force_kill)
+ if (oc->order != -1)
return OOM_SCAN_ABORT;
}
if (!task->mm)
@@ -280,7 +278,7 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
if (oom_task_origin(task))
return OOM_SCAN_SELECT;
- if (task_will_free_mem(task) && !force_kill)
+ if (task_will_free_mem(task) && oc->order != -1)
return OOM_SCAN_ABORT;
return OOM_SCAN_OK;
@@ -289,12 +287,9 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
/*
* Simple selection loop. We chose the process with the highest
* number of 'points'. Returns -1 on scan abort.
- *
- * (not docbooked, we don't want this one cluttering up the manual)
*/
-static struct task_struct *select_bad_process(unsigned int *ppoints,
- unsigned long totalpages, const nodemask_t *nodemask,
- bool force_kill)
+static struct task_struct *select_bad_process(struct oom_control *oc,
+ unsigned int *ppoints, unsigned long totalpages)
{
struct task_struct *g, *p;
struct task_struct *chosen = NULL;
@@ -304,8 +299,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
for_each_process_thread(g, p) {
unsigned int points;
- switch (oom_scan_process_thread(p, totalpages, nodemask,
- force_kill)) {
+ switch (oom_scan_process_thread(oc, p, totalpages)) {
case OOM_SCAN_SELECT:
chosen = p;
chosen_points = ULONG_MAX;
@@ -318,7 +312,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
case OOM_SCAN_OK:
break;
};
- points = oom_badness(p, NULL, nodemask, totalpages);
+ points = oom_badness(p, NULL, oc->nodemask, totalpages);
if (!points || points < chosen_points)
continue;
/* Prefer thread group leaders for display purposes */
@@ -380,13 +374,13 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
rcu_read_unlock();
}
-static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
- struct mem_cgroup *memcg, const nodemask_t *nodemask)
+static void dump_header(struct oom_control *oc, struct task_struct *p,
+ struct mem_cgroup *memcg)
{
task_lock(current);
pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
"oom_score_adj=%hd\n",
- current->comm, gfp_mask, order,
+ current->comm, oc->gfp_mask, oc->order,
current->signal->oom_score_adj);
cpuset_print_task_mems_allowed(current);
task_unlock(current);
@@ -396,7 +390,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
else
show_mem(SHOW_MEM_FILTER_NODES);
if (sysctl_oom_dump_tasks)
- dump_tasks(memcg, nodemask);
+ dump_tasks(memcg, oc->nodemask);
}
/*
@@ -487,10 +481,9 @@ void oom_killer_enable(void)
* Must be called while holding a reference to p, which will be released upon
* returning.
*/
-void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
+void oom_kill_process(struct oom_control *oc, struct task_struct *p,
unsigned int points, unsigned long totalpages,
- struct mem_cgroup *memcg, nodemask_t *nodemask,
- const char *message)
+ struct mem_cgroup *memcg, const char *message)
{
struct task_struct *victim = p;
struct task_struct *child;
@@ -514,7 +507,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
task_unlock(p);
if (__ratelimit(&oom_rs))
- dump_header(p, gfp_mask, order, memcg, nodemask);
+ dump_header(oc, p, memcg);
task_lock(p);
pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
@@ -537,7 +530,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
/*
* oom_badness() returns 0 if the thread is unkillable
*/
- child_points = oom_badness(child, memcg, nodemask,
+ child_points = oom_badness(child, memcg, oc->nodemask,
totalpages);
if (child_points > victim_points) {
put_task_struct(victim);
@@ -600,8 +593,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
/*
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
*/
-void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
- int order, const nodemask_t *nodemask,
+void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
struct mem_cgroup *memcg)
{
if (likely(!sysctl_panic_on_oom))
@@ -615,7 +607,10 @@ void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
if (constraint != CONSTRAINT_NONE)
return;
}
- dump_header(NULL, gfp_mask, order, memcg, nodemask);
+ /* Do not panic for oom kills triggered by sysrq */
+ if (oc->order == -1)
+ return;
+ dump_header(oc, NULL, memcg);
panic("Out of memory: %s panic_on_oom is enabled\n",
sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
}
@@ -635,28 +630,21 @@ int unregister_oom_notifier(struct notifier_block *nb)
EXPORT_SYMBOL_GPL(unregister_oom_notifier);
/**
- * __out_of_memory - kill the "best" process when we run out of memory
- * @zonelist: zonelist pointer
- * @gfp_mask: memory allocation flags
- * @order: amount of memory being requested as a power of 2
- * @nodemask: nodemask passed to page allocator
- * @force_kill: true if a task must be killed, even if others are exiting
+ * out_of_memory - kill the "best" process when we run out of memory
+ * @oc: pointer to struct oom_control
*
* If we run out of memory, we have the choice between either
* killing a random task (bad), letting the system crash (worse)
* OR try to be smart about which process to kill. Note that we
* don't have to be perfect here, we just have to be good.
*/
-bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
- int order, nodemask_t *nodemask, bool force_kill)
+bool out_of_memory(struct oom_control *oc)
{
- const nodemask_t *mpol_mask;
struct task_struct *p;
unsigned long totalpages;
unsigned long freed = 0;
unsigned int uninitialized_var(points);
enum oom_constraint constraint = CONSTRAINT_NONE;
- int killed = 0;
if (oom_killer_disabled)
return false;
@@ -664,7 +652,7 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
if (freed > 0)
/* Got some memory back in the last second. */
- goto out;
+ return true;
/*
* If current has a pending SIGKILL or is exiting, then automatically
@@ -677,47 +665,42 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
if (current->mm &&
(fatal_signal_pending(current) || task_will_free_mem(current))) {
mark_oom_victim(current);
- goto out;
+ return true;
}
/*
* Check if there were limitations on the allocation (only relevant for
* NUMA) that may require different handling.
*/
- constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
- &totalpages);
- mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
- check_panic_on_oom(constraint, gfp_mask, order, mpol_mask, NULL);
+ constraint = constrained_alloc(oc, &totalpages);
+ if (constraint != CONSTRAINT_MEMORY_POLICY)
+ oc->nodemask = NULL;
+ check_panic_on_oom(oc, constraint, NULL);
if (sysctl_oom_kill_allocating_task && current->mm &&
- !oom_unkillable_task(current, NULL, nodemask) &&
+ !oom_unkillable_task(current, NULL, oc->nodemask) &&
current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
get_task_struct(current);
- oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
- nodemask,
+ oom_kill_process(oc, current, 0, totalpages, NULL,
"Out of memory (oom_kill_allocating_task)");
- goto out;
+ return true;
}
- p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
+ p = select_bad_process(oc, &points, totalpages);
/* Found nothing?!?! Either we hang forever, or we panic. */
- if (!p) {
- dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
+ if (!p && oc->order != -1) {
+ dump_header(oc, NULL, NULL);
panic("Out of memory and no killable processes...\n");
}
- if (p != (void *)-1UL) {
- oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
- nodemask, "Out of memory");
- killed = 1;
- }
-out:
- /*
- * Give the killed threads a good chance of exiting before trying to
- * allocate memory again.
- */
- if (killed)
+ if (p && p != (void *)-1UL) {
+ oom_kill_process(oc, p, points, totalpages, NULL,
+ "Out of memory");
+ /*
+ * Give the killed process a good chance to exit before trying
+ * to allocate memory again.
+ */
schedule_timeout_killable(1);
-
+ }
return true;
}
@@ -728,13 +711,20 @@ out:
*/
void pagefault_out_of_memory(void)
{
+ struct oom_control oc = {
+ .zonelist = NULL,
+ .nodemask = NULL,
+ .gfp_mask = 0,
+ .order = 0,
+ };
+
if (mem_cgroup_oom_synchronize(true))
return;
if (!mutex_trylock(&oom_lock))
return;
- if (!out_of_memory(NULL, 0, 0, NULL, false)) {
+ if (!out_of_memory(&oc)) {
/*
* There shouldn't be any user tasks runnable while the
* OOM killer is disabled, so the current task has to
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ef19f22b2b7d..2024d2edc664 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -18,7 +18,6 @@
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
-#include <linux/rwsem.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/bootmem.h>
@@ -126,6 +125,24 @@ unsigned long dirty_balance_reserve __read_mostly;
int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
+/*
+ * A cached value of the page's pageblock's migratetype, used when the page is
+ * put on a pcplist. Used to avoid the pageblock migratetype lookup when
+ * freeing from pcplists in most cases, at the cost of possibly becoming stale.
+ * Also the migratetype set in the page does not necessarily match the pcplist
+ * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
+ * other index - this ensures that it will be put on the correct CMA freelist.
+ */
+static inline int get_pcppage_migratetype(struct page *page)
+{
+ return page->index;
+}
+
+static inline void set_pcppage_migratetype(struct page *page, int migratetype)
+{
+ page->index = migratetype;
+}
+
#ifdef CONFIG_PM_SLEEP
/*
* The following functions are used by the suspend/hibernate code to temporarily
@@ -444,6 +461,7 @@ void prep_compound_page(struct page *page, unsigned long order)
for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
set_page_count(p, 0);
+ p->mapping = TAIL_MAPPING;
p->first_page = page;
/* Make sure p->first_page is always valid for PageTail() */
smp_wmb();
@@ -789,7 +807,11 @@ static void free_pcppages_bulk(struct zone *zone, int count,
page = list_entry(list->prev, struct page, lru);
/* must delete as __free_one_page list manipulates */
list_del(&page->lru);
- mt = get_freepage_migratetype(page);
+
+ mt = get_pcppage_migratetype(page);
+ /* MIGRATE_ISOLATE page should not go to pcplists */
+ VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
+ /* Pageblock could have been isolated meanwhile */
if (unlikely(has_isolate_pageblock(zone)))
mt = get_pageblock_migratetype(page);
@@ -822,6 +844,12 @@ static void free_one_page(struct zone *zone,
static int free_tail_pages_check(struct page *head_page, struct page *page)
{
+ if (page->mapping != TAIL_MAPPING) {
+ bad_page(page, "corrupted mapping in tail page", 0);
+ page->mapping = NULL;
+ return 1;
+ }
+ page->mapping = NULL;
if (!IS_ENABLED(CONFIG_DEBUG_VM))
return 0;
if (unlikely(!PageTail(page))) {
@@ -953,7 +981,6 @@ static void __free_pages_ok(struct page *page, unsigned int order)
migratetype = get_pfnblock_migratetype(page, pfn);
local_irq_save(flags);
__count_vm_events(PGFREE, 1 << order);
- set_freepage_migratetype(page, migratetype);
free_one_page(page_zone(page), page, pfn, order, migratetype);
local_irq_restore(flags);
}
@@ -981,21 +1008,21 @@ static void __init __free_pages_boot_core(struct page *page,
#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \
defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
-/* Only safe to use early in boot when initialisation is single-threaded */
+
static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
int __meminit early_pfn_to_nid(unsigned long pfn)
{
+ static DEFINE_SPINLOCK(early_pfn_lock);
int nid;
- /* The system will behave unpredictably otherwise */
- BUG_ON(system_state != SYSTEM_BOOTING);
-
+ spin_lock(&early_pfn_lock);
nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
- if (nid >= 0)
- return nid;
- /* just returns 0 */
- return 0;
+ if (nid < 0)
+ nid = 0;
+ spin_unlock(&early_pfn_lock);
+
+ return nid;
}
#endif
@@ -1060,7 +1087,15 @@ static void __init deferred_free_range(struct page *page,
__free_pages_boot_core(page, pfn, 0);
}
-static __initdata DECLARE_RWSEM(pgdat_init_rwsem);
+/* Completion tracking for deferred_init_memmap() threads */
+static atomic_t pgdat_init_n_undone __initdata;
+static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
+
+static inline void __init pgdat_init_report_one_done(void)
+{
+ if (atomic_dec_and_test(&pgdat_init_n_undone))
+ complete(&pgdat_init_all_done_comp);
+}
/* Initialise remaining memory on a node */
static int __init deferred_init_memmap(void *data)
@@ -1077,7 +1112,7 @@ static int __init deferred_init_memmap(void *data)
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
if (first_init_pfn == ULONG_MAX) {
- up_read(&pgdat_init_rwsem);
+ pgdat_init_report_one_done();
return 0;
}
@@ -1177,7 +1212,8 @@ free_range:
pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
jiffies_to_msecs(jiffies - start));
- up_read(&pgdat_init_rwsem);
+
+ pgdat_init_report_one_done();
return 0;
}
@@ -1185,14 +1221,17 @@ void __init page_alloc_init_late(void)
{
int nid;
+ /* There will be num_node_state(N_MEMORY) threads */
+ atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
for_each_node_state(nid, N_MEMORY) {
- down_read(&pgdat_init_rwsem);
kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
}
/* Block until all are initialised */
- down_write(&pgdat_init_rwsem);
- up_write(&pgdat_init_rwsem);
+ wait_for_completion(&pgdat_init_all_done_comp);
+
+ /* Reinit limits that are based on free pages after the kernel is up */
+ files_maxfiles_init();
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
@@ -1285,6 +1324,10 @@ static inline int check_new_page(struct page *page)
bad_reason = "non-NULL mapping";
if (unlikely(atomic_read(&page->_count) != 0))
bad_reason = "nonzero _count";
+ if (unlikely(page->flags & __PG_HWPOISON)) {
+ bad_reason = "HWPoisoned (hardware-corrupted)";
+ bad_flags = __PG_HWPOISON;
+ }
if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
@@ -1362,7 +1405,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
rmv_page_order(page);
area->nr_free--;
expand(zone, page, order, current_order, area, migratetype);
- set_freepage_migratetype(page, migratetype);
+ set_pcppage_migratetype(page, migratetype);
return page;
}
@@ -1439,7 +1482,6 @@ int move_freepages(struct zone *zone,
order = page_order(page);
list_move(&page->lru,
&zone->free_area[order].free_list[migratetype]);
- set_freepage_migratetype(page, migratetype);
page += 1 << order;
pages_moved += 1 << order;
}
@@ -1609,14 +1651,13 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
expand(zone, page, order, current_order, area,
start_migratetype);
/*
- * The freepage_migratetype may differ from pageblock's
+ * The pcppage_migratetype may differ from pageblock's
* migratetype depending on the decisions in
- * try_to_steal_freepages(). This is OK as long as it
- * does not differ for MIGRATE_CMA pageblocks. For CMA
- * we need to make sure unallocated pages flushed from
- * pcp lists are returned to the correct freelist.
+ * find_suitable_fallback(). This is OK as long as it does not
+ * differ for MIGRATE_CMA pageblocks. Those can be used as
+ * fallback only via special __rmqueue_cma_fallback() function
*/
- set_freepage_migratetype(page, start_migratetype);
+ set_pcppage_migratetype(page, start_migratetype);
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, fallback_mt);
@@ -1692,7 +1733,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
else
list_add_tail(&page->lru, list);
list = &page->lru;
- if (is_migrate_cma(get_freepage_migratetype(page)))
+ if (is_migrate_cma(get_pcppage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
-(1 << order));
}
@@ -1889,7 +1930,7 @@ void free_hot_cold_page(struct page *page, bool cold)
return;
migratetype = get_pfnblock_migratetype(page, pfn);
- set_freepage_migratetype(page, migratetype);
+ set_pcppage_migratetype(page, migratetype);
local_irq_save(flags);
__count_vm_event(PGFREE);
@@ -2094,7 +2135,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
if (!page)
goto failed;
__mod_zone_freepage_state(zone, -(1 << order),
- get_freepage_migratetype(page));
+ get_pcppage_migratetype(page));
}
__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
@@ -2675,6 +2716,12 @@ static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
const struct alloc_context *ac, unsigned long *did_some_progress)
{
+ struct oom_control oc = {
+ .zonelist = ac->zonelist,
+ .nodemask = ac->nodemask,
+ .gfp_mask = gfp_mask,
+ .order = order,
+ };
struct page *page;
*did_some_progress = 0;
@@ -2726,8 +2773,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
goto out;
}
/* Exhausted what can be done so it's blamo time */
- if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)
- || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
+ if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
*did_some_progress = 1;
out:
mutex_unlock(&oom_lock);
@@ -5277,8 +5323,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
*
* NOTE: pgdat should get zeroed by caller.
*/
-static void __paginginit free_area_init_core(struct pglist_data *pgdat,
- unsigned long node_start_pfn, unsigned long node_end_pfn)
+static void __paginginit free_area_init_core(struct pglist_data *pgdat)
{
enum zone_type j;
int nid = pgdat->node_id;
@@ -5441,7 +5486,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
(unsigned long)pgdat->node_mem_map);
#endif
- free_area_init_core(pgdat, start_pfn, end_pfn);
+ free_area_init_core(pgdat);
}
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -5452,11 +5497,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
*/
void __init setup_nr_node_ids(void)
{
- unsigned int node;
- unsigned int highest = 0;
+ unsigned int highest;
- for_each_node_mask(node, node_possible_map)
- highest = node;
+ highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
nr_node_ids = highest + 1;
}
#endif
@@ -6397,7 +6440,7 @@ static int __init set_hashdist(char *str)
{
if (!str)
return 0;
- hashdist = simple_strtoul(str, &str, 0);
+ parse_integer(str, 0, (unsigned int *)&hashdist);
return 1;
}
__setup("hashdist=", set_hashdist);
diff --git a/mm/page_ext.c b/mm/page_ext.c
index d86fd2f5353f..e4b3af054bf2 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -59,6 +59,9 @@ static struct page_ext_operations *page_ext_ops[] = {
#ifdef CONFIG_PAGE_OWNER
&page_owner_ops,
#endif
+#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
+ &page_idle_ops,
+#endif
};
static unsigned long total_usage;
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 303c908790ef..9eaa489cb456 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -178,8 +178,11 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
undo:
for (pfn = start_pfn;
pfn < undo_pfn;
- pfn += pageblock_nr_pages)
- unset_migratetype_isolate(pfn_to_page(pfn), migratetype);
+ pfn += pageblock_nr_pages) {
+ page = __first_valid_page(pfn, pageblock_nr_pages);
+ if (page)
+ unset_migratetype_isolate(page, migratetype);
+ }
return -EBUSY;
}
@@ -223,34 +226,16 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
continue;
}
page = pfn_to_page(pfn);
- if (PageBuddy(page)) {
+ if (PageBuddy(page))
/*
- * If race between isolatation and allocation happens,
- * some free pages could be in MIGRATE_MOVABLE list
- * although pageblock's migratation type of the page
- * is MIGRATE_ISOLATE. Catch it and move the page into
- * MIGRATE_ISOLATE list.
+ * If the page is on a free list, it has to be on
+ * the correct MIGRATE_ISOLATE freelist. There is no
+ * simple way to verify that as VM_BUG_ON(), though.
*/
- if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) {
- struct page *end_page;
-
- end_page = page + (1 << page_order(page)) - 1;
- move_freepages(page_zone(page), page, end_page,
- MIGRATE_ISOLATE);
- }
pfn += 1 << page_order(page);
- }
- else if (page_count(page) == 0 &&
- get_freepage_migratetype(page) == MIGRATE_ISOLATE)
- pfn += 1;
- else if (skip_hwpoisoned_pages && PageHWPoison(page)) {
- /*
- * The HWPoisoned page may be not in buddy
- * system, and page_count() is not 0.
- */
+ else if (skip_hwpoisoned_pages && PageHWPoison(page))
+ /* A HWPoisoned page cannot also be PageBuddy */
pfn++;
- continue;
- }
else
break;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index 171b68768df1..4a5bb34676fc 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -62,6 +62,8 @@
#include <asm/tlbflush.h>
+#include <trace/events/tlb.h>
+
#include "internal.h"
static struct kmem_cache *anon_vma_cachep;
@@ -583,6 +585,107 @@ vma_address(struct page *page, struct vm_area_struct *vma)
return address;
}
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+static void percpu_flush_tlb_batch_pages(void *data)
+{
+ /*
+ * All TLB entries are flushed on the assumption that it is
+ * cheaper to flush all TLBs and let them be refilled than
+ * flushing individual PFNs. Note that we do not track mm's
+ * to flush as that might simply be multiple full TLB flushes
+ * for no gain.
+ */
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+ flush_tlb_local();
+}
+
+/*
+ * Flush TLB entries for recently unmapped pages from remote CPUs. It is
+ * important if a PTE was dirty when it was unmapped that it's flushed
+ * before any IO is initiated on the page to prevent lost writes. Similarly,
+ * it must be flushed before freeing to prevent data leakage.
+ */
+void try_to_unmap_flush(void)
+{
+ struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+ int cpu;
+
+ if (!tlb_ubc->flush_required)
+ return;
+
+ cpu = get_cpu();
+
+ trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, -1UL);
+
+ if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask))
+ percpu_flush_tlb_batch_pages(&tlb_ubc->cpumask);
+
+ if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids) {
+ smp_call_function_many(&tlb_ubc->cpumask,
+ percpu_flush_tlb_batch_pages, (void *)tlb_ubc, true);
+ }
+ cpumask_clear(&tlb_ubc->cpumask);
+ tlb_ubc->flush_required = false;
+ tlb_ubc->writable = false;
+ put_cpu();
+}
+
+/* Flush iff there are potentially writable TLB entries that can race with IO */
+void try_to_unmap_flush_dirty(void)
+{
+ struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+
+ if (tlb_ubc->writable)
+ try_to_unmap_flush();
+}
+
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
+ struct page *page, bool writable)
+{
+ struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+
+ cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm));
+ tlb_ubc->flush_required = true;
+
+ /*
+ * If the PTE was dirty then it's best to assume it's writable. The
+ * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
+ * before the page is queued for IO.
+ */
+ if (writable)
+ tlb_ubc->writable = true;
+}
+
+/*
+ * Returns true if the TLB flush should be deferred to the end of a batch of
+ * unmap operations to reduce IPIs.
+ */
+static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
+{
+ bool should_defer = false;
+
+ if (!(flags & TTU_BATCH_FLUSH))
+ return false;
+
+ /* If remote CPUs need to be flushed then batch and defer the flush */
+ if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
+ should_defer = true;
+ put_cpu();
+
+ return should_defer;
+}
+#else
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
+ struct page *page, bool writable)
+{
+}
+
+static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
+{
+ return false;
+}
+#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
+
/*
* At what user virtual address is page expected in vma?
* Caller should check the page is actually part of the vma.
@@ -714,6 +817,7 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
}
struct page_referenced_arg {
+ int dirtied;
int mapcount;
int referenced;
unsigned long vm_flags;
@@ -728,6 +832,7 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
int referenced = 0;
+ int dirty = 0;
struct page_referenced_arg *pra = arg;
if (unlikely(PageTransHuge(page))) {
@@ -742,15 +847,25 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
if (!pmd)
return SWAP_AGAIN;
- if (vma->vm_flags & VM_LOCKED) {
+ if (vma->vm_flags & (VM_LOCKED | VM_LOCKONFAULT)) {
spin_unlock(ptl);
- pra->vm_flags |= VM_LOCKED;
+ pra->vm_flags |= (vma->vm_flags &
+ (VM_LOCKED | VM_LOCKONFAULT));
return SWAP_FAIL; /* To break the loop */
}
/* go ahead even if the pmd is pmd_trans_splitting() */
if (pmdp_clear_flush_young_notify(vma, address, pmd))
referenced++;
+
+ /*
+ * Use pmd_freeable instead of raw pmd_dirty because on some
+ * architectures pmd_dirty is not defined unless
+ * CONFIG_TRANSPARENT_HUGEPAGE is enabled.
+ */
+ if (!pmd_freeable(*pmd))
+ dirty++;
+
spin_unlock(ptl);
} else {
pte_t *pte;
@@ -763,9 +878,10 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
if (!pte)
return SWAP_AGAIN;
- if (vma->vm_flags & VM_LOCKED) {
+ if (vma->vm_flags & (VM_LOCKED | VM_LOCKONFAULT)) {
pte_unmap_unlock(pte, ptl);
- pra->vm_flags |= VM_LOCKED;
+ pra->vm_flags |= (vma->vm_flags &
+ (VM_LOCKED | VM_LOCKONFAULT));
return SWAP_FAIL; /* To break the loop */
}
@@ -780,14 +896,26 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
if (likely(!(vma->vm_flags & VM_SEQ_READ)))
referenced++;
}
+
+ if (pte_dirty(*pte))
+ dirty++;
+
pte_unmap_unlock(pte, ptl);
}
+ if (referenced)
+ clear_page_idle(page);
+ if (test_and_clear_page_young(page))
+ referenced++;
+
if (referenced) {
pra->referenced++;
pra->vm_flags |= vma->vm_flags;
}
+ if (dirty)
+ pra->dirtied++;
+
pra->mapcount--;
if (!pra->mapcount)
return SWAP_SUCCESS; /* To break the loop */
@@ -812,6 +940,7 @@ static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
* @is_locked: caller holds lock on the page
* @memcg: target memory cgroup
* @vm_flags: collect encountered vma->vm_flags who actually referenced the page
+ * @is_pte_dirty: if non-NULL, set to the number of dirty mappings (lazyfree)
*
* Quick test_and_clear_referenced for all mappings to a page,
* returns the number of ptes which referenced the page.
@@ -819,7 +948,8 @@ static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
int page_referenced(struct page *page,
int is_locked,
struct mem_cgroup *memcg,
- unsigned long *vm_flags)
+ unsigned long *vm_flags,
+ int *is_pte_dirty)
{
int ret;
int we_locked = 0;
@@ -834,6 +964,9 @@ int page_referenced(struct page *page,
};
*vm_flags = 0;
+ if (is_pte_dirty)
+ *is_pte_dirty = 0;
+
if (!page_mapped(page))
return 0;
@@ -861,6 +994,9 @@ int page_referenced(struct page *page,
if (we_locked)
unlock_page(page);
+ if (is_pte_dirty)
+ *is_pte_dirty = pra.dirtied;
+
return pra.referenced;
}
@@ -1194,6 +1330,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
spinlock_t *ptl;
int ret = SWAP_AGAIN;
enum ttu_flags flags = (enum ttu_flags)arg;
+ int dirty = 0;
pte = page_check_address(page, mm, address, &ptl, 0);
if (!pte)
@@ -1205,7 +1342,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* skipped over this mm) then we should reactivate it.
*/
if (!(flags & TTU_IGNORE_MLOCK)) {
- if (vma->vm_flags & VM_LOCKED)
+ if (vma->vm_flags & (VM_LOCKED | VM_LOCKONFAULT))
goto out_mlock;
if (flags & TTU_MUNLOCK)
@@ -1220,10 +1357,24 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
/* Nuke the page table entry. */
flush_cache_page(vma, address, page_to_pfn(page));
- pteval = ptep_clear_flush(vma, address, pte);
+ if (should_defer_flush(mm, flags)) {
+ /*
+ * We clear the PTE but do not flush so potentially a remote
+ * CPU could still be writing to the page. If the entry was
+ * previously clean then the architecture must guarantee that
+ * a clear->dirty transition on a cached TLB entry is written
+ * through and traps if the PTE is unmapped.
+ */
+ pteval = ptep_get_and_clear(mm, address, pte);
+
+ set_tlb_ubc_flush_pending(mm, page, pte_dirty(pteval));
+ } else {
+ pteval = ptep_clear_flush(vma, address, pte);
+ }
/* Move the dirty bit to the physical page now the pte is gone. */
- if (pte_dirty(pteval))
+ dirty = pte_dirty(pteval);
+ if (dirty)
set_page_dirty(page);
/* Update high watermark before we lower rss */
@@ -1252,6 +1403,19 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
swp_entry_t entry = { .val = page_private(page) };
pte_t swp_pte;
+ if (flags & TTU_FREE) {
+ VM_BUG_ON_PAGE(PageSwapCache(page), page);
+ if (!dirty && !PageDirty(page)) {
+ /* It's a freeable page by MADV_FREE */
+ dec_mm_counter(mm, MM_ANONPAGES);
+ goto discard;
+ } else {
+ set_pte_at(mm, address, pte, pteval);
+ ret = SWAP_FAIL;
+ goto out_unmap;
+ }
+ }
+
if (PageSwapCache(page)) {
/*
* Store the swap location in the pte.
@@ -1292,6 +1456,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
} else
dec_mm_counter(mm, MM_FILEPAGES);
+discard:
page_remove_rmap(page);
page_cache_release(page);
@@ -1315,7 +1480,7 @@ out_mlock:
* page is actually mlocked.
*/
if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
- if (vma->vm_flags & VM_LOCKED) {
+ if (vma->vm_flags & (VM_LOCKED | VM_LOCKONFAULT)) {
mlock_vma_page(page);
ret = SWAP_MLOCK;
}
diff --git a/mm/shmem.c b/mm/shmem.c
index 4caf8ed24d65..2865ee72d329 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -754,7 +754,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
index = page->index;
inode = mapping->host;
info = SHMEM_I(inode);
- if (info->flags & VM_LOCKED)
+ if (info->flags & (VM_LOCKED | VM_LOCKONFAULT))
goto redirty;
if (!total_swap_pages)
goto redirty;
@@ -981,7 +981,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
copy_highpage(newpage, oldpage);
flush_dcache_page(newpage);
- __set_page_locked(newpage);
+ __SetPageLocked(newpage);
SetPageUptodate(newpage);
SetPageSwapBacked(newpage);
set_page_private(newpage, swap_index);
@@ -1173,7 +1173,7 @@ repeat:
}
__SetPageSwapBacked(page);
- __set_page_locked(page);
+ __SetPageLocked(page);
if (sgp == SGP_WRITE)
__SetPageReferenced(page);
@@ -2736,6 +2736,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
struct mempolicy *mpol = NULL;
uid_t uid;
gid_t gid;
+ int rv;
while (options != NULL) {
this_char = options;
@@ -2789,14 +2790,15 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
} else if (!strcmp(this_char,"mode")) {
if (remount)
continue;
- sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777;
- if (*rest)
+ rv = parse_integer(value, 8, &sbinfo->mode);
+ if (rv < 0 || value[rv])
goto bad_val;
+ sbinfo->mode &= 07777;
} else if (!strcmp(this_char,"uid")) {
if (remount)
continue;
- uid = simple_strtoul(value, &rest, 0);
- if (*rest)
+ rv = parse_integer(value, 0, &uid);
+ if (rv < 0 || value[rv])
goto bad_val;
sbinfo->uid = make_kuid(current_user_ns(), uid);
if (!uid_valid(sbinfo->uid))
@@ -2804,8 +2806,8 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
} else if (!strcmp(this_char,"gid")) {
if (remount)
continue;
- gid = simple_strtoul(value, &rest, 0);
- if (*rest)
+ rv = parse_integer(value, 0, &gid);
+ if (rv < 0 || value[rv])
goto bad_val;
sbinfo->gid = make_kgid(current_user_ns(), gid);
if (!gid_valid(sbinfo->gid))
@@ -3363,8 +3365,8 @@ put_path:
* shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
* kernel internal. There will be NO LSM permission checks against the
* underlying inode. So users of this interface must do LSM checks at a
- * higher layer. The one user is the big_key implementation. LSM checks
- * are provided at the key level rather than the inode level.
+ * higher layer. The users are the big_key and shm implementations. LSM
+ * checks are provided at the key or shm level rather than the inode.
* @name: name for dentry (to be seen in /proc/<pid>/maps
* @size: size to be set for the file
* @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
diff --git a/mm/slab.c b/mm/slab.c
index 200e22412a16..ef6d21be3c76 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3416,6 +3416,19 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
}
EXPORT_SYMBOL(kmem_cache_alloc);
+void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
+{
+ __kmem_cache_free_bulk(s, size, p);
+}
+EXPORT_SYMBOL(kmem_cache_free_bulk);
+
+bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
+ void **p)
+{
+ return __kmem_cache_alloc_bulk(s, flags, size, p);
+}
+EXPORT_SYMBOL(kmem_cache_alloc_bulk);
+
#ifdef CONFIG_TRACING
void *
kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
diff --git a/mm/slab.h b/mm/slab.h
index 8da63e4e470f..88b55497738c 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -163,6 +163,15 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s);
ssize_t slabinfo_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos);
+/*
+ * Generic implementation of bulk operations
+ * These are useful for situations in which the allocator cannot
+ * perform optimizations. In that case segments of the object listed
+ * may be allocated or freed using these operations.
+ */
+void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
+bool __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
+
#ifdef CONFIG_MEMCG_KMEM
/*
* Iterate over all memcg caches of the given root cache. The caller must hold
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 3e5f8f29c286..5ce4faeb16fb 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -37,8 +37,7 @@ struct kmem_cache *kmem_cache;
SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
SLAB_FAILSLAB)
-#define SLAB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
- SLAB_CACHE_DMA | SLAB_NOTRACK)
+#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | SLAB_NOTRACK)
/*
* Merge control. If this is set then no merging of slab caches will occur.
@@ -105,6 +104,29 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size)
}
#endif
+void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p)
+{
+ size_t i;
+
+ for (i = 0; i < nr; i++)
+ kmem_cache_free(s, p[i]);
+}
+
+bool __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
+ void **p)
+{
+ size_t i;
+
+ for (i = 0; i < nr; i++) {
+ void *x = p[i] = kmem_cache_alloc(s, flags);
+ if (!x) {
+ __kmem_cache_free_bulk(s, i, p);
+ return false;
+ }
+ }
+ return true;
+}
+
#ifdef CONFIG_MEMCG_KMEM
void slab_init_memcg_params(struct kmem_cache *s)
{
@@ -478,7 +500,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
struct kmem_cache *root_cache)
{
static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */
- struct cgroup_subsys_state *css = mem_cgroup_css(memcg);
+ struct cgroup_subsys_state *css = &memcg->css;
struct memcg_cache_array *arr;
struct kmem_cache *s = NULL;
char *cache_name;
@@ -618,6 +640,9 @@ void kmem_cache_destroy(struct kmem_cache *s)
bool need_rcu_barrier = false;
bool busy = false;
+ if (unlikely(!s))
+ return;
+
BUG_ON(!is_root_cache(s));
get_online_cpus();
diff --git a/mm/slob.c b/mm/slob.c
index 4765f65019c7..165bbd3cd606 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -611,6 +611,19 @@ void kmem_cache_free(struct kmem_cache *c, void *b)
}
EXPORT_SYMBOL(kmem_cache_free);
+void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
+{
+ __kmem_cache_free_bulk(s, size, p);
+}
+EXPORT_SYMBOL(kmem_cache_free_bulk);
+
+bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
+ void **p)
+{
+ return __kmem_cache_alloc_bulk(s, flags, size, p);
+}
+EXPORT_SYMBOL(kmem_cache_alloc_bulk);
+
int __kmem_cache_shutdown(struct kmem_cache *c)
{
/* No way to check for remaining objects */
diff --git a/mm/slub.c b/mm/slub.c
index 816df0016555..257283f09f93 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -338,11 +338,13 @@ static inline int oo_objects(struct kmem_cache_order_objects x)
*/
static __always_inline void slab_lock(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
bit_spin_lock(PG_locked, &page->flags);
}
static __always_inline void slab_unlock(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
__bit_spin_unlock(PG_locked, &page->flags);
}
@@ -1306,6 +1308,17 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
kasan_slab_free(s, x);
}
+static void setup_object(struct kmem_cache *s, struct page *page,
+ void *object)
+{
+ setup_object_debug(s, page, object);
+ if (unlikely(s->ctor)) {
+ kasan_unpoison_object_data(s, object);
+ s->ctor(object);
+ kasan_poison_object_data(s, object);
+ }
+}
+
/*
* Slab allocation and freeing
*/
@@ -1336,6 +1349,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
struct page *page;
struct kmem_cache_order_objects oo = s->oo;
gfp_t alloc_gfp;
+ void *start, *p;
+ int idx, order;
flags &= gfp_allowed_mask;
@@ -1359,13 +1374,13 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
* Try a lower order alloc if possible
*/
page = alloc_slab_page(s, alloc_gfp, node, oo);
-
- if (page)
- stat(s, ORDER_FALLBACK);
+ if (unlikely(!page))
+ goto out;
+ stat(s, ORDER_FALLBACK);
}
- if (kmemcheck_enabled && page
- && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
+ if (kmemcheck_enabled &&
+ !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
int pages = 1 << oo_order(oo);
kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node);
@@ -1380,51 +1395,9 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
kmemcheck_mark_unallocated_pages(page, pages);
}
- if (flags & __GFP_WAIT)
- local_irq_disable();
- if (!page)
- return NULL;
-
page->objects = oo_objects(oo);
- mod_zone_page_state(page_zone(page),
- (s->flags & SLAB_RECLAIM_ACCOUNT) ?
- NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
- 1 << oo_order(oo));
-
- return page;
-}
-
-static void setup_object(struct kmem_cache *s, struct page *page,
- void *object)
-{
- setup_object_debug(s, page, object);
- if (unlikely(s->ctor)) {
- kasan_unpoison_object_data(s, object);
- s->ctor(object);
- kasan_poison_object_data(s, object);
- }
-}
-
-static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
-{
- struct page *page;
- void *start;
- void *p;
- int order;
- int idx;
-
- if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
- pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK);
- BUG();
- }
-
- page = allocate_slab(s,
- flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
- if (!page)
- goto out;
order = compound_order(page);
- inc_slabs_node(s, page_to_nid(page), page->objects);
page->slab_cache = s;
__SetPageSlab(page);
if (page->pfmemalloc)
@@ -1448,10 +1421,34 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
page->freelist = start;
page->inuse = page->objects;
page->frozen = 1;
+
out:
+ if (flags & __GFP_WAIT)
+ local_irq_disable();
+ if (!page)
+ return NULL;
+
+ mod_zone_page_state(page_zone(page),
+ (s->flags & SLAB_RECLAIM_ACCOUNT) ?
+ NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
+ 1 << oo_order(oo));
+
+ inc_slabs_node(s, page_to_nid(page), page->objects);
+
return page;
}
+static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
+{
+ if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
+ pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK);
+ BUG();
+ }
+
+ return allocate_slab(s,
+ flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
+}
+
static void __free_slab(struct kmem_cache *s, struct page *page)
{
int order = compound_order(page);
@@ -2712,7 +2709,7 @@ redo:
* Determine the currently cpus per cpu slab.
* The cpu may change afterward. However that does not matter since
* data is retrieved via this pointer. If we are on the same cpu
- * during the cmpxchg then the free will succedd.
+ * during the cmpxchg then the free will succeed.
*/
do {
tid = this_cpu_read(s->cpu_slab->tid);
@@ -2750,6 +2747,113 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
}
EXPORT_SYMBOL(kmem_cache_free);
+/* Note that interrupts must be enabled when calling this function. */
+void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
+{
+ struct kmem_cache_cpu *c;
+ struct page *page;
+ int i;
+
+ local_irq_disable();
+ c = this_cpu_ptr(s->cpu_slab);
+
+ for (i = 0; i < size; i++) {
+ void *object = p[i];
+
+ BUG_ON(!object);
+ /* kmem cache debug support */
+ s = cache_from_obj(s, object);
+ if (unlikely(!s))
+ goto exit;
+ slab_free_hook(s, object);
+
+ page = virt_to_head_page(object);
+
+ if (c->page == page) {
+ /* Fastpath: local CPU free */
+ set_freepointer(s, object, c->freelist);
+ c->freelist = object;
+ } else {
+ c->tid = next_tid(c->tid);
+ local_irq_enable();
+ /* Slowpath: overhead locked cmpxchg_double_slab */
+ __slab_free(s, page, object, _RET_IP_);
+ local_irq_disable();
+ c = this_cpu_ptr(s->cpu_slab);
+ }
+ }
+exit:
+ c->tid = next_tid(c->tid);
+ local_irq_enable();
+}
+EXPORT_SYMBOL(kmem_cache_free_bulk);
+
+/* Note that interrupts must be enabled when calling this function. */
+bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
+ void **p)
+{
+ struct kmem_cache_cpu *c;
+ int i;
+
+ /*
+ * Drain objects in the per cpu slab, while disabling local
+ * IRQs, which protects against PREEMPT and interrupt
+ * handlers invoking the normal fastpath.
+ */
+ local_irq_disable();
+ c = this_cpu_ptr(s->cpu_slab);
+
+ for (i = 0; i < size; i++) {
+ void *object = c->freelist;
+
+ if (unlikely(!object)) {
+ local_irq_enable();
+ /*
+ * Invoking the slow path likely has the side effect
+ * of re-populating per CPU c->freelist
+ */
+ p[i] = __slab_alloc(s, flags, NUMA_NO_NODE,
+ _RET_IP_, c);
+ if (unlikely(!p[i])) {
+ __kmem_cache_free_bulk(s, i, p);
+ return false;
+ }
+ local_irq_disable();
+ c = this_cpu_ptr(s->cpu_slab);
+ continue; /* goto for-loop */
+ }
+
+ /* kmem_cache debug support */
+ s = slab_pre_alloc_hook(s, flags);
+ if (unlikely(!s)) {
+ __kmem_cache_free_bulk(s, i, p);
+ c->tid = next_tid(c->tid);
+ local_irq_enable();
+ return false;
+ }
+
+ c->freelist = get_freepointer(s, object);
+ p[i] = object;
+
+ /* kmem_cache debug support */
+ slab_post_alloc_hook(s, flags, object);
+ }
+ c->tid = next_tid(c->tid);
+ local_irq_enable();
+
+ /* Clear memory outside IRQ disabled fastpath loop */
+ if (unlikely(flags & __GFP_ZERO)) {
+ int j;
+
+ for (j = 0; j < i; j++)
+ memset(p[j], 0, s->object_size);
+ }
+
+ return true;
+}
+EXPORT_SYMBOL(kmem_cache_alloc_bulk);
+
+
/*
* Object placement in a slab is made very easy because we always start at
* offset 0. If we tune the size of the object to the alignment then we can
@@ -5181,7 +5285,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
s->kobj.kset = cache_kset(s);
err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
if (err)
- goto out_put_kobj;
+ goto out;
err = sysfs_create_group(&s->kobj, &slab_attr_group);
if (err)
@@ -5208,8 +5312,6 @@ out:
return err;
out_del_kobj:
kobject_del(&s->kobj);
-out_put_kobj:
- kobject_put(&s->kobj);
goto out;
}
diff --git a/mm/swap.c b/mm/swap.c
index a3a0a2f1f7c3..04b6ce51bcf0 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -44,6 +44,7 @@ int page_cluster;
static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
/*
* This path almost never happens for VM activity - pages are normally
@@ -622,6 +623,8 @@ void mark_page_accessed(struct page *page)
} else if (!PageReferenced(page)) {
SetPageReferenced(page);
}
+ if (page_is_idle(page))
+ clear_page_idle(page);
}
EXPORT_SYMBOL(mark_page_accessed);
@@ -710,7 +713,8 @@ void lru_cache_add_active_or_unevictable(struct page *page,
{
VM_BUG_ON_PAGE(PageLRU(page), page);
- if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
+ if (likely((vma->vm_flags & (VM_LOCKED | VM_LOCKONFAULT)) == 0) ||
+ (vma->vm_flags & VM_SPECIAL)) {
SetPageActive(page);
lru_cache_add(page);
return;
@@ -796,6 +800,24 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
update_page_reclaim_stat(lruvec, file, 0);
}
+
+static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+ void *arg)
+{
+ if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+ int file = page_is_file_cache(page);
+ int lru = page_lru_base_type(page);
+
+ del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
+ ClearPageActive(page);
+ ClearPageReferenced(page);
+ add_page_to_lru_list(page, lruvec, lru);
+
+ __count_vm_event(PGDEACTIVATE);
+ update_page_reclaim_stat(lruvec, file, 0);
+ }
+}
+
/*
* Drain pages out of the cpu's pagevecs.
* Either "cpu" is the current CPU, and preemption has already been
@@ -822,6 +844,10 @@ void lru_add_drain_cpu(int cpu)
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
+ pvec = &per_cpu(lru_deactivate_pvecs, cpu);
+ if (pagevec_count(pvec))
+ pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+
activate_page_drain(cpu);
}
@@ -851,6 +877,26 @@ void deactivate_file_page(struct page *page)
}
}
+/**
+ * deactivate_page - deactivate a page
+ * @page: page to deactivate
+ *
+ * deactivate_page() moves @page to the inactive list if @page was on the active
+ * list and was not an unevictable page. This is done to accelerate the reclaim
+ * of @page.
+ */
+void deactivate_page(struct page *page)
+{
+ if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+ struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+
+ page_cache_get(page);
+ if (!pagevec_add(pvec, page))
+ pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+ put_cpu_var(lru_deactivate_pvecs);
+ }
+}
+
void lru_add_drain(void)
{
lru_add_drain_cpu(get_cpu());
@@ -880,6 +926,7 @@ void lru_add_drain_all(void)
if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
+ pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
need_activate_page_drain(cpu)) {
INIT_WORK(work, lru_add_drain_per_cpu);
schedule_work_on(cpu, work);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 8bc8e66138da..d783872d746c 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -288,17 +288,14 @@ struct page * lookup_swap_cache(swp_entry_t entry)
return page;
}
-/*
- * Locate a page of swap in physical memory, reserving swap cache space
- * and reading the disk if it is not already cached.
- * A failure return means that either the page allocation failed or that
- * the swap entry is no longer in use.
- */
-struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
- struct vm_area_struct *vma, unsigned long addr)
+struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
+ struct vm_area_struct *vma, unsigned long addr,
+ bool *new_page_allocated)
{
struct page *found_page, *new_page = NULL;
+ struct address_space *swapper_space = swap_address_space(entry);
int err;
+ *new_page_allocated = false;
do {
/*
@@ -306,8 +303,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* called after lookup_swap_cache() failed, re-calling
* that would confuse statistics.
*/
- found_page = find_get_page(swap_address_space(entry),
- entry.val);
+ found_page = find_get_page(swapper_space, entry.val);
if (found_page)
break;
@@ -357,7 +353,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
}
/* May fail (-ENOMEM) if radix-tree node allocation failed. */
- __set_page_locked(new_page);
+ __SetPageLocked(new_page);
SetPageSwapBacked(new_page);
err = __add_to_swap_cache(new_page, entry);
if (likely(!err)) {
@@ -366,12 +362,12 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* Initiate read into locked page and return.
*/
lru_cache_add_anon(new_page);
- swap_readpage(new_page);
+ *new_page_allocated = true;
return new_page;
}
radix_tree_preload_end();
ClearPageSwapBacked(new_page);
- __clear_page_locked(new_page);
+ __ClearPageLocked(new_page);
/*
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
* clear SWAP_HAS_CACHE flag.
@@ -384,6 +380,25 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
return found_page;
}
+/*
+ * Locate a page of swap in physical memory, reserving swap cache space
+ * and reading the disk if it is not already cached.
+ * A failure return means that either the page allocation failed or that
+ * the swap entry is no longer in use.
+ */
+struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ bool page_was_allocated;
+ struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
+ vma, addr, &page_was_allocated);
+
+ if (page_was_allocated)
+ swap_readpage(retpage);
+
+ return retpage;
+}
+
static unsigned long swapin_nr_pages(unsigned long offset)
{
static unsigned long prev_offset;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 41e4581af7c5..357206912cd8 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -875,6 +875,48 @@ int page_swapcount(struct page *page)
}
/*
+ * How many references to @entry are currently swapped out?
+ * This considers COUNT_CONTINUED so it returns exact answer.
+ */
+int swp_swapcount(swp_entry_t entry)
+{
+ int count, tmp_count, n;
+ struct swap_info_struct *p;
+ struct page *page;
+ pgoff_t offset;
+ unsigned char *map;
+
+ p = swap_info_get(entry);
+ if (!p)
+ return 0;
+
+ count = swap_count(p->swap_map[swp_offset(entry)]);
+ if (!(count & COUNT_CONTINUED))
+ goto out;
+
+ count &= ~COUNT_CONTINUED;
+ n = SWAP_MAP_MAX + 1;
+
+ offset = swp_offset(entry);
+ page = vmalloc_to_page(p->swap_map + offset);
+ offset &= ~PAGE_MASK;
+ VM_BUG_ON(page_private(page) != SWP_CONTINUED);
+
+ do {
+ page = list_entry(page->lru.next, struct page, lru);
+ map = kmap_atomic(page);
+ tmp_count = map[offset];
+ kunmap_atomic(map);
+
+ count += (tmp_count & ~COUNT_CONTINUED) * n;
+ n *= (SWAP_CONT_MAX + 1);
+ } while (tmp_count & COUNT_CONTINUED);
+out:
+ spin_unlock(&p->lock);
+ return count;
+}
+
+/*
* We can write to an anon page without COW if there are no other references
* to it. And as a side-effect, free up its swap: because the old content
* on disk will never be read, and seeking back there to write new content
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
new file mode 100644
index 000000000000..77fee9325a57
--- /dev/null
+++ b/mm/userfaultfd.c
@@ -0,0 +1,308 @@
+/*
+ * mm/userfaultfd.c
+ *
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/userfaultfd_k.h>
+#include <linux/mmu_notifier.h>
+#include <asm/tlbflush.h>
+#include "internal.h"
+
+static int mcopy_atomic_pte(struct mm_struct *dst_mm,
+ pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ struct page **pagep)
+{
+ struct mem_cgroup *memcg;
+ pte_t _dst_pte, *dst_pte;
+ spinlock_t *ptl;
+ void *page_kaddr;
+ int ret;
+ struct page *page;
+
+ if (!*pagep) {
+ ret = -ENOMEM;
+ page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr);
+ if (!page)
+ goto out;
+
+ page_kaddr = kmap_atomic(page);
+ ret = copy_from_user(page_kaddr,
+ (const void __user *) src_addr,
+ PAGE_SIZE);
+ kunmap_atomic(page_kaddr);
+
+ /* fallback to copy_from_user outside mmap_sem */
+ if (unlikely(ret)) {
+ ret = -EFAULT;
+ *pagep = page;
+ /* don't free the page */
+ goto out;
+ }
+ } else {
+ page = *pagep;
+ *pagep = NULL;
+ }
+
+ /*
+ * The memory barrier inside __SetPageUptodate makes sure that
+ * preceding stores to the page contents become visible before
+ * the set_pte_at() write.
+ */
+ __SetPageUptodate(page);
+
+ ret = -ENOMEM;
+ if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg))
+ goto out_release;
+
+ _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
+ if (dst_vma->vm_flags & VM_WRITE)
+ _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
+
+ ret = -EEXIST;
+ dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+ if (!pte_none(*dst_pte))
+ goto out_release_uncharge_unlock;
+
+ inc_mm_counter(dst_mm, MM_ANONPAGES);
+ page_add_new_anon_rmap(page, dst_vma, dst_addr);
+ mem_cgroup_commit_charge(page, memcg, false);
+ lru_cache_add_active_or_unevictable(page, dst_vma);
+
+ set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(dst_vma, dst_addr, dst_pte);
+
+ pte_unmap_unlock(dst_pte, ptl);
+ ret = 0;
+out:
+ return ret;
+out_release_uncharge_unlock:
+ pte_unmap_unlock(dst_pte, ptl);
+ mem_cgroup_cancel_charge(page, memcg);
+out_release:
+ page_cache_release(page);
+ goto out;
+}
+
+static int mfill_zeropage_pte(struct mm_struct *dst_mm,
+ pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr)
+{
+ pte_t _dst_pte, *dst_pte;
+ spinlock_t *ptl;
+ int ret;
+
+ _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
+ dst_vma->vm_page_prot));
+ ret = -EEXIST;
+ dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+ if (!pte_none(*dst_pte))
+ goto out_unlock;
+ set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(dst_vma, dst_addr, dst_pte);
+ ret = 0;
+out_unlock:
+ pte_unmap_unlock(dst_pte, ptl);
+ return ret;
+}
+
+static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd = NULL;
+
+ pgd = pgd_offset(mm, address);
+ pud = pud_alloc(mm, pgd, address);
+ if (pud)
+ /*
+ * Note that we didn't run this because the pmd was
+ * missing, the *pmd may be already established and in
+ * turn it may also be a trans_huge_pmd.
+ */
+ pmd = pmd_alloc(mm, pud, address);
+ return pmd;
+}
+
+static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
+ unsigned long dst_start,
+ unsigned long src_start,
+ unsigned long len,
+ bool zeropage)
+{
+ struct vm_area_struct *dst_vma;
+ ssize_t err;
+ pmd_t *dst_pmd;
+ unsigned long src_addr, dst_addr;
+ long copied;
+ struct page *page;
+
+ /*
+ * Sanitize the command parameters:
+ */
+ BUG_ON(dst_start & ~PAGE_MASK);
+ BUG_ON(len & ~PAGE_MASK);
+
+ /* Does the address range wrap, or is the span zero-sized? */
+ BUG_ON(src_start + len <= src_start);
+ BUG_ON(dst_start + len <= dst_start);
+
+ src_addr = src_start;
+ dst_addr = dst_start;
+ copied = 0;
+ page = NULL;
+retry:
+ down_read(&dst_mm->mmap_sem);
+
+ /*
+ * Make sure the vma is not shared, that the dst range is
+ * both valid and fully within a single existing vma.
+ */
+ err = -EINVAL;
+ dst_vma = find_vma(dst_mm, dst_start);
+ if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
+ goto out_unlock;
+ if (dst_start < dst_vma->vm_start ||
+ dst_start + len > dst_vma->vm_end)
+ goto out_unlock;
+
+ /*
+ * Be strict and only allow __mcopy_atomic on userfaultfd
+ * registered ranges to prevent userland errors going
+ * unnoticed. As far as the VM consistency is concerned, it
+ * would be perfectly safe to remove this check, but there's
+ * no useful usage for __mcopy_atomic outside of userfaultfd
+ * registered ranges. This is after all why these are ioctls
+ * belonging to the userfaultfd and not syscalls.
+ */
+ if (!dst_vma->vm_userfaultfd_ctx.ctx)
+ goto out_unlock;
+
+ /*
+ * FIXME: only allow copying on anonymous vmas, tmpfs should
+ * be added.
+ */
+ if (dst_vma->vm_ops)
+ goto out_unlock;
+
+ /*
+ * Ensure the dst_vma has an anon_vma or this page
+ * would get a NULL anon_vma when moved in the
+ * dst_vma.
+ */
+ err = -ENOMEM;
+ if (unlikely(anon_vma_prepare(dst_vma)))
+ goto out_unlock;
+
+ while (src_addr < src_start + len) {
+ pmd_t dst_pmdval;
+
+ BUG_ON(dst_addr >= dst_start + len);
+
+ dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
+ if (unlikely(!dst_pmd)) {
+ err = -ENOMEM;
+ break;
+ }
+
+ dst_pmdval = pmd_read_atomic(dst_pmd);
+ /*
+ * If the dst_pmd is mapped as THP don't
+ * override it and just be strict.
+ */
+ if (unlikely(pmd_trans_huge(dst_pmdval))) {
+ err = -EEXIST;
+ break;
+ }
+ if (unlikely(pmd_none(dst_pmdval)) &&
+ unlikely(__pte_alloc(dst_mm, dst_vma, dst_pmd,
+ dst_addr))) {
+ err = -ENOMEM;
+ break;
+ }
+ /* If a huge pmd materialized from under us, fail */
+ if (unlikely(pmd_trans_huge(*dst_pmd))) {
+ err = -EFAULT;
+ break;
+ }
+
+ BUG_ON(pmd_none(*dst_pmd));
+ BUG_ON(pmd_trans_huge(*dst_pmd));
+
+ if (!zeropage)
+ err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
+ dst_addr, src_addr, &page);
+ else
+ err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma,
+ dst_addr);
+
+ cond_resched();
+
+ if (unlikely(err == -EFAULT)) {
+ void *page_kaddr;
+
+ up_read(&dst_mm->mmap_sem);
+ BUG_ON(!page);
+
+ page_kaddr = kmap(page);
+ err = copy_from_user(page_kaddr,
+ (const void __user *) src_addr,
+ PAGE_SIZE);
+ kunmap(page);
+ if (unlikely(err)) {
+ err = -EFAULT;
+ goto out;
+ }
+ goto retry;
+ } else
+ BUG_ON(page);
+
+ if (!err) {
+ dst_addr += PAGE_SIZE;
+ src_addr += PAGE_SIZE;
+ copied += PAGE_SIZE;
+
+ if (fatal_signal_pending(current))
+ err = -EINTR;
+ }
+ if (err)
+ break;
+ }
+
+out_unlock:
+ up_read(&dst_mm->mmap_sem);
+out:
+ if (page)
+ page_cache_release(page);
+ BUG_ON(copied < 0);
+ BUG_ON(err > 0);
+ BUG_ON(!copied && !err);
+ return copied ? copied : err;
+}
+
+ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
+ unsigned long src_start, unsigned long len)
+{
+ return __mcopy_atomic(dst_mm, dst_start, src_start, len, false);
+}
+
+ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
+ unsigned long len)
+{
+ return __mcopy_atomic(dst_mm, start, 0, len, true);
+}
diff --git a/mm/util.c b/mm/util.c
index 68ff8a5361e7..c7434060039b 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -3,6 +3,7 @@
#include <linux/string.h>
#include <linux/compiler.h>
#include <linux/export.h>
+#include <linux/ctype.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/security.h>
@@ -100,6 +101,35 @@ char *kstrndup(const char *s, size_t max, gfp_t gfp)
EXPORT_SYMBOL(kstrndup);
/**
+ * kstrimdup - Trim and copy a %NUL terminated string.
+ * @s: the string to trim and duplicate
+ * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Returns an address, which the caller must kfree, containing
+ * a duplicate of the passed string with leading and/or trailing
+ * whitespace (as defined by isspace) removed.
+ */
+char *kstrimdup(const char *s, gfp_t gfp)
+{
+ char *buf;
+ char *begin = skip_spaces(s);
+ size_t len = strlen(begin);
+
+ while (len && isspace(begin[len - 1]))
+ len--;
+
+ buf = kmalloc_track_caller(len + 1, gfp);
+ if (!buf)
+ return NULL;
+
+ memcpy(buf, begin, len);
+ buf[len] = '\0';
+
+ return buf;
+}
+EXPORT_SYMBOL(kstrimdup);
+
+/**
* kmemdup - duplicate region of memory
*
* @src: memory region to duplicate
@@ -355,7 +385,9 @@ struct anon_vma *page_anon_vma(struct page *page)
struct address_space *page_mapping(struct page *page)
{
- unsigned long mapping;
+ struct address_space *mapping;
+
+ page = compound_head(page);
/* This happens if someone calls flush_dcache_page on slab page */
if (unlikely(PageSlab(page)))
@@ -368,10 +400,10 @@ struct address_space *page_mapping(struct page *page)
return swap_address_space(entry);
}
- mapping = (unsigned long)page->mapping;
- if (mapping & PAGE_MAPPING_FLAGS)
+ mapping = page->mapping;
+ if ((unsigned long)mapping & PAGE_MAPPING_FLAGS)
return NULL;
- return page->mapping;
+ return mapping;
}
int overcommit_ratio_handler(struct ctl_table *table, int write,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e61445dce04e..4a860f0e37d7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -36,7 +36,7 @@
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
-#include <linux/rwsem.h>
+#include <linux/srcu.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
@@ -146,8 +146,9 @@ int vm_swappiness = 60;
*/
unsigned long vm_total_pages;
+DEFINE_STATIC_SRCU(shrinker_srcu);
static LIST_HEAD(shrinker_list);
-static DECLARE_RWSEM(shrinker_rwsem);
+static DEFINE_SPINLOCK(shrinker_list_lock);
#ifdef CONFIG_MEMCG
static bool global_reclaim(struct scan_control *sc)
@@ -175,7 +176,7 @@ static bool sane_reclaim(struct scan_control *sc)
if (!memcg)
return true;
#ifdef CONFIG_CGROUP_WRITEBACK
- if (cgroup_on_dfl(mem_cgroup_css(memcg)->cgroup))
+ if (memcg->css.cgroup)
return true;
#endif
return false;
@@ -242,9 +243,9 @@ int register_shrinker(struct shrinker *shrinker)
if (!shrinker->nr_deferred)
return -ENOMEM;
- down_write(&shrinker_rwsem);
- list_add_tail(&shrinker->list, &shrinker_list);
- up_write(&shrinker_rwsem);
+ spin_lock(&shrinker_list_lock);
+ list_add_tail_rcu(&shrinker->list, &shrinker_list);
+ spin_unlock(&shrinker_list_lock);
return 0;
}
EXPORT_SYMBOL(register_shrinker);
@@ -254,9 +255,14 @@ EXPORT_SYMBOL(register_shrinker);
*/
void unregister_shrinker(struct shrinker *shrinker)
{
- down_write(&shrinker_rwsem);
- list_del(&shrinker->list);
- up_write(&shrinker_rwsem);
+ spin_lock(&shrinker_list_lock);
+ list_del_rcu(&shrinker->list);
+ spin_unlock(&shrinker_list_lock);
+ /*
+ * Before freeing nr_deferred, ensure all srcu
+ * readers are done with their critical region.
+ */
+ synchronize_srcu(&shrinker_srcu);
kfree(shrinker->nr_deferred);
}
EXPORT_SYMBOL(unregister_shrinker);
@@ -408,6 +414,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
unsigned long nr_scanned,
unsigned long nr_eligible)
{
+ int idx;
struct shrinker *shrinker;
unsigned long freed = 0;
@@ -417,18 +424,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
if (nr_scanned == 0)
nr_scanned = SWAP_CLUSTER_MAX;
- if (!down_read_trylock(&shrinker_rwsem)) {
- /*
- * If we would return 0, our callers would understand that we
- * have nothing else to shrink and give up trying. By returning
- * 1 we keep it going and assume we'll be able to shrink next
- * time.
- */
- freed = 1;
- goto out;
- }
+ idx = srcu_read_lock(&shrinker_srcu);
- list_for_each_entry(shrinker, &shrinker_list, list) {
+ list_for_each_entry_rcu(shrinker, &shrinker_list, list) {
struct shrink_control sc = {
.gfp_mask = gfp_mask,
.nid = nid,
@@ -444,8 +442,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible);
}
- up_read(&shrinker_rwsem);
-out:
+ srcu_read_unlock(&shrinker_srcu, idx);
cond_resched();
return freed;
}
@@ -791,20 +788,24 @@ enum page_references {
};
static enum page_references page_check_references(struct page *page,
- struct scan_control *sc)
+ struct scan_control *sc,
+ bool *freeable)
{
int referenced_ptes, referenced_page;
unsigned long vm_flags;
+ int pte_dirty;
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
- &vm_flags);
+ &vm_flags, &pte_dirty);
referenced_page = TestClearPageReferenced(page);
/*
* Mlock lost the isolation race with us. Let try_to_unmap()
* move the page to the unevictable list.
*/
- if (vm_flags & VM_LOCKED)
+ if (vm_flags & (VM_LOCKED | VM_LOCKONFAULT))
return PAGEREF_RECLAIM;
if (referenced_ptes) {
@@ -838,6 +839,10 @@ static enum page_references page_check_references(struct page *page,
return PAGEREF_KEEP;
}
+ if (PageAnon(page) && !pte_dirty && !PageSwapCache(page) &&
+ !PageDirty(page))
+ *freeable = true;
+
/* Reclaim if clean, defer dirty pages to writeback */
if (referenced_page && !PageSwapBacked(page))
return PAGEREF_RECLAIM_CLEAN;
@@ -906,6 +911,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
int may_enter_fs;
enum page_references references = PAGEREF_RECLAIM_CLEAN;
bool dirty, writeback;
+ bool freeable = false;
cond_resched();
@@ -974,21 +980,17 @@ static unsigned long shrink_page_list(struct list_head *page_list,
*
* 2) Global or new memcg reclaim encounters a page that is
* not marked for immediate reclaim or the caller does not
- * have __GFP_IO. In this case mark the page for immediate
+ * have __GFP_FS. In this case mark the page for immediate
* reclaim and continue scanning.
*
- * __GFP_IO is checked because a loop driver thread might
- * enter reclaim, and deadlock if it waits on a page for
- * which it is needed to do the write (loop masks off
+ * Require __GFP_FS even though we are not entering fs
+ * because we are waiting for a fs activity and we might
+ * be in the middle of the writeout. Moreover a loop driver
+ * might enter reclaim, and deadlock if it waits on a page
+ * for which it is needed to do the write (loop masks off
* __GFP_IO|__GFP_FS for this reason); but more thought
* would probably show more reasons.
*
- * Don't require __GFP_FS, since we're not going into the
- * FS, just waiting on its writeback completion. Worryingly,
- * ext4 gfs2 and xfs allocate pages with
- * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
- * may_enter_fs here is liable to OOM on them.
- *
* 3) Legacy memcg encounters a page that is not already marked
* PageReclaim. memcg does not have any dirty pages
* throttling so we could easily OOM just because too many
@@ -1005,7 +1007,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
/* Case 2 above */
} else if (sane_reclaim(sc) ||
- !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
+ !PageReclaim(page) || !(sc->gfp_mask & __GFP_FS)) {
/*
* This is slightly racy - end_page_writeback()
* might have just cleared PageReclaim, then
@@ -1022,14 +1024,15 @@ static unsigned long shrink_page_list(struct list_head *page_list,
goto keep_locked;
- /* Case 3 above */
- } else {
- wait_on_page_writeback(page);
}
+
+ /* Case 3 above */
+ wait_on_page_writeback(page);
}
if (!force_reclaim)
- references = page_check_references(page, sc);
+ references = page_check_references(page, sc,
+ &freeable);
switch (references) {
case PAGEREF_ACTIVATE:
@@ -1046,22 +1049,31 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* Try to allocate it some swap space here.
*/
if (PageAnon(page) && !PageSwapCache(page)) {
- if (!(sc->gfp_mask & __GFP_IO))
- goto keep_locked;
- if (!add_to_swap(page, page_list))
- goto activate_locked;
- may_enter_fs = 1;
-
- /* Adding to swap updated mapping */
- mapping = page_mapping(page);
+ if (!freeable) {
+ if (!(sc->gfp_mask & __GFP_IO))
+ goto keep_locked;
+ if (!add_to_swap(page, page_list))
+ goto activate_locked;
+ may_enter_fs = 1;
+ /* Adding to swap updated mapping */
+ mapping = page_mapping(page);
+ } else {
+ if (likely(!PageTransHuge(page)))
+ goto unmap;
+ /* try_to_unmap isn't aware of THP page */
+ if (unlikely(split_huge_page_to_list(page,
+ page_list)))
+ goto keep_locked;
+ }
}
-
+unmap:
/*
* The page is mapped into the page tables of one or more
* processes. Try to unmap it here.
*/
- if (page_mapped(page) && mapping) {
- switch (try_to_unmap(page, ttu_flags)) {
+ if (page_mapped(page) && (mapping || freeable)) {
+ switch (try_to_unmap(page, freeable ?
+ TTU_FREE : ttu_flags|TTU_BATCH_FLUSH)) {
case SWAP_FAIL:
goto activate_locked;
case SWAP_AGAIN:
@@ -1069,7 +1081,20 @@ static unsigned long shrink_page_list(struct list_head *page_list,
case SWAP_MLOCK:
goto cull_mlocked;
case SWAP_SUCCESS:
- ; /* try to free the page below */
+ /* try to free the page below */
+ if (!freeable)
+ break;
+ /*
+ * Freeable anon page doesn't have mapping
+ * due to skipping of swapcache so we free
+ * page in here rather than __remove_mapping.
+ */
+ VM_BUG_ON_PAGE(PageSwapCache(page), page);
+ if (!page_freeze_refs(page, 1))
+ goto keep_locked;
+ __ClearPageLocked(page);
+ count_vm_event(PGLAZYFREED);
+ goto free_it;
}
}
@@ -1101,7 +1126,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
if (!sc->may_writepage)
goto keep_locked;
- /* Page is dirty, try to write it out here */
+ /*
+ * Page is dirty. Flush the TLB if a writable entry
+ * potentially exists to avoid CPU writes after IO
+ * starts and then write it out here.
+ */
+ try_to_unmap_flush_dirty();
switch (pageout(page, mapping, sc)) {
case PAGE_KEEP:
goto keep_locked;
@@ -1179,7 +1209,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* we obviously don't have to worry about waking up a process
* waiting on the page lock, because there are no references.
*/
- __clear_page_locked(page);
+ __ClearPageLocked(page);
free_it:
nr_reclaimed++;
@@ -1212,6 +1242,7 @@ keep:
}
mem_cgroup_uncharge_list(&free_pages);
+ try_to_unmap_flush();
free_hot_cold_page_list(&free_pages, true);
list_splice(&ret_pages, page_list);
@@ -1438,6 +1469,32 @@ int isolate_lru_page(struct page *page)
return ret;
}
+static int __too_many_isolated(struct zone *zone, int file,
+ struct scan_control *sc, int safe)
+{
+ unsigned long inactive, isolated;
+
+ if (safe) {
+ inactive = zone_page_state_snapshot(zone,
+ NR_INACTIVE_ANON + 2 * file);
+ isolated = zone_page_state_snapshot(zone,
+ NR_ISOLATED_ANON + file);
+ } else {
+ inactive = zone_page_state(zone, NR_INACTIVE_ANON + 2 * file);
+ isolated = zone_page_state(zone, NR_ISOLATED_ANON + file);
+ }
+
+ /*
+ * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
+ * won't get blocked by normal direct-reclaimers, forming a circular
+ * deadlock.
+ */
+ if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
+ inactive >>= 3;
+
+ return isolated > inactive;
+}
+
/*
* A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
* then get resheduled. When there are massive number of tasks doing page
@@ -1446,33 +1503,24 @@ int isolate_lru_page(struct page *page)
* unnecessary swapping, thrashing and OOM.
*/
static int too_many_isolated(struct zone *zone, int file,
- struct scan_control *sc)
+ struct scan_control *sc)
{
- unsigned long inactive, isolated;
-
if (current_is_kswapd())
return 0;
if (!sane_reclaim(sc))
return 0;
- if (file) {
- inactive = zone_page_state(zone, NR_INACTIVE_FILE);
- isolated = zone_page_state(zone, NR_ISOLATED_FILE);
- } else {
- inactive = zone_page_state(zone, NR_INACTIVE_ANON);
- isolated = zone_page_state(zone, NR_ISOLATED_ANON);
- }
-
/*
- * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
- * won't get blocked by normal direct-reclaimers, forming a circular
- * deadlock.
+ * __too_many_isolated(safe=0) is fast but inaccurate, because it
+ * doesn't account for the vm_stat_diff[] counters. So if it looks
+ * like too_many_isolated() is about to return true, fall back to the
+ * slower, more accurate zone_page_state_snapshot().
*/
- if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
- inactive >>= 3;
+ if (unlikely(__too_many_isolated(zone, file, sc, 0)))
+ return __too_many_isolated(zone, file, sc, 1);
- return isolated > inactive;
+ return 0;
}
static noinline_for_stack void
@@ -1809,7 +1857,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
}
if (page_referenced(page, 0, sc->target_mem_cgroup,
- &vm_flags)) {
+ &vm_flags, NULL)) {
nr_rotated += hpage_nr_pages(page);
/*
* Identify referenced, file-backed active pages and
@@ -2155,6 +2203,23 @@ out:
}
}
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+static void init_tlb_ubc(void)
+{
+ /*
+ * This deliberately does not clear the cpumask as it's expensive
+ * and unnecessary. If there happens to be data in there then the
+ * first SWAP_CLUSTER_MAX pages will send an unnecessary IPI and
+ * then will be cleared.
+ */
+ current->tlb_ubc.flush_required = false;
+}
+#else
+static inline void init_tlb_ubc(void)
+{
+}
+#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
+
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
@@ -2189,6 +2254,8 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
sc->priority == DEF_PRIORITY);
+ init_tlb_ubc();
+
blk_start_plug(&plug);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
nr[LRU_INACTIVE_FILE]) {
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4f5cd974e11a..1fd0886a389f 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -759,6 +759,7 @@ const char * const vmstat_text[] = {
"pgfault",
"pgmajfault",
+ "pglazyfreed",
TEXTS_FOR_ZONES("pgrefill")
TEXTS_FOR_ZONES("pgsteal_kswapd")
diff --git a/mm/zbud.c b/mm/zbud.c
index f3bf6f7627d8..fa48bcdff9d5 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -96,10 +96,10 @@ struct zbud_pool {
struct list_head buddied;
struct list_head lru;
u64 pages_nr;
- struct zbud_ops *ops;
+ const struct zbud_ops *ops;
#ifdef CONFIG_ZPOOL
struct zpool *zpool;
- struct zpool_ops *zpool_ops;
+ const struct zpool_ops *zpool_ops;
#endif
};
@@ -133,12 +133,12 @@ static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle)
return -ENOENT;
}
-static struct zbud_ops zbud_zpool_ops = {
+static const struct zbud_ops zbud_zpool_ops = {
.evict = zbud_zpool_evict
};
static void *zbud_zpool_create(char *name, gfp_t gfp,
- struct zpool_ops *zpool_ops,
+ const struct zpool_ops *zpool_ops,
struct zpool *zpool)
{
struct zbud_pool *pool;
@@ -302,7 +302,7 @@ static int num_free_chunks(struct zbud_header *zhdr)
* Return: pointer to the new zbud pool or NULL if the metadata allocation
* failed.
*/
-struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops)
+struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops)
{
struct zbud_pool *pool;
int i;
diff --git a/mm/zpool.c b/mm/zpool.c
index 722a4f60e90b..951db32b833f 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -22,7 +22,7 @@ struct zpool {
struct zpool_driver *driver;
void *pool;
- struct zpool_ops *ops;
+ const struct zpool_ops *ops;
struct list_head list;
};
@@ -115,7 +115,7 @@ static void zpool_put_driver(struct zpool_driver *driver)
* Returns: New zpool on success, NULL on failure.
*/
struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
- struct zpool_ops *ops)
+ const struct zpool_ops *ops)
{
struct zpool_driver *driver;
struct zpool *zpool;
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 0a7f81aa2249..09aedd9a89e4 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -169,14 +169,12 @@ enum zs_stat_type {
NR_ZS_STAT_TYPE,
};
-#ifdef CONFIG_ZSMALLOC_STAT
-
-static struct dentry *zs_stat_root;
-
struct zs_size_stat {
unsigned long objs[NR_ZS_STAT_TYPE];
};
+#ifdef CONFIG_ZSMALLOC_STAT
+static struct dentry *zs_stat_root;
#endif
/*
@@ -201,6 +199,8 @@ static int zs_size_classes;
static const int fullness_threshold_frac = 4;
struct size_class {
+ spinlock_t lock;
+ struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
/*
* Size of objects stored in this class. Must be multiple
* of ZS_ALIGN.
@@ -210,16 +210,10 @@ struct size_class {
/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
int pages_per_zspage;
- /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
- bool huge;
-
-#ifdef CONFIG_ZSMALLOC_STAT
struct zs_size_stat stats;
-#endif
-
- spinlock_t lock;
- struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
+ /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
+ bool huge;
};
/*
@@ -251,6 +245,15 @@ struct zs_pool {
gfp_t flags; /* allocation flags used when growing pool */
atomic_long_t pages_allocated;
+ struct zs_pool_stats stats;
+
+ /* Compact classes */
+ struct shrinker shrinker;
+ /*
+ * To signify that register_shrinker() was successful
+ * and unregister_shrinker() will not Oops.
+ */
+ bool shrinker_enabled;
#ifdef CONFIG_ZSMALLOC_STAT
struct dentry *stat_dentry;
#endif
@@ -309,7 +312,8 @@ static void record_obj(unsigned long handle, unsigned long obj)
#ifdef CONFIG_ZPOOL
-static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops,
+static void *zs_zpool_create(char *name, gfp_t gfp,
+ const struct zpool_ops *zpool_ops,
struct zpool *zpool)
{
return zs_create_pool(name, gfp);
@@ -441,8 +445,6 @@ static int get_size_class_index(int size)
return min(zs_size_classes - 1, idx);
}
-#ifdef CONFIG_ZSMALLOC_STAT
-
static inline void zs_stat_inc(struct size_class *class,
enum zs_stat_type type, unsigned long cnt)
{
@@ -461,6 +463,8 @@ static inline unsigned long zs_stat_get(struct size_class *class,
return class->stats.objs[type];
}
+#ifdef CONFIG_ZSMALLOC_STAT
+
static int __init zs_stat_init(void)
{
if (!debugfs_initialized())
@@ -576,23 +580,6 @@ static void zs_pool_stat_destroy(struct zs_pool *pool)
}
#else /* CONFIG_ZSMALLOC_STAT */
-
-static inline void zs_stat_inc(struct size_class *class,
- enum zs_stat_type type, unsigned long cnt)
-{
-}
-
-static inline void zs_stat_dec(struct size_class *class,
- enum zs_stat_type type, unsigned long cnt)
-{
-}
-
-static inline unsigned long zs_stat_get(struct size_class *class,
- enum zs_stat_type type)
-{
- return 0;
-}
-
static int __init zs_stat_init(void)
{
return 0;
@@ -610,7 +597,6 @@ static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
static inline void zs_pool_stat_destroy(struct zs_pool *pool)
{
}
-
#endif
@@ -658,13 +644,22 @@ static void insert_zspage(struct page *page, struct size_class *class,
if (fullness >= _ZS_NR_FULLNESS_GROUPS)
return;
- head = &class->fullness_list[fullness];
- if (*head)
- list_add_tail(&page->lru, &(*head)->lru);
-
- *head = page;
zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ?
CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
+
+ head = &class->fullness_list[fullness];
+ if (!*head) {
+ *head = page;
+ return;
+ }
+
+ /*
+ * We want to see more ZS_FULL pages and less almost
+ * empty/full. Put pages with higher ->inuse first.
+ */
+ list_add_tail(&page->lru, &(*head)->lru);
+ if (page->inuse >= (*head)->inuse)
+ *head = page;
}
/*
@@ -1495,7 +1490,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
}
EXPORT_SYMBOL_GPL(zs_free);
-static void zs_object_copy(unsigned long src, unsigned long dst,
+static void zs_object_copy(unsigned long dst, unsigned long src,
struct size_class *class)
{
struct page *s_page, *d_page;
@@ -1602,8 +1597,6 @@ struct zs_compact_control {
/* Starting object index within @s_page which used for live object
* in the subpage. */
int index;
- /* how many of objects are migrated */
- int nr_migrated;
};
static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
@@ -1614,7 +1607,6 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
struct page *s_page = cc->s_page;
struct page *d_page = cc->d_page;
unsigned long index = cc->index;
- int nr_migrated = 0;
int ret = 0;
while (1) {
@@ -1636,23 +1628,21 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
used_obj = handle_to_obj(handle);
free_obj = obj_malloc(d_page, class, handle);
- zs_object_copy(used_obj, free_obj, class);
+ zs_object_copy(free_obj, used_obj, class);
index++;
record_obj(handle, free_obj);
unpin_tag(handle);
obj_free(pool, class, used_obj);
- nr_migrated++;
}
/* Remember last position in this iteration */
cc->s_page = s_page;
cc->index = index;
- cc->nr_migrated = nr_migrated;
return ret;
}
-static struct page *alloc_target_page(struct size_class *class)
+static struct page *isolate_target_page(struct size_class *class)
{
int i;
struct page *page;
@@ -1668,8 +1658,17 @@ static struct page *alloc_target_page(struct size_class *class)
return page;
}
-static void putback_zspage(struct zs_pool *pool, struct size_class *class,
- struct page *first_page)
+/*
+ * putback_zspage - add @first_page into right class's fullness list
+ * @pool: target pool
+ * @class: destination class
+ * @first_page: target page
+ *
+ * Return @first_page's fullness_group
+ */
+static enum fullness_group putback_zspage(struct zs_pool *pool,
+ struct size_class *class,
+ struct page *first_page)
{
enum fullness_group fullness;
@@ -1687,50 +1686,72 @@ static void putback_zspage(struct zs_pool *pool, struct size_class *class,
free_zspage(first_page);
}
+
+ return fullness;
}
static struct page *isolate_source_page(struct size_class *class)
{
- struct page *page;
+ int i;
+ struct page *page = NULL;
- page = class->fullness_list[ZS_ALMOST_EMPTY];
- if (page)
- remove_zspage(page, class, ZS_ALMOST_EMPTY);
+ for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) {
+ page = class->fullness_list[i];
+ if (!page)
+ continue;
+
+ remove_zspage(page, class, i);
+ break;
+ }
return page;
}
-static unsigned long __zs_compact(struct zs_pool *pool,
- struct size_class *class)
+/*
+ *
+ * Based on the number of unused allocated objects calculate
+ * and return the number of pages that we can free.
+ */
+static unsigned long zs_can_compact(struct size_class *class)
+{
+ unsigned long obj_wasted;
+
+ obj_wasted = zs_stat_get(class, OBJ_ALLOCATED) -
+ zs_stat_get(class, OBJ_USED);
+
+ obj_wasted /= get_maxobj_per_zspage(class->size,
+ class->pages_per_zspage);
+
+ return obj_wasted * class->pages_per_zspage;
+}
+
+static void __zs_compact(struct zs_pool *pool, struct size_class *class)
{
- int nr_to_migrate;
struct zs_compact_control cc;
struct page *src_page;
struct page *dst_page = NULL;
- unsigned long nr_total_migrated = 0;
spin_lock(&class->lock);
while ((src_page = isolate_source_page(class))) {
BUG_ON(!is_first_page(src_page));
- /* The goal is to migrate all live objects in source page */
- nr_to_migrate = src_page->inuse;
+ if (!zs_can_compact(class))
+ break;
+
cc.index = 0;
cc.s_page = src_page;
- while ((dst_page = alloc_target_page(class))) {
+ while ((dst_page = isolate_target_page(class))) {
cc.d_page = dst_page;
/*
- * If there is no more space in dst_page, try to
- * allocate another zspage.
+ * If there is no more space in dst_page, resched
+ * and see if anyone had allocated another zspage.
*/
if (!migrate_zspage(pool, class, &cc))
break;
putback_zspage(pool, class, dst_page);
- nr_total_migrated += cc.nr_migrated;
- nr_to_migrate -= cc.nr_migrated;
}
/* Stop if we couldn't find slot */
@@ -1738,9 +1759,9 @@ static unsigned long __zs_compact(struct zs_pool *pool,
break;
putback_zspage(pool, class, dst_page);
- putback_zspage(pool, class, src_page);
+ if (putback_zspage(pool, class, src_page) == ZS_EMPTY)
+ pool->stats.pages_compacted += class->pages_per_zspage;
spin_unlock(&class->lock);
- nr_total_migrated += cc.nr_migrated;
cond_resched();
spin_lock(&class->lock);
}
@@ -1749,14 +1770,11 @@ static unsigned long __zs_compact(struct zs_pool *pool,
putback_zspage(pool, class, src_page);
spin_unlock(&class->lock);
-
- return nr_total_migrated;
}
unsigned long zs_compact(struct zs_pool *pool)
{
int i;
- unsigned long nr_migrated = 0;
struct size_class *class;
for (i = zs_size_classes - 1; i >= 0; i--) {
@@ -1765,13 +1783,80 @@ unsigned long zs_compact(struct zs_pool *pool)
continue;
if (class->index != i)
continue;
- nr_migrated += __zs_compact(pool, class);
+ __zs_compact(pool, class);
}
- return nr_migrated;
+ return pool->stats.pages_compacted;
}
EXPORT_SYMBOL_GPL(zs_compact);
+void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats)
+{
+ memcpy(stats, &pool->stats, sizeof(struct zs_pool_stats));
+}
+EXPORT_SYMBOL_GPL(zs_pool_stats);
+
+static unsigned long zs_shrinker_scan(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ unsigned long pages_freed;
+ struct zs_pool *pool = container_of(shrinker, struct zs_pool,
+ shrinker);
+
+ pages_freed = pool->stats.pages_compacted;
+ /*
+ * Compact classes and calculate compaction delta.
+ * Can run concurrently with a manually triggered
+ * (by user) compaction.
+ */
+ pages_freed = zs_compact(pool) - pages_freed;
+
+ return pages_freed ? pages_freed : SHRINK_STOP;
+}
+
+static unsigned long zs_shrinker_count(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ int i;
+ struct size_class *class;
+ unsigned long pages_to_free = 0;
+ struct zs_pool *pool = container_of(shrinker, struct zs_pool,
+ shrinker);
+
+ if (!pool->shrinker_enabled)
+ return 0;
+
+ for (i = zs_size_classes - 1; i >= 0; i--) {
+ class = pool->size_class[i];
+ if (!class)
+ continue;
+ if (class->index != i)
+ continue;
+
+ pages_to_free += zs_can_compact(class);
+ }
+
+ return pages_to_free;
+}
+
+static void zs_unregister_shrinker(struct zs_pool *pool)
+{
+ if (pool->shrinker_enabled) {
+ unregister_shrinker(&pool->shrinker);
+ pool->shrinker_enabled = false;
+ }
+}
+
+static int zs_register_shrinker(struct zs_pool *pool)
+{
+ pool->shrinker.scan_objects = zs_shrinker_scan;
+ pool->shrinker.count_objects = zs_shrinker_count;
+ pool->shrinker.batch = 0;
+ pool->shrinker.seeks = DEFAULT_SEEKS;
+
+ return register_shrinker(&pool->shrinker);
+}
+
/**
* zs_create_pool - Creates an allocation pool to work from.
* @flags: allocation flags used to allocate pool metadata
@@ -1857,6 +1942,12 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags)
if (zs_pool_stat_create(name, pool))
goto err;
+ /*
+ * Not critical, we still can use the pool
+ * and user can trigger compaction manually.
+ */
+ if (zs_register_shrinker(pool) == 0)
+ pool->shrinker_enabled = true;
return pool;
err:
@@ -1869,6 +1960,7 @@ void zs_destroy_pool(struct zs_pool *pool)
{
int i;
+ zs_unregister_shrinker(pool);
zs_pool_stat_destroy(pool);
for (i = 0; i < zs_size_classes; i++) {
diff --git a/mm/zswap.c b/mm/zswap.c
index 2d5727baed59..48a1d081e2a5 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -446,75 +446,14 @@ enum zswap_get_swap_ret {
static int zswap_get_swap_cache_page(swp_entry_t entry,
struct page **retpage)
{
- struct page *found_page, *new_page = NULL;
- struct address_space *swapper_space = swap_address_space(entry);
- int err;
+ bool page_was_allocated;
- *retpage = NULL;
- do {
- /*
- * First check the swap cache. Since this is normally
- * called after lookup_swap_cache() failed, re-calling
- * that would confuse statistics.
- */
- found_page = find_get_page(swapper_space, entry.val);
- if (found_page)
- break;
-
- /*
- * Get a new page to read into from swap.
- */
- if (!new_page) {
- new_page = alloc_page(GFP_KERNEL);
- if (!new_page)
- break; /* Out of memory */
- }
-
- /*
- * call radix_tree_preload() while we can wait.
- */
- err = radix_tree_preload(GFP_KERNEL);
- if (err)
- break;
-
- /*
- * Swap entry may have been freed since our caller observed it.
- */
- err = swapcache_prepare(entry);
- if (err == -EEXIST) { /* seems racy */
- radix_tree_preload_end();
- continue;
- }
- if (err) { /* swp entry is obsolete ? */
- radix_tree_preload_end();
- break;
- }
-
- /* May fail (-ENOMEM) if radix-tree node allocation failed. */
- __set_page_locked(new_page);
- SetPageSwapBacked(new_page);
- err = __add_to_swap_cache(new_page, entry);
- if (likely(!err)) {
- radix_tree_preload_end();
- lru_cache_add_anon(new_page);
- *retpage = new_page;
- return ZSWAP_SWAPCACHE_NEW;
- }
- radix_tree_preload_end();
- ClearPageSwapBacked(new_page);
- __clear_page_locked(new_page);
- /*
- * add_to_swap_cache() doesn't return -EEXIST, so we can safely
- * clear SWAP_HAS_CACHE flag.
- */
- swapcache_free(entry);
- } while (err != -ENOMEM);
-
- if (new_page)
- page_cache_release(new_page);
- if (!found_page)
+ *retpage = __read_swap_cache_async(entry, GFP_KERNEL,
+ NULL, 0, &page_was_allocated);
+ if (page_was_allocated)
+ return ZSWAP_SWAPCACHE_NEW;
+ if (!*retpage)
return ZSWAP_SWAPCACHE_FAIL;
- *retpage = found_page;
return ZSWAP_SWAPCACHE_EXIST;
}
@@ -816,7 +755,7 @@ static void zswap_frontswap_invalidate_area(unsigned type)
zswap_trees[type] = NULL;
}
-static struct zpool_ops zswap_zpool_ops = {
+static const struct zpool_ops zswap_zpool_ops = {
.evict = zswap_writeback_entry
};
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 337ca851a350..b140c092d226 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -297,7 +297,7 @@ static int rpc_complete_task(struct rpc_task *task)
clear_bit(RPC_TASK_ACTIVE, &task->tk_runstate);
ret = atomic_dec_and_test(&task->tk_count);
if (waitqueue_active(wq))
- __wake_up_locked_key(wq, TASK_NORMAL, &k);
+ __wake_up_locked_key(wq, TASK_NORMAL, 1, &k);
spin_unlock_irqrestore(&wq->lock, flags);
return ret;
}
diff --git a/scripts/Lindent b/scripts/Lindent
index 9c4b3e2b7098..6d889de4e70b 100755
--- a/scripts/Lindent
+++ b/scripts/Lindent
@@ -1,6 +1,9 @@
#!/bin/sh
PARAM="-npro -kr -i8 -ts8 -sob -l80 -ss -ncs -cp1"
RES=`indent --version`
+if [ "$RES" = "" ]; then
+ exit 1
+fi
V1=`echo $RES | cut -d' ' -f3 | cut -d'.' -f1`
V2=`echo $RES | cut -d' ' -f3 | cut -d'.' -f2`
V3=`echo $RES | cut -d' ' -f3 | cut -d'.' -f3`
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index a51ca0e5beef..34ca40097d84 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -584,7 +584,7 @@ our $LvalOrFunc = qr{((?:[\&\*]\s*)?$Lval)\s*($balanced_parens{0,1})\s*};
our $FuncArg = qr{$Typecast{0,1}($LvalOrFunc|$Constant|$String)};
our $declaration_macros = qr{(?x:
- (?:$Storage\s+)?(?:[A-Z_][A-Z0-9]*_){0,2}(?:DEFINE|DECLARE)(?:_[A-Z0-9]+){1,2}\s*\(|
+ (?:$Storage\s+)?(?:[A-Z_][A-Z0-9]*_){0,2}(?:DEFINE|DECLARE)(?:_[A-Z0-9]+){1,6}\s*\(|
(?:$Storage\s+)?LIST_HEAD\s*\(|
(?:$Storage\s+)?${Type}\s+uninitialized_var\s*\(
)};
@@ -2166,7 +2166,11 @@ sub process {
if ($showfile) {
$prefix = "$realfile:$realline: "
} elsif ($emacs) {
- $prefix = "$filename:$linenr: ";
+ if ($file) {
+ $prefix = "$filename:$realline: ";
+ } else {
+ $prefix = "$filename:$linenr: ";
+ }
}
if ($found_file) {
@@ -2317,9 +2321,11 @@ sub process {
}
# Check for git id commit length and improperly formed commit descriptions
- if ($in_commit_log && $line =~ /\b(c)ommit\s+([0-9a-f]{5,})/i) {
- my $init_char = $1;
- my $orig_commit = lc($2);
+ if ($in_commit_log &&
+ ($line =~ /\bcommit\s+[0-9a-f]{5,}\b/i ||
+ $line =~ /\b[0-9a-f]{12,40}\b/i)) {
+ my $init_char = "c";
+ my $orig_commit = "";
my $short = 1;
my $long = 0;
my $case = 1;
@@ -2330,6 +2336,13 @@ sub process {
my $orig_desc = "commit description";
my $description = "";
+ if ($line =~ /\b(c)ommit\s+([0-9a-f]{5,})\b/i) {
+ $init_char = $1;
+ $orig_commit = lc($2);
+ } elsif ($line =~ /\b([0-9a-f]{12,40})\b/i) {
+ $orig_commit = lc($1);
+ }
+
$short = 0 if ($line =~ /\bcommit\s+[0-9a-f]{12,40}/i);
$long = 1 if ($line =~ /\bcommit\s+[0-9a-f]{41,}/i);
$space = 0 if ($line =~ /\bcommit [0-9a-f]/i);
@@ -2738,6 +2751,8 @@ sub process {
}
}
+# Block comment styles
+# Networking with an initial /*
if ($realfile =~ m@^(drivers/net/|net/)@ &&
$prevrawline =~ /^\+[ \t]*\/\*[ \t]*$/ &&
$rawline =~ /^\+[ \t]*\*/ &&
@@ -2746,22 +2761,23 @@ sub process {
"networking block comments don't use an empty /* line, use /* Comment...\n" . $hereprev);
}
- if ($realfile =~ m@^(drivers/net/|net/)@ &&
- $prevrawline =~ /^\+[ \t]*\/\*/ && #starting /*
+# Block comments use * on subsequent lines
+ if ($prevline =~ /$;[ \t]*$/ && #ends in comment
+ $prevrawline =~ /^\+.*?\/\*/ && #starting /*
$prevrawline !~ /\*\/[ \t]*$/ && #no trailing */
$rawline =~ /^\+/ && #line is new
$rawline !~ /^\+[ \t]*\*/) { #no leading *
- WARN("NETWORKING_BLOCK_COMMENT_STYLE",
- "networking block comments start with * on subsequent lines\n" . $hereprev);
+ WARN("BLOCK_COMMENT_STYLE",
+ "Block comments use * on subsequent lines\n" . $hereprev);
}
- if ($realfile =~ m@^(drivers/net/|net/)@ &&
- $rawline !~ m@^\+[ \t]*\*/[ \t]*$@ && #trailing */
+# Block comments use */ on trailing lines
+ if ($rawline !~ m@^\+[ \t]*\*/[ \t]*$@ && #trailing */
$rawline !~ m@^\+.*/\*.*\*/[ \t]*$@ && #inline /*...*/
$rawline !~ m@^\+.*\*{2,}/[ \t]*$@ && #trailing **/
$rawline =~ m@^\+[ \t]*.+\*\/[ \t]*$@) { #non blank */
- WARN("NETWORKING_BLOCK_COMMENT_STYLE",
- "networking block comments put the trailing */ on a separate line\n" . $herecurr);
+ WARN("BLOCK_COMMENT_STYLE",
+ "Block comments use a trailing */ on a separate line\n" . $herecurr);
}
# check for missing blank lines after struct/union declarations
@@ -3067,15 +3083,22 @@ sub process {
substr($s, 0, length($c), '');
- # Make sure we remove the line prefixes as we have
- # none on the first line, and are going to readd them
- # where necessary.
- $s =~ s/\n./\n/gs;
+ # remove inline comments
+ $s =~ s/$;/ /g;
+ $c =~ s/$;/ /g;
# Find out how long the conditional actually is.
my @newlines = ($c =~ /\n/gs);
my $cond_lines = 1 + $#newlines;
+ # Make sure we remove the line prefixes as we have
+ # none on the first line, and are going to readd them
+ # where necessary.
+ $s =~ s/\n./\n/gs;
+ while ($s =~ /\n\s+\\\n/) {
+ $cond_lines += $s =~ s/\n\s+\\\n/\n/g;
+ }
+
# We want to check the first line inside the block
# starting at the end of the conditional, so remove:
# 1) any blank line termination
@@ -3141,8 +3164,10 @@ sub process {
#print "line<$line> prevline<$prevline> indent<$indent> sindent<$sindent> check<$check> continuation<$continuation> s<$s> cond_lines<$cond_lines> stat_real<$stat_real> stat<$stat>\n";
- if ($check && (($sindent % 8) != 0 ||
- ($sindent <= $indent && $s ne ''))) {
+ if ($check && $s ne '' &&
+ (($sindent % 8) != 0 ||
+ ($sindent < $indent) ||
+ ($sindent > $indent + 8))) {
WARN("SUSPECT_CODE_INDENT",
"suspect code indent for conditional statements ($indent, $sindent)\n" . $herecurr . "$stat_real\n");
}
@@ -3439,13 +3464,15 @@ sub process {
}
}
-# # no BUG() or BUG_ON()
-# if ($line =~ /\b(BUG|BUG_ON)\b/) {
-# print "Try to use WARN_ON & Recovery code rather than BUG() or BUG_ON()\n";
-# print "$herecurr";
-# $clean = 0;
-# }
+# avoid BUG() or BUG_ON()
+ if ($line =~ /\b(?:BUG|BUG_ON)\b/) {
+ my $msg_type = \&WARN;
+ $msg_type = \&CHK if ($file);
+ &{$msg_type}("AVOID_BUG",
+ "Avoid crashing the kernel - try using WARN_ON & recovery code rather than BUG() or BUG_ON()\n" . $herecurr);
+ }
+# avoid LINUX_VERSION_CODE
if ($line =~ /\bLINUX_VERSION_CODE\b/) {
WARN("LINUX_VERSION_CODE",
"LINUX_VERSION_CODE should be avoided, code should be for the version to which it is merged\n" . $herecurr);
@@ -4816,10 +4843,34 @@ sub process {
# check for needless "if (<foo>) fn(<foo>)" uses
if ($prevline =~ /\bif\s*\(\s*($Lval)\s*\)/) {
- my $expr = '\s*\(\s*' . quotemeta($1) . '\s*\)\s*;';
- if ($line =~ /\b(kfree|usb_free_urb|debugfs_remove(?:_recursive)?)$expr/) {
- WARN('NEEDLESS_IF',
- "$1(NULL) is safe and this check is probably not required\n" . $hereprev);
+ my $tested = quotemeta($1);
+ my $expr = '\s*\(\s*' . $tested . '\s*\)\s*;';
+ if ($line =~ /\b(kfree|usb_free_urb|debugfs_remove(?:_recursive)?|(?:kmem_cache|mempool|dma_pool)_destroy)$expr/) {
+ my $func = $1;
+ if (WARN('NEEDLESS_IF',
+ "$func(NULL) is safe and this check is probably not required\n" . $hereprev) &&
+ $fix) {
+ my $do_fix = 1;
+ my $leading_tabs = "";
+ my $new_leading_tabs = "";
+ if ($lines[$linenr - 2] =~ /^\+(\t*)if\s*\(\s*$tested\s*\)\s*$/) {
+ $leading_tabs = $1;
+ } else {
+ $do_fix = 0;
+ }
+ if ($lines[$linenr - 1] =~ /^\+(\t+)$func\s*\(\s*$tested\s*\)\s*;\s*$/) {
+ $new_leading_tabs = $1;
+ if (length($leading_tabs) + 1 ne length($new_leading_tabs)) {
+ $do_fix = 0;
+ }
+ } else {
+ $do_fix = 0;
+ }
+ if ($do_fix) {
+ fix_delete_line($fixlinenr - 1, $prevrawline);
+ $fixed[$fixlinenr] =~ s/^\+$new_leading_tabs/\+$leading_tabs/;
+ }
+ }
}
}
@@ -5517,10 +5568,10 @@ sub process {
"consider using a completion\n" . $herecurr);
}
-# recommend kstrto* over simple_strto* and strict_strto*
- if ($line =~ /\b((simple|strict)_(strto(l|ll|ul|ull)))\s*\(/) {
+# simple_strto*() is deprecated
+ if ($line =~ /\b(simple_strto(l|ll|ul|ull))\s*\(/) {
WARN("CONSIDER_KSTRTO",
- "$1 is obsolete, use k$3 instead\n" . $herecurr);
+ "$1 is obsolete, use parse_integer(), kstrto*(), kstrto*_from_user(), sscanf() instead\n" . $herecurr);
}
# check for __initcall(), use device_initcall() explicitly or more appropriate function please
diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh
index 515c4c00e957..00d6d53c2681 100755
--- a/scripts/decode_stacktrace.sh
+++ b/scripts/decode_stacktrace.sh
@@ -14,11 +14,14 @@ declare -A cache
parse_symbol() {
# The structure of symbol at this point is:
- # [name]+[offset]/[total length]
+ # ([name]+[offset]/[total length])
#
# For example:
# do_basic_setup+0x9c/0xbf
+ # Remove the enclosing parentheses
+ symbol=${symbol#\(}
+ symbol=${symbol%\)}
# Strip the symbol name so that we could look it up
local name=${symbol%+*}
diff --git a/scripts/kernel-doc b/scripts/kernel-doc
index 0ac1a07874cc..b1a074b409f5 100755
--- a/scripts/kernel-doc
+++ b/scripts/kernel-doc
@@ -432,7 +432,7 @@ sub dump_section {
} else {
# print STDERR "other section '$name' = '$contents'\n";
if (defined($sections{$name}) && ($sections{$name} ne "")) {
- print STDERR "Error(${file}:$.): duplicate section name '$name'\n";
+ print STDERR "${file}:$.: error: duplicate section name '$name'\n";
++$errors;
}
$sections{$name} = $contents;
@@ -1781,7 +1781,7 @@ sub dump_struct($$) {
});
}
else {
- print STDERR "Error(${file}:$.): Cannot parse struct or union!\n";
+ print STDERR "${file}:$.: error: Cannot parse struct or union!\n";
++$errors;
}
}
@@ -1802,7 +1802,7 @@ sub dump_enum($$) {
push @parameterlist, $arg;
if (!$parameterdescs{$arg}) {
$parameterdescs{$arg} = $undescribed;
- print STDERR "Warning(${file}:$.): Enum value '$arg' ".
+ print STDERR "${file}:$.: warning: Enum value '$arg' ".
"not described in enum '$declaration_name'\n";
}
@@ -1820,7 +1820,7 @@ sub dump_enum($$) {
});
}
else {
- print STDERR "Error(${file}:$.): Cannot parse enum!\n";
+ print STDERR "${file}:$.: error: Cannot parse enum!\n";
++$errors;
}
}
@@ -1848,7 +1848,7 @@ sub dump_typedef($$) {
});
}
else {
- print STDERR "Error(${file}:$.): Cannot parse typedef!\n";
+ print STDERR "${file}:$.: error: Cannot parse typedef!\n";
++$errors;
}
}
@@ -1980,11 +1980,11 @@ sub push_parameter($$$) {
$parameterdescs{$param_name} = $undescribed;
if (($type eq 'function') || ($type eq 'enum')) {
- print STDERR "Warning(${file}:$.): Function parameter ".
+ print STDERR "${file}:$.: warning: Function parameter ".
"or member '$param' not " .
"described in '$declaration_name'\n";
}
- print STDERR "Warning(${file}:$.):" .
+ print STDERR "${file}:$.: warning:" .
" No description found for parameter '$param'\n";
++$warnings;
}
@@ -2035,14 +2035,14 @@ sub check_sections($$$$$$) {
}
if ($err) {
if ($decl_type eq "function") {
- print STDERR "Warning(${file}:$.): " .
+ print STDERR "${file}:$.: warning: " .
"Excess function parameter " .
"'$sects[$sx]' " .
"description in '$decl_name'\n";
++$warnings;
} else {
if ($nested !~ m/\Q$sects[$sx]\E/) {
- print STDERR "Warning(${file}:$.): " .
+ print STDERR "${file}:$.: warning: " .
"Excess struct/union/enum/typedef member " .
"'$sects[$sx]' " .
"description in '$decl_name'\n";
@@ -2068,7 +2068,7 @@ sub check_return_section {
if (!defined($sections{$section_return}) ||
$sections{$section_return} eq "") {
- print STDERR "Warning(${file}:$.): " .
+ print STDERR "${file}:$.: warning: " .
"No description found for return value of " .
"'$declaration_name'\n";
++$warnings;
@@ -2147,7 +2147,7 @@ sub dump_function($$) {
create_parameterlist($args, ',', $file);
} else {
- print STDERR "Warning(${file}:$.): cannot understand function prototype: '$prototype'\n";
+ print STDERR "${file}:$.: warning: cannot understand function prototype: '$prototype'\n";
return;
}
@@ -2211,7 +2211,7 @@ sub tracepoint_munge($) {
$tracepointargs = $1;
}
if (($tracepointname eq 0) || ($tracepointargs eq 0)) {
- print STDERR "Warning(${file}:$.): Unrecognized tracepoint format: \n".
+ print STDERR "${file}:$.: warning: Unrecognized tracepoint format: \n".
"$prototype\n";
} else {
$prototype = "static inline void trace_$tracepointname($tracepointargs)";
@@ -2410,7 +2410,7 @@ sub process_file($) {
}
if (($declaration_purpose eq "") && $verbose) {
- print STDERR "Warning(${file}:$.): missing initial short description on line:\n";
+ print STDERR "${file}:$.: warning: missing initial short description on line:\n";
print STDERR $_;
++$warnings;
}
@@ -2428,10 +2428,10 @@ sub process_file($) {
}
if ($verbose) {
- print STDERR "Info(${file}:$.): Scanning doc for $identifier\n";
+ print STDERR "${file}:$.: info: Scanning doc for $identifier\n";
}
} else {
- print STDERR "Warning(${file}:$.): Cannot understand $_ on line $.",
+ print STDERR "${file}:$.: warning: Cannot understand $_ on line $.",
" - I thought it was a doc line\n";
++$warnings;
$state = 0;
@@ -2443,7 +2443,7 @@ sub process_file($) {
if (($contents ne "") && ($contents ne "\n")) {
if (!$in_doc_sect && $verbose) {
- print STDERR "Warning(${file}:$.): contents before sections\n";
+ print STDERR "${file}:$.: warning: contents before sections\n";
++$warnings;
}
dump_section($file, $section, xml_escape($contents));
@@ -2470,7 +2470,7 @@ sub process_file($) {
}
# look for doc_com + <text> + doc_end:
if ($_ =~ m'\s*\*\s*[a-zA-Z_0-9:\.]+\*/') {
- print STDERR "Warning(${file}:$.): suspicious ending line: $_";
+ print STDERR "${file}:$.: warning: suspicious ending line: $_";
++$warnings;
}
@@ -2500,7 +2500,7 @@ sub process_file($) {
}
} else {
# i dont know - bad line? ignore.
- print STDERR "Warning(${file}:$.): bad line: $_";
+ print STDERR "${file}:$.: warning: bad line: $_";
++$warnings;
}
} elsif ($state == 3) { # scanning for function '{' (end of prototype)
@@ -2556,7 +2556,7 @@ sub process_file($) {
}
}
if ($initial_section_counter == $section_counter) {
- print STDERR "Warning(${file}): no structured comments found\n";
+ print STDERR "${file}:1: warning: no structured comments found\n";
if (($function_only == 1) && ($show_not_found == 1)) {
print STDERR " Was looking for '$_'.\n" for keys %function_table;
}
diff --git a/scripts/spelling.txt b/scripts/spelling.txt
index bb8e4d0a1911..4bd8d1a3415f 100644
--- a/scripts/spelling.txt
+++ b/scripts/spelling.txt
@@ -187,6 +187,7 @@ capatibilities||capabilities
carefuly||carefully
cariage||carriage
catagory||category
+cehck||check
challange||challenge
challanges||challenges
chanell||channel
@@ -199,6 +200,8 @@ charactor||character
charater||character
charaters||characters
charcter||character
+chcek||check
+chck||check
checksuming||checksumming
childern||children
childs||children
@@ -1028,6 +1031,7 @@ visiters||visitors
vitual||virtual
wating||waiting
whataver||whatever
+whcih||which
whenver||whenever
wheter||whether
whe||when
diff --git a/security/commoncap.c b/security/commoncap.c
index d103f5a4043d..1832cf701c3d 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -267,6 +267,16 @@ int cap_capset(struct cred *new,
new->cap_effective = *effective;
new->cap_inheritable = *inheritable;
new->cap_permitted = *permitted;
+
+ /*
+ * Mask off ambient bits that are no longer both permitted and
+ * inheritable.
+ */
+ new->cap_ambient = cap_intersect(new->cap_ambient,
+ cap_intersect(*permitted,
+ *inheritable));
+ if (WARN_ON(!cap_ambient_invariant_ok(new)))
+ return -EINVAL;
return 0;
}
@@ -347,6 +357,7 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
/*
* pP' = (X & fP) | (pI & fI)
+ * The addition of pA' is handled later.
*/
new->cap_permitted.cap[i] =
(new->cap_bset.cap[i] & permitted) |
@@ -474,10 +485,13 @@ int cap_bprm_set_creds(struct linux_binprm *bprm)
{
const struct cred *old = current_cred();
struct cred *new = bprm->cred;
- bool effective, has_cap = false;
+ bool effective, has_cap = false, is_setid;
int ret;
kuid_t root_uid;
+ if (WARN_ON(!cap_ambient_invariant_ok(old)))
+ return -EPERM;
+
effective = false;
ret = get_file_caps(bprm, &effective, &has_cap);
if (ret < 0)
@@ -522,8 +536,9 @@ skip:
*
* In addition, if NO_NEW_PRIVS, then ensure we get no new privs.
*/
- if ((!uid_eq(new->euid, old->uid) ||
- !gid_eq(new->egid, old->gid) ||
+ is_setid = !uid_eq(new->euid, old->uid) || !gid_eq(new->egid, old->gid);
+
+ if ((is_setid ||
!cap_issubset(new->cap_permitted, old->cap_permitted)) &&
bprm->unsafe & ~LSM_UNSAFE_PTRACE_CAP) {
/* downgrade; they get no more than they had, and maybe less */
@@ -539,10 +554,28 @@ skip:
new->suid = new->fsuid = new->euid;
new->sgid = new->fsgid = new->egid;
+ /* File caps or setid cancels ambient. */
+ if (has_cap || is_setid)
+ cap_clear(new->cap_ambient);
+
+ /*
+ * Now that we've computed pA', update pP' to give:
+ * pP' = (X & fP) | (pI & fI) | pA'
+ */
+ new->cap_permitted = cap_combine(new->cap_permitted, new->cap_ambient);
+
+ /*
+ * Set pE' = (fE ? pP' : pA'). Because pA' is zero if fE is set,
+ * this is the same as pE' = (fE ? pP' : 0) | pA'.
+ */
if (effective)
new->cap_effective = new->cap_permitted;
else
- cap_clear(new->cap_effective);
+ new->cap_effective = new->cap_ambient;
+
+ if (WARN_ON(!cap_ambient_invariant_ok(new)))
+ return -EPERM;
+
bprm->cap_effective = effective;
/*
@@ -557,7 +590,7 @@ skip:
* Number 1 above might fail if you don't have a full bset, but I think
* that is interesting information to audit.
*/
- if (!cap_isclear(new->cap_effective)) {
+ if (!cap_issubset(new->cap_effective, new->cap_ambient)) {
if (!cap_issubset(CAP_FULL_SET, new->cap_effective) ||
!uid_eq(new->euid, root_uid) || !uid_eq(new->uid, root_uid) ||
issecure(SECURE_NOROOT)) {
@@ -568,6 +601,10 @@ skip:
}
new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
+
+ if (WARN_ON(!cap_ambient_invariant_ok(new)))
+ return -EPERM;
+
return 0;
}
@@ -589,7 +626,7 @@ int cap_bprm_secureexec(struct linux_binprm *bprm)
if (!uid_eq(cred->uid, root_uid)) {
if (bprm->cap_effective)
return 1;
- if (!cap_isclear(cred->cap_permitted))
+ if (!cap_issubset(cred->cap_permitted, cred->cap_ambient))
return 1;
}
@@ -691,10 +728,18 @@ static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old)
uid_eq(old->suid, root_uid)) &&
(!uid_eq(new->uid, root_uid) &&
!uid_eq(new->euid, root_uid) &&
- !uid_eq(new->suid, root_uid)) &&
- !issecure(SECURE_KEEP_CAPS)) {
- cap_clear(new->cap_permitted);
- cap_clear(new->cap_effective);
+ !uid_eq(new->suid, root_uid))) {
+ if (!issecure(SECURE_KEEP_CAPS)) {
+ cap_clear(new->cap_permitted);
+ cap_clear(new->cap_effective);
+ }
+
+ /*
+ * Pre-ambient programs expect setresuid to nonroot followed
+ * by exec to drop capabilities. We should make sure that
+ * this remains the case.
+ */
+ cap_clear(new->cap_ambient);
}
if (uid_eq(old->euid, root_uid) && !uid_eq(new->euid, root_uid))
cap_clear(new->cap_effective);
@@ -924,6 +969,44 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
return commit_creds(new);
+ case PR_CAP_AMBIENT:
+ if (arg2 == PR_CAP_AMBIENT_CLEAR_ALL) {
+ if (arg3 | arg4 | arg5)
+ return -EINVAL;
+
+ new = prepare_creds();
+ if (!new)
+ return -ENOMEM;
+ cap_clear(new->cap_ambient);
+ return commit_creds(new);
+ }
+
+ if (((!cap_valid(arg3)) | arg4 | arg5))
+ return -EINVAL;
+
+ if (arg2 == PR_CAP_AMBIENT_IS_SET) {
+ return !!cap_raised(current_cred()->cap_ambient, arg3);
+ } else if (arg2 != PR_CAP_AMBIENT_RAISE &&
+ arg2 != PR_CAP_AMBIENT_LOWER) {
+ return -EINVAL;
+ } else {
+ if (arg2 == PR_CAP_AMBIENT_RAISE &&
+ (!cap_raised(current_cred()->cap_permitted, arg3) ||
+ !cap_raised(current_cred()->cap_inheritable,
+ arg3) ||
+ issecure(SECURE_NO_CAP_AMBIENT_RAISE)))
+ return -EPERM;
+
+ new = prepare_creds();
+ if (!new)
+ return -ENOMEM;
+ if (arg2 == PR_CAP_AMBIENT_RAISE)
+ cap_raise(new->cap_ambient, arg3);
+ else
+ cap_lower(new->cap_ambient, arg3);
+ return commit_creds(new);
+ }
+
default:
/* No functionality available - continue with default */
return -ENOSYS;
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index bd536cb221e2..43b4cddbf2b3 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -848,6 +848,7 @@ void key_change_session_keyring(struct callback_head *twork)
new->cap_inheritable = old->cap_inheritable;
new->cap_permitted = old->cap_permitted;
new->cap_effective = old->cap_effective;
+ new->cap_ambient = old->cap_ambient;
new->cap_bset = old->cap_bset;
new->jit_keyring = old->jit_keyring;
diff --git a/sound/core/oss/mixer_oss.c b/sound/core/oss/mixer_oss.c
index a99f7200ff3f..20b66efa26ba 100644
--- a/sound/core/oss/mixer_oss.c
+++ b/sound/core/oss/mixer_oss.c
@@ -1200,7 +1200,7 @@ static void snd_mixer_oss_proc_write(struct snd_info_entry *entry,
continue;
}
snd_info_get_str(idxstr, cptr, sizeof(idxstr));
- idx = simple_strtoul(idxstr, NULL, 10);
+ parse_integer(idxstr, 10, (unsigned int *)&idx);
if (idx >= 0x4000) { /* too big */
pr_err("ALSA: mixer_oss: invalid index %d\n", idx);
continue;
diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c
index 58550cc93f28..a214fec0addc 100644
--- a/sound/core/oss/pcm_oss.c
+++ b/sound/core/oss/pcm_oss.c
@@ -2892,9 +2892,9 @@ static void snd_pcm_oss_proc_write(struct snd_info_entry *entry,
}
}
ptr = snd_info_get_str(str, ptr, sizeof(str));
- template.periods = simple_strtoul(str, NULL, 10);
+ parse_integer(str, 10, &template.periods);
ptr = snd_info_get_str(str, ptr, sizeof(str));
- template.period_size = simple_strtoul(str, NULL, 10);
+ parse_integer(str, 10, &template.period_size);
for (idx1 = 31; idx1 >= 0; idx1--)
if (template.period_size & (1 << idx1))
break;
diff --git a/sound/core/pcm.c b/sound/core/pcm.c
index 02bd96954dc4..b563617de311 100644
--- a/sound/core/pcm.c
+++ b/sound/core/pcm.c
@@ -507,7 +507,7 @@ static void snd_pcm_xrun_debug_write(struct snd_info_entry *entry,
struct snd_pcm_str *pstr = entry->private_data;
char line[64];
if (!snd_info_get_line(buffer, line, sizeof(line)))
- pstr->xrun_debug = simple_strtoul(line, NULL, 10);
+ parse_integer(line, 10, &pstr->xrun_debug);
}
#endif
diff --git a/sound/core/pcm_memory.c b/sound/core/pcm_memory.c
index b45f6aa32264..84186dc5eb0f 100644
--- a/sound/core/pcm_memory.c
+++ b/sound/core/pcm_memory.c
@@ -167,7 +167,8 @@ static void snd_pcm_lib_preallocate_proc_write(struct snd_info_entry *entry,
}
if (!snd_info_get_line(buffer, line, sizeof(line))) {
snd_info_get_str(str, line, sizeof(str));
- size = simple_strtoul(str, NULL, 10) * 1024;
+ parse_integer(str, 10, &size);
+ size *= 1024;
if ((size != 0 && size < 8192) || size > substream->dma_max) {
buffer->error = -EINVAL;
return;
diff --git a/sound/pci/ac97/ac97_codec.c b/sound/pci/ac97/ac97_codec.c
index 82259ca61e64..3c9dfea1489a 100644
--- a/sound/pci/ac97/ac97_codec.c
+++ b/sound/pci/ac97/ac97_codec.c
@@ -2884,8 +2884,12 @@ static int apply_quirk_str(struct snd_ac97 *ac97, const char *typestr)
return apply_quirk(ac97, i);
}
/* for compatibility, accept the numbers, too */
- if (*typestr >= '0' && *typestr <= '9')
- return apply_quirk(ac97, (int)simple_strtoul(typestr, NULL, 10));
+ if (*typestr >= '0' && *typestr <= '9') {
+ int type;
+
+ parse_integer(typestr, 10, &type);
+ return apply_quirk(ac97, type);
+ }
return -EINVAL;
}
diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c
index 1b63a03a1f57..c81aec9c872a 100644
--- a/sound/soc/soc-core.c
+++ b/sound/soc/soc-core.c
@@ -250,7 +250,7 @@ static ssize_t codec_reg_write_file(struct file *file,
char buf[32];
size_t buf_size;
char *start = buf;
- unsigned long reg, value;
+ unsigned int reg, value;
struct snd_soc_codec *codec = file->private_data;
int ret;
@@ -261,10 +261,13 @@ static ssize_t codec_reg_write_file(struct file *file,
while (*start == ' ')
start++;
- reg = simple_strtoul(start, &start, 16);
+ ret = parse_integer(start, 16, &reg);
+ if (ret < 0)
+ return ret;
+ start += ret;
while (*start == ' ')
start++;
- ret = kstrtoul(start, 16, &value);
+ ret = kstrtouint(start, 16, &value);
if (ret)
return ret;
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index 231b9a031f6a..c9fcd312154c 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -5,13 +5,19 @@ BINARIES = compaction_test
BINARIES += hugepage-mmap
BINARIES += hugepage-shm
BINARIES += hugetlbfstest
+BINARIES += lock-on-fault
BINARIES += map_hugetlb
+BINARIES += mlock2-tests
+BINARIES += on-fault-limit
BINARIES += thuge-gen
BINARIES += transhuge-stress
+BINARIES += userfaultfd
all: $(BINARIES)
%: %.c
$(CC) $(CFLAGS) -o $@ $^ -lrt
+userfaultfd: userfaultfd.c
+ $(CC) $(CFLAGS) -O2 -o $@ $^ -lpthread
TEST_PROGS := run_vmtests
TEST_FILES := $(BINARIES)
diff --git a/tools/testing/selftests/vm/lock-on-fault.c b/tools/testing/selftests/vm/lock-on-fault.c
new file mode 100644
index 000000000000..f02c9fbdfed6
--- /dev/null
+++ b/tools/testing/selftests/vm/lock-on-fault.c
@@ -0,0 +1,344 @@
+#include <sys/mman.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <errno.h>
+
+struct vm_boundaries { /* address range of one /proc/self/maps entry, as filled in by get_vm_area() */
+ unsigned long start; /* start address parsed from the text before the '-' */
+ unsigned long end; /* end address parsed from the text after the '-'; get_vm_area() treats it as exclusive (end > addr) */
+};
+
+static int get_vm_area(unsigned long addr, struct vm_boundaries *area)
+{ /* Find the /proc/self/maps entry containing addr and store its bounds in *area; returns 0 if found, 1 otherwise. */
+ FILE *file;
+ int ret = 1;
+ char line[1024] = {0};
+ char *end_addr;
+ char *stop;
+ unsigned long start;
+ unsigned long end;
+
+ if (!area)
+ return ret;
+
+ file = fopen("/proc/self/maps", "r");
+ if (!file) {
+ perror("fopen");
+ return ret;
+ }
+
+ memset(area, 0, sizeof(struct vm_boundaries)); /* *area stays zeroed when no entry matches */
+
+ while (fgets(line, sizeof(line), file)) {
+ end_addr = strchr(line, '-'); /* each entry is expected to begin with "<start>-<end> " */
+ if (!end_addr) {
+ printf("cannot parse /proc/self/maps\n");
+ goto out;
+ }
+ *end_addr = '\0'; /* terminate the start address in place */
+ end_addr++;
+ stop = strchr(end_addr, ' ');
+ if (!stop) {
+ printf("cannot parse /proc/self/maps\n");
+ goto out;
+ }
+ *stop = '\0'; /* terminate the end address; write through the pointer rather than overwriting it */
+
+ if (sscanf(line, "%lx", &start) != 1 || sscanf(end_addr, "%lx", &end) != 1)
+ goto out; /* unparsable addresses: report "not found" instead of comparing uninitialized values */
+
+ if (start <= addr && end > addr) {
+ area->start = start;
+ area->end = end;
+ ret = 0;
+ goto out;
+ }
+ }
+out:
+ fclose(file);
+ return ret;
+}
+
+static unsigned long get_pageflags(unsigned long addr)
+{ /* Return the raw /proc/self/pagemap entry for the page containing addr; calls _exit(1) on any I/O failure. */
+ FILE *file;
+ unsigned long pfn; /* holds the whole entry, not only a PFN: callers mask it with PRESENT_BIT and PFN_MASK */
+ unsigned long offset;
+
+ file = fopen("/proc/self/pagemap", "r");
+ if (!file) {
+ perror("fopen");
+ _exit(1);
+ }
+
+ offset = addr / getpagesize() * sizeof(unsigned long); /* page index times entry size; NOTE(review): assumes sizeof(unsigned long) matches the pagemap entry size - confirm on 32-bit */
+ if (fseek(file, offset, SEEK_SET)) {
+ perror("fseek");
+ _exit(1);
+ }
+
+ if (fread(&pfn, sizeof(unsigned long), 1, file) != 1) {
+ perror("fread");
+ _exit(1);
+ }
+
+ fclose(file);
+ return pfn;
+}
+
+static unsigned long get_kpageflags(unsigned long pfn)
+{ /* Return the /proc/kpageflags word stored at index pfn; calls _exit(1) on any I/O failure. */
+ unsigned long flags;
+ FILE *file;
+
+ file = fopen("/proc/kpageflags", "r");
+ if (!file) {
+ perror("fopen");
+ _exit(1);
+ }
+
+ if (fseek(file, pfn * sizeof(unsigned long), SEEK_SET)) { /* the file is read as an array of unsigned long indexed by pfn */
+ perror("fseek");
+ _exit(1);
+ }
+
+ if (fread(&flags, sizeof(unsigned long), 1, file) != 1) {
+ perror("fread");
+ _exit(1);
+ }
+
+ fclose(file);
+ return flags;
+}
+
+#define PRESENT_BIT 0x8000000000000000
+#define PFN_MASK 0x007FFFFFFFFFFFFF
+#define UNEVICTABLE_BIT (1UL << 18)
+
+static int test_mmap(int flags)
+{ /* Map two pages with the given mmap flags, touch only the first, then check presence and the unevictable bit; returns 0 on success, 1 on failure. */
+ unsigned long page1_flags;
+ unsigned long page2_flags;
+ void *map;
+ unsigned long page_size = getpagesize();
+
+ map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, flags, 0, 0); /* fd and offset are both passed as 0 */
+ if (map == MAP_FAILED) {
+ perror("mmap()");
+ return 1;
+ }
+
+ /* Write something into the first page to ensure it is present */
+ *(char *)map = 1;
+
+ page1_flags = get_pageflags((unsigned long)map);
+ page2_flags = get_pageflags((unsigned long)map + page_size);
+
+ /* page2_flags should not be present */
+ if (page2_flags & PRESENT_BIT) {
+ printf("page map says 0x%lx\n", page2_flags);
+ printf("present is 0x%lx\n", PRESENT_BIT);
+ return 1; /* NOTE(review): this and the failure returns below leave the mapping in place (no munmap) */
+ }
+
+ /* page1_flags should be present */
+ if ((page1_flags & PRESENT_BIT) == 0) {
+ printf("page map says 0x%lx\n", page1_flags);
+ printf("present is 0x%lx\n", PRESENT_BIT);
+ return 1;
+ }
+
+ page1_flags = get_kpageflags(page1_flags & PFN_MASK); /* variable is reused: from here on it holds kpageflags, not the pagemap entry */
+
+ /* page1_flags now contains the entry from kpageflags for the first
+ * page, the unevictable bit should be set */
+ if ((page1_flags & UNEVICTABLE_BIT) == 0) {
+ printf("kpageflags says 0x%lx\n", page1_flags);
+ printf("unevictable is 0x%lx\n", UNEVICTABLE_BIT);
+ return 1;
+ }
+
+ munmap(map, 2 * page_size);
+ return 0;
+}
+
+static int test_munlock(int flags)
+{ /* Map three pages, munlock() the middle one, fault all three in, and check that only pages 1 and 3 are unevictable; returns 0 on success, 1 on failure. */
+ int ret = 1;
+ void *map;
+ unsigned long page1_flags;
+ unsigned long page2_flags;
+ unsigned long page3_flags;
+ unsigned long page_size = getpagesize();
+
+ map = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, flags, 0, 0);
+ if (map == MAP_FAILED) {
+ perror("mmap()");
+ return ret;
+ }
+
+ if (munlock(map + page_size, page_size)) { /* unlock only the second page; arithmetic on void * relies on a compiler extension */
+ perror("munlock()");
+ goto out;
+ }
+
+ page1_flags = get_pageflags((unsigned long)map);
+ page2_flags = get_pageflags((unsigned long)map + page_size);
+ page3_flags = get_pageflags((unsigned long)map + page_size * 2);
+
+ /* No pages should be present */
+ if ((page1_flags & PRESENT_BIT) || (page2_flags & PRESENT_BIT) ||
+ (page3_flags & PRESENT_BIT)) {
+ printf("Page was made present by munlock()\n");
+ goto out;
+ }
+
+ /* Write something to each page so that they are faulted in */
+ *(char*)map = 1;
+ *(char*)(map + page_size) = 1;
+ *(char*)(map + page_size * 2) = 1;
+
+ page1_flags = get_pageflags((unsigned long)map);
+ page2_flags = get_pageflags((unsigned long)map + page_size);
+ page3_flags = get_pageflags((unsigned long)map + page_size * 2);
+
+ page1_flags = get_kpageflags(page1_flags & PFN_MASK); /* from here on the variables hold kpageflags words, not pagemap entries */
+ page2_flags = get_kpageflags(page2_flags & PFN_MASK);
+ page3_flags = get_kpageflags(page3_flags & PFN_MASK);
+
+ /* Pages 1 and 3 should be unevictable */
+ if (!(page1_flags & UNEVICTABLE_BIT)) {
+ printf("Missing unevictable bit on lock on fault page1\n");
+ goto out;
+ }
+ if (!(page3_flags & UNEVICTABLE_BIT)) {
+ printf("Missing unevictable bit on lock on fault page3\n");
+ goto out;
+ }
+
+ /* Page 2 should not be unevictable */
+ if (page2_flags & UNEVICTABLE_BIT) {
+ printf("Unlocked page is still marked unevictable\n");
+ goto out;
+ }
+
+ ret = 0;
+
+out:
+ munmap(map, 3 * page_size);
+ return ret;
+}
+
+static int test_vma_management(int flags)
+{ /* Check via /proc/self/maps that munlock() of the middle page splits a three-page mapping and that unlocking the whole range merges it again; returns 0 on success or skip, 1 on failure. */
+ int ret = 1;
+ void *map;
+ unsigned long page_size = getpagesize();
+ struct vm_boundaries page1;
+ struct vm_boundaries page2;
+ struct vm_boundaries page3;
+
+ map = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, flags, 0, 0);
+ if (map == MAP_FAILED) {
+ perror("mmap()");
+ return ret;
+ }
+
+ if (get_vm_area((unsigned long)map, &page1) ||
+ get_vm_area((unsigned long)map + page_size, &page2) ||
+ get_vm_area((unsigned long)map + page_size * 2, &page3)) {
+ printf("couldn't find mapping in /proc/self/maps\n");
+ goto out;
+ }
+
+ /*
+ * Before we unlock a portion, we need to check that all three pages are
+ * in the same VMA. If they are not, we abort this test (note that this
+ * is not a failure).
+ */
+ if (page1.start != page2.start || page2.start != page3.start) {
+ printf("VMAs are not merged to start, aborting test\n");
+ ret = 0;
+ goto out;
+ }
+
+ if (munlock(map + page_size, page_size)) { /* unlock only the middle page */
+ perror("munlock()");
+ goto out;
+ }
+
+ if (get_vm_area((unsigned long)map, &page1) ||
+ get_vm_area((unsigned long)map + page_size, &page2) ||
+ get_vm_area((unsigned long)map + page_size * 2, &page3)) {
+ printf("couldn't find mapping in /proc/self/maps\n");
+ goto out;
+ }
+
+ /* All three VMAs should be different */
+ if (page1.start == page2.start || page2.start == page3.start) {
+ printf("failed to split VMA for munlock\n");
+ goto out;
+ }
+
+ /* Now unlock the first and third page and check the VMAs again */
+ if (munlock(map, page_size * 3)) { /* the call covers the whole three-page range */
+ perror("munlock()");
+ goto out;
+ }
+
+ if (get_vm_area((unsigned long)map, &page1) ||
+ get_vm_area((unsigned long)map + page_size, &page2) ||
+ get_vm_area((unsigned long)map + page_size * 2, &page3)) {
+ printf("couldn't find mapping in /proc/self/maps\n");
+ goto out;
+ }
+
+ /* Now all three VMAs should be the same */
+ if (page1.start != page2.start || page2.start != page3.start) {
+ printf("failed to merge VMAs after munlock\n");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ munmap(map, 3 * page_size);
+ return ret;
+}
+
+#ifndef MCL_ONFAULT
+#define MCL_ONFAULT (MCL_FUTURE << 1)
+#endif
+
+static int test_mlockall(int (test_function)(int flags))
+{ /* Run test_function on a plain private anonymous mapping while mlockall(MCL_ONFAULT) is in effect; returns its result, or 1 if mlockall() fails. */
+ int ret = 1;
+
+ if (mlockall(MCL_ONFAULT)) {
+ perror("mlockall");
+ return ret;
+ }
+
+ ret = test_function(MAP_PRIVATE | MAP_ANONYMOUS); /* no MAP_LOCKONFAULT here: the mlockall() call above is what is being exercised */
+ munlockall(); /* undo the mlockall() before returning to the caller */
+ return ret;
+}
+
+#ifndef MAP_LOCKONFAULT
+#define MAP_LOCKONFAULT (MAP_HUGETLB << 1)
+#endif
+
+int main(int argc, char **argv)
+{ /* Run each test twice, once with MAP_LOCKONFAULT mappings and once under mlockall(MCL_ONFAULT); the exit status is the number of failed runs. */
+ int ret = 0;
+ ret += test_mmap(MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKONFAULT);
+ ret += test_mlockall(test_mmap);
+ ret += test_munlock(MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKONFAULT);
+ ret += test_mlockall(test_munlock);
+ ret += test_vma_management(MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKONFAULT);
+ ret += test_mlockall(test_vma_management);
+ return ret; /* each test returns 0 or 1, so this is 0 only when every run passed */
+}
+
diff --git a/tools/testing/selftests/vm/mlock2-tests.c b/tools/testing/selftests/vm/mlock2-tests.c
new file mode 100644
index 000000000000..26298238adb4
--- /dev/null
+++ b/tools/testing/selftests/vm/mlock2-tests.c
@@ -0,0 +1,617 @@
+#include <sys/mman.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <errno.h>
+#include <stdbool.h>
+
+#ifndef MLOCK_LOCK
+#define MLOCK_LOCK 1
+#endif
+
+#ifndef MLOCK_ONFAULT
+#define MLOCK_ONFAULT 2
+#endif
+
+#ifndef MCL_ONFAULT
+#define MCL_ONFAULT (MCL_FUTURE << 1)
+#endif
+
+static int mlock2_(void *start, size_t len, int flags)
+{ /* Invoke the mlock2 system call directly when its number is known at build time; otherwise fail with ENOSYS. */
+#ifdef __NR_mlock2 /* NOTE(review): __NR_* macros usually come from <sys/syscall.h>, which is not among this file's visible includes - confirm this branch can ever be compiled */
+ return syscall(__NR_mlock2, start, len, flags);
+#else
+ errno = ENOSYS; /* callers test errno == ENOSYS to skip the test */
+ return -1;
+#endif
+}
+
+static int munlock2_(void *start, size_t len, int flags)
+{ /* Invoke the munlock2 system call directly when its number is known at build time; otherwise fail with ENOSYS. */
+#ifdef __NR_munlock2 /* NOTE(review): depends on a header defining __NR_munlock2; <sys/syscall.h> is not among the visible includes - confirm */
+ return syscall(__NR_munlock2, start, len, flags);
+#else
+ errno = ENOSYS; /* callers test errno == ENOSYS to skip the test */
+ return -1;
+#endif
+}
+
+static int munlockall2_(int flags)
+{ /* Invoke the munlockall2 system call directly when its number is known at build time; otherwise fail with ENOSYS. */
+#ifdef __NR_munlockall2 /* NOTE(review): depends on a header defining __NR_munlockall2; <sys/syscall.h> is not among the visible includes - confirm */
+ return syscall(__NR_munlockall2, flags);
+#else
+ errno = ENOSYS;
+ return -1;
+#endif
+}
+
+static unsigned long get_pageflags(unsigned long addr)
+{ /* Return the raw /proc/self/pagemap entry for the page containing addr; calls _exit(1) on any I/O failure. */
+ FILE *file;
+ unsigned long pfn; /* holds the whole entry, not only a PFN: callers mask it with PRESENT_BIT and PFN_MASK */
+ unsigned long offset;
+
+ file = fopen("/proc/self/pagemap", "r");
+ if (!file) {
+ perror("fopen pagemap");
+ _exit(1);
+ }
+
+ offset = addr / getpagesize() * sizeof(unsigned long); /* page index times entry size; NOTE(review): assumes sizeof(unsigned long) matches the pagemap entry size - confirm on 32-bit */
+ if (fseek(file, offset, SEEK_SET)) {
+ perror("fseek pagemap");
+ _exit(1);
+ }
+
+ if (fread(&pfn, sizeof(unsigned long), 1, file) != 1) {
+ perror("fread pagemap");
+ _exit(1);
+ }
+
+ fclose(file);
+ return pfn;
+}
+
+static unsigned long get_kpageflags(unsigned long pfn)
+{ /* Return the /proc/kpageflags word stored at index pfn; calls _exit(1) on any I/O failure. */
+ unsigned long flags;
+ FILE *file;
+
+ file = fopen("/proc/kpageflags", "r");
+ if (!file) {
+ perror("fopen kpageflags");
+ _exit(1);
+ }
+
+ if (fseek(file, pfn * sizeof(unsigned long), SEEK_SET)) { /* the file is read as an array of unsigned long indexed by pfn */
+ perror("fseek kpageflags");
+ _exit(1);
+ }
+
+ if (fread(&flags, sizeof(unsigned long), 1, file) != 1) {
+ perror("fread kpageflags");
+ _exit(1);
+ }
+
+ fclose(file);
+ return flags;
+}
+
+#define VMFLAGS "VmFlags:"
+
+static bool find_flag(FILE *file, const char *vmflag)
+{ /* Read forward from the current position of file to the next line containing "VmFlags:" and report whether vmflag occurs after that tag; false if no such line is found. */
+ char *line = NULL;
+ char *flags;
+ size_t size = 0;
+ bool ret = false;
+
+ while (getline(&line, &size, file) > 0) {
+ if (!strstr(line, VMFLAGS)) {
+ free(line); /* the buffer is released and reset for every non-matching line */
+ line = NULL;
+ size = 0;
+ continue;
+ }
+
+ flags = line + strlen(VMFLAGS); /* NOTE(review): skips strlen(VMFLAGS) bytes from the line start, i.e. assumes the tag begins the line - confirm */
+ ret = (strstr(flags, vmflag) != NULL); /* substring match, not a whole-token match */
+ goto out;
+ }
+
+out:
+ free(line);
+ return ret;
+}
+
+static bool is_vmflag_set(unsigned long addr, const char *vmflag)
+{ /* Locate the /proc/self/smaps entry whose range contains addr and report whether its VmFlags line contains vmflag; false if no entry matches. Calls _exit(1) if smaps cannot be opened. */
+ FILE *file;
+ char *line = NULL;
+ size_t size = 0;
+ bool ret = false;
+ unsigned long start, end;
+ char perms[5];
+ unsigned long offset;
+ char dev[32];
+ unsigned long inode;
+ char path[BUFSIZ];
+
+ file = fopen("/proc/self/smaps", "r");
+ if (!file) {
+ perror("fopen smaps");
+ _exit(1);
+ }
+
+ while (getline(&line, &size, file) > 0) {
+ if (sscanf(line, "%lx-%lx %s %lx %s %lu %s\n", /* NOTE(review): the %s conversions carry no field width, so they rely on perms/dev/path being large enough */
+ &start, &end, perms, &offset, dev, &inode, path) < 6) /* fewer than six fields: not a mapping header line (path is optional) */
+ goto next;
+
+ if (start <= addr && addr < end) {
+ ret = find_flag(file, vmflag); /* continues reading the same stream, i.e. the lines following this header */
+ goto out;
+ }
+
+next:
+ free(line);
+ line = NULL;
+ size = 0;
+ }
+
+out:
+ free(line);
+ fclose(file);
+ return ret;
+}
+
+#define PRESENT_BIT 0x8000000000000000
+#define PFN_MASK 0x007FFFFFFFFFFFFF
+#define UNEVICTABLE_BIT (1UL << 18)
+
+#define LOCKED "lo"
+#define LOCKEDONFAULT "lf"
+
+static int lock_check(char *map)
+{ /* Verify that the two pages starting at map are present, unevictable, and that their VMA carries the LOCKED flag; returns 0 if all checks pass, 1 otherwise. */
+ unsigned long page1_flags;
+ unsigned long page2_flags;
+ unsigned long page_size = getpagesize();
+
+ page1_flags = get_pageflags((unsigned long)map);
+ page2_flags = get_pageflags((unsigned long)map + page_size);
+
+ /* Both pages should be present */
+ if (((page1_flags & PRESENT_BIT) == 0) ||
+ ((page2_flags & PRESENT_BIT) == 0)) {
+ printf("Failed to make both pages present\n");
+ return 1;
+ }
+
+ page1_flags = get_kpageflags(page1_flags & PFN_MASK); /* variables are reused: they now hold kpageflags words */
+ page2_flags = get_kpageflags(page2_flags & PFN_MASK);
+
+ /* Both pages should be unevictable */
+ if (((page1_flags & UNEVICTABLE_BIT) == 0) ||
+ ((page2_flags & UNEVICTABLE_BIT) == 0)) {
+ printf("Failed to make both pages unevictable\n");
+ return 1;
+ }
+
+ if (!is_vmflag_set((unsigned long)map, LOCKED) ||
+ !is_vmflag_set((unsigned long)map + page_size, LOCKED)) {
+ printf("VMA flag %s is missing\n", LOCKED);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int unlock_lock_check(char *map)
+{ /* Verify that neither of the two pages starting at map is unevictable and that the LOCKED VMA flag is gone; returns 0 if so, 1 otherwise. */
+ unsigned long page1_flags;
+ unsigned long page2_flags;
+ unsigned long page_size = getpagesize();
+
+ page1_flags = get_pageflags((unsigned long)map);
+ page2_flags = get_pageflags((unsigned long)map + page_size);
+ page1_flags = get_kpageflags(page1_flags & PFN_MASK); /* no PRESENT_BIT check here: the PFN field is used as read */
+ page2_flags = get_kpageflags(page2_flags & PFN_MASK);
+
+ if ((page1_flags & UNEVICTABLE_BIT) || (page2_flags & UNEVICTABLE_BIT)) {
+ printf("A page is still marked unevictable after unlock\n");
+ return 1;
+ }
+
+ if (is_vmflag_set((unsigned long)map, LOCKED) ||
+ is_vmflag_set((unsigned long)map + page_size, LOCKED)) {
+ printf("VMA flag %s is still set after unlock\n", LOCKED);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int test_mlock_lock()
+{ /* Lock a two-page anonymous mapping with mlock2_(MLOCK_LOCK), verify it, unlock with munlock2_() and verify again; returns 0 on success, 1 on failure, and exits 0 if the calls report ENOSYS. */
+ char *map;
+ int ret = 1;
+ unsigned long page_size = getpagesize();
+
+ map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+ if (map == MAP_FAILED) {
+ perror("test_mlock_locked mmap");
+ goto out; /* nothing to unmap yet */
+ }
+
+ if (mlock2_(map, 2 * page_size, MLOCK_LOCK)) {
+ if (errno == ENOSYS) {
+ printf("Cannot call new mlock family, skipping test\n");
+ _exit(0); /* ends the whole process with success, not just this test */
+ }
+ perror("mlock2(MLOCK_LOCK)");
+ goto unmap;
+ }
+
+ if (lock_check(map))
+ goto unmap;
+
+ /* Now clear the MLOCK_LOCK flag and recheck attributes */
+ if (munlock2_(map, 2 * page_size, MLOCK_LOCK)) {
+ if (errno == ENOSYS) {
+ printf("Cannot call new mlock family, skipping test\n");
+ _exit(0);
+ }
+ perror("munlock2(MLOCK_LOCK)");
+ goto unmap;
+ }
+
+ ret = unlock_lock_check(map);
+
+unmap:
+ munmap(map, 2 * page_size);
+out:
+ return ret;
+}
+
+/*
+ * Check the state of a two-page mapping that was locked "on fault".
+ *
+ * Before any access neither page may be present.  The first page is then
+ * written to; afterwards only that page may be present, it must be
+ * flagged unevictable, and both pages' VMAs must carry LOCKEDONFAULT.
+ *
+ * Note that this function itself faults in the first page of @map.
+ *
+ * Returns 0 if every check passes, 1 (after printing the reason) otherwise.
+ */
+static int onfault_check(char *map)
+{
+	const unsigned long first = (unsigned long)map;
+	const unsigned long second = first + getpagesize();
+	unsigned long flags1 = get_pageflags(first);
+	unsigned long flags2 = get_pageflags(second);
+
+	/* Nothing has been touched yet, so nothing may be present */
+	if ((flags1 | flags2) & PRESENT_BIT) {
+		printf("Pages were made present by MLOCK_ONFAULT\n");
+		return 1;
+	}
+
+	/* Touch the first page only, then look again */
+	*map = 'a';
+	flags1 = get_pageflags(first);
+	flags2 = get_pageflags(second);
+
+	if (!(flags1 & PRESENT_BIT)) {
+		printf("Page 1 is not present after fault\n");
+		return 1;
+	}
+	if (flags2 & PRESENT_BIT) {
+		printf("Page 2 was made present\n");
+		return 1;
+	}
+
+	/* The page that was faulted in must now be flagged unevictable */
+	flags1 = get_kpageflags(flags1 & PFN_MASK);
+	if (!(flags1 & UNEVICTABLE_BIT)) {
+		printf("Failed to make faulted page unevictable\n");
+		return 1;
+	}
+
+	if (!is_vmflag_set(first, LOCKEDONFAULT) ||
+	    !is_vmflag_set(second, LOCKEDONFAULT)) {
+		printf("VMA flag %s is missing\n", LOCKEDONFAULT);
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Check that a two-page mapping that was locked "on fault" is unlocked.
+ *
+ * Only the first page is examined for the unevictable flag (it is the
+ * only page onfault_check() touches); the LOCKEDONFAULT VMA flag is
+ * checked for both pages.
+ *
+ * Returns 0 when the mapping is unlocked, 1 (after printing the reason)
+ * otherwise.
+ */
+static int unlock_onfault_check(char *map)
+{
+	unsigned long page1_flags;
+	unsigned long page_size = getpagesize();
+
+	page1_flags = get_pageflags((unsigned long)map);
+	page1_flags = get_kpageflags(page1_flags & PFN_MASK);
+
+	if (page1_flags & UNEVICTABLE_BIT) {
+		printf("Page 1 is still marked unevictable after unlock\n");
+		return 1;
+	}
+
+	if (is_vmflag_set((unsigned long)map, LOCKEDONFAULT) ||
+	    is_vmflag_set((unsigned long)map + page_size, LOCKEDONFAULT)) {
+		printf("VMA flag %s is still set after unlock\n", LOCKEDONFAULT);
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Lock a fresh two-page anonymous mapping with mlock2(MLOCK_ONFAULT),
+ * verify it with onfault_check(), unlock it with munlock2(MLOCK_ONFAULT)
+ * and verify the result with unlock_onfault_check().
+ *
+ * Returns 0 on success and 1 on failure.  If the new mlock family is not
+ * available (ENOSYS) the whole program exits with status 0.
+ */
+static int test_mlock_onfault()
+{
+	char *map;
+	int ret = 1;
+	unsigned long page_size = getpagesize();
+
+	map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+		   MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+	if (map == MAP_FAILED) {
+		perror("test_mlock_onfault mmap");
+		goto out;
+	}
+
+	if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) {
+		if (errno == ENOSYS) {
+			printf("Cannot call new mlock family, skipping test\n");
+			_exit(0);
+		}
+		perror("mlock2(MLOCK_ONFAULT)");
+		goto unmap;
+	}
+
+	if (onfault_check(map))
+		goto unmap;
+
+	/* Now clear the MLOCK_ONFAULT flag and recheck attributes */
+	if (munlock2_(map, 2 * page_size, MLOCK_ONFAULT)) {
+		if (errno == ENOSYS) {
+			printf("Cannot call new mlock family, skipping test\n");
+			_exit(0);
+		}
+		/* Report the flag that was actually passed to munlock2 */
+		perror("munlock2(MLOCK_ONFAULT)");
+		goto unmap;
+	}
+
+	ret = unlock_onfault_check(map);
+unmap:
+	munmap(map, 2 * page_size);
+out:
+	return ret;
+}
+
+/*
+ * Fault in the first page of a two-page anonymous mapping and only then
+ * lock the mapping with mlock2(MLOCK_ONFAULT).  The already-present page
+ * must be flagged unevictable and both pages' VMAs must carry
+ * LOCKEDONFAULT.
+ *
+ * Returns 0 on success and 1 on failure.  If the new mlock family is not
+ * available (ENOSYS) the whole program exits with status 0.
+ */
+static int test_lock_onfault_of_present()
+{
+	unsigned long len = 2 * (unsigned long)getpagesize();
+	unsigned long first, second;
+	unsigned long flags1, flags2;
+	int ret = 1;
+	char *map;
+
+	map = mmap(NULL, len, PROT_READ | PROT_WRITE,
+		   MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+	if (map == MAP_FAILED) {
+		perror("test_mlock_locked mmap");
+		return ret;
+	}
+	first = (unsigned long)map;
+	second = first + len / 2;
+
+	/* Make page 1 present before the lock is applied */
+	*map = 'a';
+
+	if (mlock2_(map, len, MLOCK_ONFAULT) != 0) {
+		if (errno != ENOSYS) {
+			perror("mlock2(MLOCK_ONFAULT)");
+			goto unmap;
+		}
+		printf("Cannot call new mlock family, skipping test\n");
+		_exit(0);
+	}
+
+	flags1 = get_pageflags(first);
+	flags2 = get_pageflags(second);
+	flags1 = get_kpageflags(flags1 & PFN_MASK);
+	flags2 = get_kpageflags(flags2 & PFN_MASK);
+
+	/* Only page 1 is examined; it was present when the lock was taken */
+	if (!(flags1 & UNEVICTABLE_BIT)) {
+		printf("Failed to make present page unevictable\n");
+		goto unmap;
+	}
+
+	if (!is_vmflag_set(first, LOCKEDONFAULT) ||
+	    !is_vmflag_set(second, LOCKEDONFAULT)) {
+		printf("VMA flag %s is missing for one of the pages\n", LOCKEDONFAULT);
+		goto unmap;
+	}
+
+	ret = 0;
+unmap:
+	munmap(map, len);
+	return ret;
+}
+
+/*
+ * Lock a two-page anonymous mapping with mlock2(MLOCK_LOCK) and then try
+ * to unlock it with the mismatched flag MLOCK_ONFAULT.  The test expects
+ * the original lock to survive: both pages must stay unevictable and
+ * both VMAs must keep the LOCKED flag.
+ *
+ * Returns 0 on success and 1 on failure.  If the new mlock family is not
+ * available (ENOSYS) the whole program exits with status 0.
+ */
+static int test_munlock_mismatch()
+{
+	char *map;
+	int ret = 1;
+	unsigned long page1_flags;
+	unsigned long page2_flags;
+	unsigned long page_size = getpagesize();
+
+	map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+		   MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+	if (map == MAP_FAILED) {
+		perror("test_munlock_mismatch mmap");
+		goto out;
+	}
+
+	if (mlock2_(map, 2 * page_size, MLOCK_LOCK)) {
+		if (errno == ENOSYS) {
+			printf("Cannot call new mlock family, skipping test\n");
+			_exit(0);
+		}
+		perror("mlock2(MLOCK_LOCK)");
+		goto unmap;
+	}
+
+	/*
+	 * Present, unevictable and VMA-flag checks are the ones
+	 * lock_check() already performs (with the same messages).
+	 */
+	if (lock_check(map))
+		goto unmap;
+
+	/* Unlock with the wrong flag; the MLOCK_LOCK state must survive */
+	if (munlock2_(map, 2 * page_size, MLOCK_ONFAULT)) {
+		if (errno == ENOSYS) {
+			printf("Cannot call new mlock family, skipping test\n");
+			_exit(0);
+		}
+		perror("munlock2(MLOCK_ONFAULT)");
+		goto unmap;
+	}
+
+	page1_flags = get_pageflags((unsigned long)map);
+	page2_flags = get_pageflags((unsigned long)map + page_size);
+	page1_flags = get_kpageflags(page1_flags & PFN_MASK);
+	page2_flags = get_kpageflags(page2_flags & PFN_MASK);
+
+	if ((page1_flags & UNEVICTABLE_BIT) == 0 ||
+	    (page2_flags & UNEVICTABLE_BIT) == 0) {
+		printf("Both pages should still be unevictable but are not\n");
+		goto unmap;
+	}
+
+	if (!is_vmflag_set((unsigned long)map, LOCKED) ||
+	    !is_vmflag_set((unsigned long)map + page_size, LOCKED)) {
+		printf("VMA flag %s is not set after unlock\n", LOCKED);
+		goto unmap;
+	}
+
+	ret = 0;
+unmap:
+	munmap(map, 2 * page_size);
+out:
+	return ret;
+}
+
+/*
+ * Exercise munlockall2() against the three mlockall() variants:
+ *
+ *  1) mlockall(MCL_CURRENT) followed by munlockall2(MCL_CURRENT) must
+ *     leave the mapping unlocked;
+ *  2) mlockall(MCL_ONFAULT) followed by munlockall2(MCL_ONFAULT) must
+ *     leave a fresh mapping unlocked;
+ *  3) mlockall(MCL_CURRENT | MCL_FUTURE) followed by
+ *     munlockall2(MCL_FUTURE | MCL_ONFAULT) must leave the mapping locked.
+ *
+ * Whatever happens, all lock flags are cleared again before returning.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+static int test_munlockall()
+{
+	char *map;
+	int ret = 1;
+	unsigned long page_size = getpagesize();
+
+	map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+		   MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+
+	if (map == MAP_FAILED) {
+		perror("test_munlockall mmap");
+		goto out;
+	}
+
+	if (mlockall(MCL_CURRENT)) {
+		perror("mlockall(MCL_CURRENT)");
+		/* the mapping exists at this point, so it must be released */
+		goto unmap;
+	}
+
+	if (lock_check(map))
+		goto unmap;
+
+	if (munlockall2_(MCL_CURRENT)) {
+		perror("munlockall2(MCL_CURRENT)");
+		goto unmap;
+	}
+
+	if (unlock_lock_check(map))
+		goto unmap;
+
+	munmap(map, 2 * page_size);
+
+	map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+		   MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+
+	if (map == MAP_FAILED) {
+		perror("test_munlockall second mmap");
+		/* nothing is mapped here, so skip the munmap */
+		goto out;
+	}
+
+	if (mlockall(MCL_ONFAULT)) {
+		perror("mlockall(MCL_ONFAULT)");
+		goto unmap;
+	}
+
+	if (onfault_check(map))
+		goto unmap;
+
+	if (munlockall2_(MCL_ONFAULT)) {
+		perror("munlockall2(MCL_ONFAULT)");
+		goto unmap;
+	}
+
+	if (unlock_onfault_check(map))
+		goto unmap;
+
+	if (mlockall(MCL_CURRENT | MCL_FUTURE)) {
+		perror("mlockall(MCL_CURRENT | MCL_FUTURE)");
+		/* the second mapping is still live, release it as well */
+		goto unmap;
+	}
+
+	if (lock_check(map))
+		goto unmap;
+
+	if (munlockall2_(MCL_FUTURE | MCL_ONFAULT)) {
+		perror("munlockall2(MCL_FUTURE | MCL_ONFAULT)");
+		goto unmap;
+	}
+
+	/* the flags did not include MCL_CURRENT: the lock is expected to remain */
+	ret = lock_check(map);
+
+unmap:
+	munmap(map, 2 * page_size);
+out:
+	munlockall2_(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT);
+	return ret;
+}
+
+/*
+ * Run every mlock2 test case.  The exit status is the number of failed
+ * test cases, so 0 means that everything passed.
+ */
+int main(int argc, char **argv)
+{
+	int ret = 0;
+	ret += test_mlock_lock();
+	ret += test_mlock_onfault();
+	ret += test_munlockall();
+	ret += test_munlock_mismatch();
+	ret += test_lock_onfault_of_present();
+	return ret;
+}
+
diff --git a/tools/testing/selftests/vm/on-fault-limit.c b/tools/testing/selftests/vm/on-fault-limit.c
new file mode 100644
index 000000000000..ed2a109ea421
--- /dev/null
+++ b/tools/testing/selftests/vm/on-fault-limit.c
@@ -0,0 +1,47 @@
+#include <sys/mman.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#ifndef MCL_ONFAULT
+#define MCL_ONFAULT (MCL_FUTURE << 1)
+#endif
+
+/*
+ * With mlockall(MCL_ONFAULT) in effect, try to create a populated
+ * anonymous mapping twice as large as the hard RLIMIT_MEMLOCK limit.
+ * The test passes only if that mmap() call fails.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+static int test_limit(void)
+{
+	int ret = 1;
+	struct rlimit lims;
+	void *map;
+
+	if (getrlimit(RLIMIT_MEMLOCK, &lims)) {
+		perror("getrlimit");
+		return ret;
+	}
+
+	if (mlockall(MCL_ONFAULT)) {
+		perror("mlockall");
+		return ret;
+	}
+
+	map = mmap(NULL, 2 * lims.rlim_max, PROT_READ | PROT_WRITE,
+		   MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, 0, 0);
+	if (map != MAP_FAILED) {
+		printf("mmap should have failed, but didn't\n");
+		/* only a mapping that really exists may be unmapped */
+		munmap(map, 2 * lims.rlim_max);
+	} else {
+		ret = 0;
+	}
+
+	munlockall();
+	return ret;
+}
+
+/* The exit status is the result of the single test case: 0 on success. */
+int main(int argc, char **argv)
+{
+	return test_limit();
+}
diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests
index 49ece11ff7fd..5f906f0e4296 100755
--- a/tools/testing/selftests/vm/run_vmtests
+++ b/tools/testing/selftests/vm/run_vmtests
@@ -86,6 +86,17 @@ else
echo "[PASS]"
fi
+echo "--------------------"
+echo "running userfaultfd"
+echo "--------------------"
+./userfaultfd 128 32
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
#cleanup
umount $mnt
rm -rf $mnt
@@ -102,4 +113,37 @@ else
echo "[PASS]"
fi
+echo "--------------------"
+echo "running lock-on-fault"
+echo "--------------------"
+./lock-on-fault
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
+echo "--------------------"
+echo "running on-fault-limit"
+echo "--------------------"
+sudo -u nobody ./on-fault-limit
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
+echo "--------------------"
+echo "running mlock2-tests"
+echo "--------------------"
+./mlock2-tests
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
exit $exitcode
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
new file mode 100644
index 000000000000..0c0b83953352
--- /dev/null
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -0,0 +1,636 @@
+/*
+ * Stress userfaultfd syscall.
+ *
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * This test allocates two virtual areas and bounces the physical
+ * memory across the two virtual areas (from area_src to area_dst)
+ * using userfaultfd.
+ *
+ * There are three threads running per CPU:
+ *
+ * 1) one per-CPU thread takes a per-page pthread_mutex in a random
+ * page of the area_dst (while the physical page may still be in
+ * area_src), and increments a per-page counter in the same page,
+ * and checks its value against a verification region.
+ *
+ * 2) another per-CPU thread handles the userfaults generated by
+ * thread 1 above. userfaultfd blocking reads or poll() modes are
+ * exercised interleaved.
+ *
+ * 3) one last per-CPU thread transfers the memory in the background
+ * at maximum bandwidth (if not already transferred by thread
+ * 2). Each cpu thread takes care of transferring a portion of the
+ * area.
+ *
+ * When all threads of type 3 completed the transfer, one bounce is
+ * complete. area_src and area_dst are then swapped. All threads are
+ * respawned and so the bounce is immediately restarted in the
+ * opposite direction.
+ *
+ * per-CPU threads 1 by triggering userfaults inside
+ * pthread_mutex_lock will also verify the atomicity of the memory
+ * transfer (UFFDIO_COPY).
+ *
+ * The program takes two parameters: the amounts of physical memory in
+ * megabytes (MiB) of the area and the number of bounces to execute.
+ *
+ * # 100MiB 99999 bounces
+ * ./userfaultfd 100 99999
+ *
+ * # 1GiB 99 bounces
+ * ./userfaultfd 1000 99
+ *
+ * # 10MiB-~6GiB 999 bounces, continue forever unless an error triggers
+ * while ./userfaultfd $[RANDOM % 6000 + 10] 999; do true; done
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <signal.h>
+#include <poll.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <pthread.h>
+#include "../../../../include/uapi/linux/userfaultfd.h"
+
+/*
+ * Syscall number of userfaultfd for the architectures this test knows
+ * about; any other architecture fails the build.
+ */
+#ifdef __x86_64__
+#define __NR_userfaultfd 323
+#elif defined(__i386__)
+#define __NR_userfaultfd 359
+#elif defined(__powerpc__)
+#define __NR_userfaultfd 364
+#else
+#error "missing __NR_userfaultfd definition"
+#endif
+
+/* Sizing of the test, all set up in main(); page_size is in bytes */
+static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
+
+/*
+ * Mode bits.  They are tested against the remaining-bounces counter
+ * below, so the mode changes from one bounce to the next as the counter
+ * is decremented.
+ */
+#define BOUNCE_RANDOM (1<<0)
+#define BOUNCE_RACINGFAULTS (1<<1)
+#define BOUNCE_VERIFY (1<<2)
+#define BOUNCE_POLL (1<<3)
+/* number of bounces still to run; its low bits double as the mode */
+static int bounces;
+
+/* expected value of each page's counter, indexed by page number */
+static unsigned long long *count_verify;
+/*
+ * uffd: the userfaultfd file descriptor; finished: set by stress() to
+ * stop the locking threads; pipefd: one pipe per cpu (read end at
+ * cpu*2, write end at cpu*2+1) used to stop the poll threads.
+ */
+static int uffd, finished, *pipefd;
+/* source and destination areas; swapped after every bounce */
+static char *area_src, *area_dst;
+/* one zero-filled page, used as comparison reference */
+static char *zeropage;
+/* attributes (stack size) shared by all threads the test creates */
+pthread_attr_t attr;
+
+/* pthread_mutex_t starts at page offset 0 */
+#define area_mutex(___area, ___nr) \
+ ((pthread_mutex_t *) ((___area) + (___nr)*page_size))
+/*
+ * count is placed in the page after pthread_mutex_t naturally aligned
+ * to avoid non alignment faults on non-x86 archs.
+ */
+#define area_count(___area, ___nr) \
+ ((volatile unsigned long long *) ((unsigned long) \
+ ((___area) + (___nr)*page_size + \
+ sizeof(pthread_mutex_t) + \
+ sizeof(unsigned long long) - 1) & \
+ ~(unsigned long)(sizeof(unsigned long long) \
+ - 1)))
+
+/*
+ * Byte-by-byte comparison of two buffers of @n bytes, walking forward
+ * from the first byte.  Returns 1 as soon as a differing byte is found
+ * and 0 if all @n bytes are equal (including when @n is 0).
+ */
+static int my_bcmp(char *str1, char *str2, size_t n)
+{
+	const char *left = str1;
+	const char *right = str2;
+	size_t remaining = n;
+
+	while (remaining--) {
+		if (*left != *right)
+			return 1;
+		left++;
+		right++;
+	}
+	return 0;
+}
+
+/*
+ * Thread body of the "type 1" threads described at the top of the file.
+ *
+ * @arg is the cpu index, passed by value inside the pointer.
+ *
+ * Until the global 'finished' flag is set, the thread repeatedly picks a
+ * page of area_dst: a random one if BOUNCE_RANDOM is set, otherwise the
+ * next one in sequence, starting from -bounces (plus this cpu's share of
+ * the area unless BOUNCE_RACINGFAULTS is set).  With BOUNCE_VERIFY it
+ * first reads the page without taking the mutex and exits the process if
+ * the counter is zero or the page equals the zero page.  It then takes
+ * the per-page mutex, exits the process if the counter disagrees with
+ * count_verify[], and increments both.
+ *
+ * A warning is printed when one iteration takes more than a second.
+ *
+ * Always returns NULL.
+ */
+static void *locking_thread(void *arg)
+{
+ unsigned long cpu = (unsigned long) arg;
+ struct random_data rand;
+ unsigned long page_nr = *(&(page_nr)); /* uninitialized warning */
+ int32_t rand_nr;
+ unsigned long long count;
+ char randstate[64];
+ unsigned int seed;
+ time_t start;
+
+ if (bounces & BOUNCE_RANDOM) {
+ seed = (unsigned int) time(NULL) - bounces;
+ if (!(bounces & BOUNCE_RACINGFAULTS))
+ seed += cpu;
+ bzero(&rand, sizeof(rand));
+ bzero(&randstate, sizeof(randstate));
+ if (initstate_r(seed, randstate, sizeof(randstate), &rand))
+ fprintf(stderr, "srandom_r error\n"), exit(1);
+ } else {
+ page_nr = -bounces;
+ if (!(bounces & BOUNCE_RACINGFAULTS))
+ page_nr += cpu * nr_pages_per_cpu;
+ }
+
+ while (!finished) {
+ if (bounces & BOUNCE_RANDOM) {
+ if (random_r(&rand, &rand_nr))
+ fprintf(stderr, "random_r 1 error\n"), exit(1);
+ page_nr = rand_nr;
+ /* a second draw fills the upper half when page_nr is wider than rand_nr */
+ if (sizeof(page_nr) > sizeof(rand_nr)) {
+ if (random_r(&rand, &rand_nr))
+ fprintf(stderr, "random_r 2 error\n"), exit(1);
+ page_nr |= ((unsigned long) rand_nr) << 32;
+ }
+ } else
+ page_nr += 1;
+ page_nr %= nr_pages;
+
+ start = time(NULL);
+ if (bounces & BOUNCE_VERIFY) {
+ count = *area_count(area_dst, page_nr);
+ if (!count)
+ fprintf(stderr,
+ "page_nr %lu wrong count %Lu %Lu\n",
+ page_nr, count,
+ count_verify[page_nr]), exit(1);
+
+
+ /*
+ * We can't use bcmp (or memcmp) because that
+ * returns 0 erroneously if the memory is
+ * changing under it (even if the end of the
+ * page is never changing and always
+ * different).
+ */
+#if 1
+ if (!my_bcmp(area_dst + page_nr * page_size, zeropage,
+ page_size))
+ fprintf(stderr,
+ "my_bcmp page_nr %lu wrong count %Lu %Lu\n",
+ page_nr, count,
+ count_verify[page_nr]), exit(1);
+#else
+ unsigned long loops;
+
+ loops = 0;
+ /* uncomment the below line to test with mutex */
+ /* pthread_mutex_lock(area_mutex(area_dst, page_nr)); */
+ while (!bcmp(area_dst + page_nr * page_size, zeropage,
+ page_size)) {
+ loops += 1;
+ if (loops > 10)
+ break;
+ }
+ /* uncomment below line to test with mutex */
+ /* pthread_mutex_unlock(area_mutex(area_dst, page_nr)); */
+ if (loops) {
+ fprintf(stderr,
+ "page_nr %lu all zero thread %lu %p %lu\n",
+ page_nr, cpu, area_dst + page_nr * page_size,
+ loops);
+ if (loops > 10)
+ exit(1);
+ }
+#endif
+ }
+
+ /* counter check and increment happen under the per-page mutex */
+ pthread_mutex_lock(area_mutex(area_dst, page_nr));
+ count = *area_count(area_dst, page_nr);
+ if (count != count_verify[page_nr]) {
+ fprintf(stderr,
+ "page_nr %lu memory corruption %Lu %Lu\n",
+ page_nr, count,
+ count_verify[page_nr]), exit(1);
+ }
+ count++;
+ *area_count(area_dst, page_nr) = count_verify[page_nr] = count;
+ pthread_mutex_unlock(area_mutex(area_dst, page_nr));
+
+ if (time(NULL) - start > 1)
+ fprintf(stderr,
+ "userfault too slow %ld "
+ "possible false positive with overcommit\n",
+ time(NULL) - start);
+ }
+
+ return NULL;
+}
+
+/*
+ * Copy the page at byte @offset from area_src to area_dst with
+ * UFFDIO_COPY.
+ *
+ * Returns 1 if this call copied the page and 0 if the ioctl failed with
+ * -EEXIST reported in the copy field.  Any other outcome, or an @offset
+ * outside of the area, terminates the process.
+ */
+static int copy_page(unsigned long offset)
+{
+	struct uffdio_copy req;
+
+	if (offset >= nr_pages * page_size) {
+		fprintf(stderr, "unexpected offset %lu\n", offset);
+		exit(1);
+	}
+
+	req.src = (unsigned long) area_src + offset;
+	req.dst = (unsigned long) area_dst + offset;
+	req.len = page_size;
+	req.mode = 0;
+	req.copy = 0;
+
+	if (ioctl(uffd, UFFDIO_COPY, &req) == 0) {
+		/* on success the copy field must report one full page */
+		if (req.copy != page_size) {
+			fprintf(stderr, "UFFDIO_COPY unexpected copy %Ld\n",
+				req.copy);
+			exit(1);
+		}
+		return 1;
+	}
+
+	/* real retval in the copy field; only -EEXIST is tolerated */
+	if (req.copy != -EEXIST) {
+		fprintf(stderr, "UFFDIO_COPY error %Ld\n", req.copy);
+		exit(1);
+	}
+	return 0;
+}
+
+/*
+ * Userfault handler thread used when BOUNCE_POLL is set.
+ *
+ * @arg is the cpu index, passed by value inside the pointer.
+ *
+ * The thread polls the userfaultfd and the read end of this cpu's pipe.
+ * A byte arriving on the pipe ends the loop; every page fault message
+ * read from the userfaultfd is resolved with copy_page().  Unexpected
+ * events, write faults and I/O errors terminate the process.
+ *
+ * Returns the number of pages this thread copied, stored in the
+ * returned pointer value.
+ */
+static void *uffd_poll_thread(void *arg)
+{
+	unsigned long cpu = (unsigned long) arg;
+	struct pollfd pollfd[2];
+	struct uffd_msg msg;
+	int ret;
+	unsigned long offset;
+	char tmp_chr;
+	unsigned long userfaults = 0;
+
+	pollfd[0].fd = uffd;
+	pollfd[0].events = POLLIN;
+	pollfd[1].fd = pipefd[cpu*2];
+	pollfd[1].events = POLLIN;
+
+	for (;;) {
+		ret = poll(pollfd, 2, -1);
+		if (!ret)
+			fprintf(stderr, "poll error %d\n", ret), exit(1);
+		if (ret < 0)
+			perror("poll"), exit(1);
+		if (pollfd[1].revents & POLLIN) {
+			if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
+				fprintf(stderr, "read pipefd error\n"),
+					exit(1);
+			break;
+		}
+		if (!(pollfd[0].revents & POLLIN))
+			fprintf(stderr, "pollfd[0].revents %d\n",
+				pollfd[0].revents), exit(1);
+		ret = read(uffd, &msg, sizeof(msg));
+		if (ret < 0) {
+			/* the fault may already have been read by another thread */
+			if (errno == EAGAIN)
+				continue;
+			perror("nonblocking read error"), exit(1);
+		}
+		/* never interpret a partially filled message */
+		if (ret != sizeof(msg))
+			fprintf(stderr, "short read\n"), exit(1);
+		if (msg.event != UFFD_EVENT_PAGEFAULT)
+			fprintf(stderr, "unexpected msg event %u\n",
+				msg.event), exit(1);
+		if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
+			fprintf(stderr, "unexpected write fault\n"), exit(1);
+		/* round the faulting address down to its page inside area_dst */
+		offset = (char *)msg.arg.pagefault.address - area_dst;
+		offset &= ~(page_size-1);
+		if (copy_page(offset))
+			userfaults++;
+	}
+	return (void *)userfaults;
+}
+
+/*
+ * Start-up handshake for uffd_read_thread(): the creating side locks
+ * this mutex (once before the bounce loop, then again after each thread
+ * creation) and every read thread unlocks it once its counter has been
+ * reset.
+ */
+pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/*
+ * Userfault handler thread used when BOUNCE_POLL is not set.
+ *
+ * @arg points to this thread's userfault counter.  The counter is reset
+ * to zero and then incremented for every page copy_page() reports as
+ * copied by this thread.
+ *
+ * The loop reads fault messages from the userfaultfd and never ends by
+ * itself: stress() stops the thread with pthread_cancel().  Read
+ * errors, short reads and unexpected events terminate the process; a
+ * write fault does so only when BOUNCE_VERIFY is set.
+ */
+static void *uffd_read_thread(void *arg)
+{
+ unsigned long *this_cpu_userfaults;
+ struct uffd_msg msg;
+ unsigned long offset;
+ int ret;
+
+ this_cpu_userfaults = (unsigned long *) arg;
+ *this_cpu_userfaults = 0;
+
+ /* tell the creator that the counter has been reset */
+ pthread_mutex_unlock(&uffd_read_mutex);
+ /* from here cancellation is ok */
+
+ for (;;) {
+ ret = read(uffd, &msg, sizeof(msg));
+ if (ret != sizeof(msg)) {
+ if (ret < 0)
+ perror("blocking read error"), exit(1);
+ else
+ fprintf(stderr, "short read\n"), exit(1);
+ }
+ if (msg.event != UFFD_EVENT_PAGEFAULT)
+ fprintf(stderr, "unexpected msg event %u\n",
+ msg.event), exit(1);
+ if (bounces & BOUNCE_VERIFY &&
+ msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
+ fprintf(stderr, "unexpected write fault\n"), exit(1);
+ /* round the faulting address down to its page inside area_dst */
+ offset = (char *)msg.arg.pagefault.address - area_dst;
+ offset &= ~(page_size-1);
+ if (copy_page(offset))
+ (*this_cpu_userfaults)++;
+ }
+ return (void *)NULL;
+}
+
+/*
+ * Thread body of the "type 3" threads described at the top of the file:
+ * copy this cpu's share of the area, one page after the other, with
+ * copy_page().
+ *
+ * @arg is the cpu index, passed by value inside the pointer.
+ * Always returns NULL.
+ */
+static void *background_thread(void *arg)
+{
+	unsigned long cpu = (unsigned long) arg;
+	unsigned long nr = cpu * nr_pages_per_cpu;
+	unsigned long end = nr + nr_pages_per_cpu;
+
+	while (nr < end) {
+		copy_page(nr * page_size);
+		nr++;
+	}
+
+	return NULL;
+}
+
+/*
+ * Run one bounce: start a locking thread, a userfault handler thread
+ * (poll or blocking read flavour, depending on BOUNCE_POLL) and a
+ * background copy thread per cpu, wait for the background copy to
+ * finish, zap area_src, stop the handler threads and finally stop the
+ * locking threads.
+ *
+ * @userfaults: array of nr_cpus counters receiving the number of pages
+ * copied by each cpu's handler thread.
+ *
+ * Returns 0 on success and 1 if creating, stopping or joining a thread,
+ * writing to a pipe or the madvise() call fails.
+ */
+static int stress(unsigned long *userfaults)
+{
+	unsigned long cpu;
+	pthread_t locking_threads[nr_cpus];
+	pthread_t uffd_threads[nr_cpus];
+	pthread_t background_threads[nr_cpus];
+	void **_userfaults = (void **) userfaults;
+	/* only the arrival of the byte matters, but never send garbage */
+	char c = 0;
+
+	finished = 0;
+	for (cpu = 0; cpu < nr_cpus; cpu++) {
+		if (pthread_create(&locking_threads[cpu], &attr,
+				   locking_thread, (void *)cpu))
+			return 1;
+		if (bounces & BOUNCE_POLL) {
+			if (pthread_create(&uffd_threads[cpu], &attr,
+					   uffd_poll_thread, (void *)cpu))
+				return 1;
+		} else {
+			if (pthread_create(&uffd_threads[cpu], &attr,
+					   uffd_read_thread,
+					   &_userfaults[cpu]))
+				return 1;
+			/* wait until the new thread has reset its counter */
+			pthread_mutex_lock(&uffd_read_mutex);
+		}
+		if (pthread_create(&background_threads[cpu], &attr,
+				   background_thread, (void *)cpu))
+			return 1;
+	}
+	for (cpu = 0; cpu < nr_cpus; cpu++)
+		if (pthread_join(background_threads[cpu], NULL))
+			return 1;
+
+	/*
+	 * Be strict and immediately zap area_src, the whole area has
+	 * been transferred already by the background threads. The
+	 * area_src could then be faulted in in a racy way by still
+	 * running uffdio_threads reading zeropages after we zapped
+	 * area_src (but they're guaranteed to get -EEXIST from
+	 * UFFDIO_COPY without writing zero pages into area_dst
+	 * because the background threads already completed).
+	 */
+	if (madvise(area_src, nr_pages * page_size, MADV_DONTNEED)) {
+		perror("madvise");
+		return 1;
+	}
+
+	for (cpu = 0; cpu < nr_cpus; cpu++) {
+		if (bounces & BOUNCE_POLL) {
+			/* a byte on the pipe makes the poll thread leave its loop */
+			if (write(pipefd[cpu*2+1], &c, 1) != 1) {
+				fprintf(stderr, "pipefd write error\n");
+				return 1;
+			}
+			/* the poll thread hands its counter back as return value */
+			if (pthread_join(uffd_threads[cpu], &_userfaults[cpu]))
+				return 1;
+		} else {
+			if (pthread_cancel(uffd_threads[cpu]))
+				return 1;
+			if (pthread_join(uffd_threads[cpu], NULL))
+				return 1;
+		}
+	}
+
+	finished = 1;
+	for (cpu = 0; cpu < nr_cpus; cpu++)
+		if (pthread_join(locking_threads[cpu], NULL))
+			return 1;
+
+	return 0;
+}
+
+/*
+ * Set up the whole test and run all the bounces.
+ *
+ * Allocates area_src and area_dst, opens the userfaultfd and negotiates
+ * the API, initializes the per-page mutexes and counters in area_src,
+ * creates one pipe per cpu and the zero page.  Then, for every bounce:
+ * select blocking or non-blocking mode, register area_dst, zap it, run
+ * stress(), unregister, optionally verify area_dst, swap the two areas
+ * and print the per-cpu userfault counts.
+ *
+ * Returns 0 when all bounces have run and 1 on any setup or runtime
+ * failure.  A failed verification is reported on stderr and stops the
+ * loop (bounces is set to 0), but does not change the return value.
+ */
+static int userfaultfd_stress(void)
+{
+	void *area;
+	char *tmp_area;
+	unsigned long nr;
+	struct uffdio_register uffdio_register;
+	struct uffdio_api uffdio_api;
+	unsigned long cpu;
+	int uffd_flags;
+	unsigned long userfaults[nr_cpus];
+
+	if (posix_memalign(&area, page_size, nr_pages * page_size)) {
+		fprintf(stderr, "out of memory\n");
+		return 1;
+	}
+	area_src = area;
+	if (posix_memalign(&area, page_size, nr_pages * page_size)) {
+		fprintf(stderr, "out of memory\n");
+		return 1;
+	}
+	area_dst = area;
+
+	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+	if (uffd < 0) {
+		fprintf(stderr,
+			"userfaultfd syscall not available in this kernel\n");
+		return 1;
+	}
+	/*
+	 * These flags are handed to F_SETFL below, so they have to be
+	 * the file status flags (F_GETFL), not the descriptor flags.
+	 */
+	uffd_flags = fcntl(uffd, F_GETFL, NULL);
+
+	uffdio_api.api = UFFD_API;
+	uffdio_api.features = 0;
+	if (ioctl(uffd, UFFDIO_API, &uffdio_api)) {
+		fprintf(stderr, "UFFDIO_API\n");
+		return 1;
+	}
+	if (uffdio_api.api != UFFD_API) {
+		fprintf(stderr, "UFFDIO_API error %Lu\n", uffdio_api.api);
+		return 1;
+	}
+
+	count_verify = malloc(nr_pages * sizeof(unsigned long long));
+	if (!count_verify) {
+		perror("count_verify");
+		return 1;
+	}
+
+	for (nr = 0; nr < nr_pages; nr++) {
+		*area_mutex(area_src, nr) = (pthread_mutex_t)
+			PTHREAD_MUTEX_INITIALIZER;
+		count_verify[nr] = *area_count(area_src, nr) = 1;
+	}
+
+	pipefd = malloc(sizeof(int) * nr_cpus * 2);
+	if (!pipefd) {
+		perror("pipefd");
+		return 1;
+	}
+	for (cpu = 0; cpu < nr_cpus; cpu++) {
+		if (pipe2(&pipefd[cpu*2], O_CLOEXEC | O_NONBLOCK)) {
+			perror("pipe");
+			return 1;
+		}
+	}
+
+	if (posix_memalign(&area, page_size, page_size)) {
+		fprintf(stderr, "out of memory\n");
+		return 1;
+	}
+	zeropage = area;
+	bzero(zeropage, page_size);
+
+	/* taken here so that stress() can wait for each read thread */
+	pthread_mutex_lock(&uffd_read_mutex);
+
+	pthread_attr_init(&attr);
+	pthread_attr_setstacksize(&attr, 16*1024*1024);
+
+	while (bounces--) {
+		unsigned long expected_ioctls;
+
+		printf("bounces: %d, mode:", bounces);
+		if (bounces & BOUNCE_RANDOM)
+			printf(" rnd");
+		if (bounces & BOUNCE_RACINGFAULTS)
+			printf(" racing");
+		if (bounces & BOUNCE_VERIFY)
+			printf(" ver");
+		if (bounces & BOUNCE_POLL)
+			printf(" poll");
+		printf(", ");
+		fflush(stdout);
+
+		/* poll mode reads non-blocking, the read threads block */
+		if (bounces & BOUNCE_POLL)
+			fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
+		else
+			fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);
+
+		/* register */
+		uffdio_register.range.start = (unsigned long) area_dst;
+		uffdio_register.range.len = nr_pages * page_size;
+		uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+		if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
+			fprintf(stderr, "register failure\n");
+			return 1;
+		}
+		expected_ioctls = (1 << _UFFDIO_WAKE) |
+				  (1 << _UFFDIO_COPY) |
+				  (1 << _UFFDIO_ZEROPAGE);
+		if ((uffdio_register.ioctls & expected_ioctls) !=
+		    expected_ioctls) {
+			fprintf(stderr,
+				"unexpected missing ioctl for anon memory\n");
+			return 1;
+		}
+
+		/*
+		 * The madvise done previously isn't enough: some
+		 * uffd_thread could have read userfaults (one of
+		 * those already resolved by the background thread)
+		 * and it may be in the process of calling
+		 * UFFDIO_COPY. UFFDIO_COPY will read the zapped
+		 * area_src and it would map a zero page in it (of
+		 * course such a UFFDIO_COPY is perfectly safe as it'd
+		 * return -EEXIST). The problem comes at the next
+		 * bounce though: that racing UFFDIO_COPY would
+		 * generate zeropages in the area_src, so invalidating
+		 * the previous MADV_DONTNEED. Without this additional
+		 * MADV_DONTNEED those zeropages leftovers in the
+		 * area_src would lead to -EEXIST failure during the
+		 * next bounce, effectively leaving a zeropage in the
+		 * area_dst.
+		 *
+		 * Try to comment this out madvise to see the memory
+		 * corruption being caught pretty quick.
+		 *
+		 * khugepaged is also inhibited to collapse THP after
+		 * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
+		 * required to MADV_DONTNEED here.
+		 */
+		if (madvise(area_dst, nr_pages * page_size, MADV_DONTNEED)) {
+			perror("madvise 2");
+			return 1;
+		}
+
+		/* bounce pass */
+		if (stress(userfaults))
+			return 1;
+
+		/* unregister */
+		if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) {
+			fprintf(stderr, "unregister failure\n");
+			return 1;
+		}
+
+		/* verification */
+		if (bounces & BOUNCE_VERIFY) {
+			for (nr = 0; nr < nr_pages; nr++) {
+				/* every mutex must look like the one of page 0 */
+				if (my_bcmp(area_dst,
+					    area_dst + nr * page_size,
+					    sizeof(pthread_mutex_t))) {
+					fprintf(stderr,
+						"error mutex 2 %lu\n",
+						nr);
+					bounces = 0;
+				}
+				if (*area_count(area_dst, nr) != count_verify[nr]) {
+					/* print the value that was actually compared */
+					fprintf(stderr,
+						"error area_count %Lu %Lu %lu\n",
+						*area_count(area_dst, nr),
+						count_verify[nr],
+						nr);
+					bounces = 0;
+				}
+			}
+		}
+
+		/* prepare next bounce */
+		tmp_area = area_src;
+		area_src = area_dst;
+		area_dst = tmp_area;
+
+		printf("userfaults:");
+		for (cpu = 0; cpu < nr_cpus; cpu++)
+			printf(" %lu", userfaults[cpu]);
+		printf("\n");
+	}
+
+	return 0;
+}
+
+/*
+ * Parse "<MiB> <bounces>" from the command line, derive the page counts
+ * from the number of online cpus and the page size, and run the test.
+ *
+ * Exits with status 1 on a usage error and with status 2 if the
+ * per-page counter would not fit in a page behind the mutex; otherwise
+ * returns the result of userfaultfd_stress().
+ */
+int main(int argc, char **argv)
+{
+	if (argc < 3) {
+		fprintf(stderr, "Usage: <MiB> <bounces>\n");
+		exit(1);
+	}
+
+	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	page_size = sysconf(_SC_PAGE_SIZE);
+
+	/* area_count(NULL, 0) yields the counter's offset inside a page */
+	if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) >
+	    page_size) {
+		fprintf(stderr, "Impossible to run this test\n");
+		exit(2);
+	}
+
+	nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size /
+		nr_cpus;
+	if (nr_pages_per_cpu == 0) {
+		fprintf(stderr, "invalid MiB\n");
+		fprintf(stderr, "Usage: <MiB> <bounces>\n");
+		exit(1);
+	}
+
+	bounces = atoi(argv[2]);
+	if (bounces <= 0) {
+		fprintf(stderr, "invalid bounces\n");
+		fprintf(stderr, "Usage: <MiB> <bounces>\n");
+		exit(1);
+	}
+
+	/* the area is rounded down to a whole number of pages per cpu */
+	nr_pages = nr_pages_per_cpu * nr_cpus;
+	printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
+	       nr_pages, nr_pages_per_cpu);
+
+	return userfaultfd_stress();
+}
diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c
index 8bdf16b8ba60..7f73fa32a590 100644
--- a/tools/vm/page-types.c
+++ b/tools/vm/page-types.c
@@ -57,23 +57,15 @@
* pagemap kernel ABI bits
*/
-#define PM_ENTRY_BYTES sizeof(uint64_t)
-#define PM_STATUS_BITS 3
-#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS)
-#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
-#define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
-#define PM_PSHIFT_BITS 6
-#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
-#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
-#define __PM_PSHIFT(x) (((uint64_t) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
-#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1)
-#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
-
-#define __PM_SOFT_DIRTY (1LL)
-#define PM_PRESENT PM_STATUS(4LL)
-#define PM_SWAP PM_STATUS(2LL)
-#define PM_SOFT_DIRTY __PM_PSHIFT(__PM_SOFT_DIRTY)
-
+#define PM_ENTRY_BYTES 8
+#define PM_PFRAME_BITS 55
+#define PM_PFRAME_MASK ((1LL << PM_PFRAME_BITS) - 1)
+#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
+#define PM_SOFT_DIRTY (1ULL << 55)
+#define PM_MMAP_EXCLUSIVE (1ULL << 56)
+#define PM_FILE (1ULL << 61)
+#define PM_SWAP (1ULL << 62)
+#define PM_PRESENT (1ULL << 63)
/*
* kernel page flags
@@ -100,6 +92,8 @@
#define KPF_SLOB_FREE 49
#define KPF_SLUB_FROZEN 50
#define KPF_SLUB_DEBUG 51
+#define KPF_FILE 62
+#define KPF_MMAP_EXCLUSIVE 63
#define KPF_ALL_BITS ((uint64_t)~0ULL)
#define KPF_HACKERS_BITS (0xffffULL << 32)
@@ -149,6 +143,9 @@ static const char * const page_flag_names[] = {
[KPF_SLOB_FREE] = "P:slob_free",
[KPF_SLUB_FROZEN] = "A:slub_frozen",
[KPF_SLUB_DEBUG] = "E:slub_debug",
+
+ [KPF_FILE] = "F:file",
+ [KPF_MMAP_EXCLUSIVE] = "1:mmap_exclusive",
};
@@ -452,6 +449,10 @@ static uint64_t expand_overloaded_flags(uint64_t flags, uint64_t pme)
if (pme & PM_SOFT_DIRTY)
flags |= BIT(SOFTDIRTY);
+ if (pme & PM_FILE)
+ flags |= BIT(FILE);
+ if (pme & PM_MMAP_EXCLUSIVE)
+ flags |= BIT(MMAP_EXCLUSIVE);
return flags;
}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8b8a44453670..e69a5cb99571 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -387,6 +387,36 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
return young;
}
+/*
+ * clear_young callback of the KVM mmu notifier (see kvm_mmu_notifier_ops).
+ *
+ * Passes the address range @start..@end to kvm_age_hva() for the VM
+ * that owns @mn and returns its result.  The call is made inside an
+ * SRCU read-side section on kvm->srcu and with kvm->mmu_lock held.
+ * @mm is not used.
+ */
+static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ int young, idx;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ spin_lock(&kvm->mmu_lock);
+ /*
+ * Even though we do not flush TLB, this will still adversely
+ * affect performance on pre-Haswell Intel EPT, where there is
+ * no EPT Access Bit to clear so that we have to tear down EPT
+ * tables instead. If we find this unacceptable, we can always
+ * add a parameter to kvm_age_hva so that it effectively doesn't
+ * do anything on clear_young.
+ *
+ * Also note that currently we never issue secondary TLB flushes
+ * from clear_young, leaving this job up to the regular system
+ * cadence. If we find this inaccurate, we might come up with a
+ * more sophisticated heuristic later.
+ */
+ young = kvm_age_hva(kvm, start, end);
+ spin_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ return young;
+}
+
static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long address)
@@ -419,6 +449,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
.invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
.invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
.clear_flush_young = kvm_mmu_notifier_clear_flush_young,
+ .clear_young = kvm_mmu_notifier_clear_young,
.test_young = kvm_mmu_notifier_test_young,
.change_pte = kvm_mmu_notifier_change_pte,
.release = kvm_mmu_notifier_release,