author    Stephen Rothwell <sfr@canb.auug.org.au>  2015-08-24 20:51:53 +1000
committer Stephen Rothwell <sfr@canb.auug.org.au>  2015-08-24 20:51:54 +1000
commit    105bbd20597263b83218a5ded6794da83bc132c0
tree      453abf8340dc88e27f9979ead0608a80dda81483
parent    4a2f82bd7c412910eaab7a581e6cfcf6017bca6c
parent    6ea736ae2de25a00864cdc4338b940bccf4a1ca6
Merge branch 'akpm-current/current'
-rw-r--r--  CREDITS | 4
-rw-r--r--  Documentation/DMA-API.txt | 7
-rw-r--r--  Documentation/blockdev/zram.txt | 3
-rw-r--r--  Documentation/devicetree/bindings/w1/omap-hdq.txt | 7
-rw-r--r--  Documentation/features/vm/TLB/arch-support.txt | 40
-rw-r--r--  Documentation/filesystems/dax.txt | 7
-rw-r--r--  Documentation/filesystems/ocfs2-online-filecheck.txt | 95
-rw-r--r--  Documentation/filesystems/proc.txt | 46
-rw-r--r--  Documentation/filesystems/vfat.txt | 10
-rw-r--r--  Documentation/ioctl/ioctl-number.txt | 1
-rw-r--r--  Documentation/printk-formats.txt | 8
-rw-r--r--  Documentation/sysrq.txt | 3
-rw-r--r--  Documentation/vm/00-INDEX | 2
-rw-r--r--  Documentation/vm/hugetlbpage.txt | 15
-rw-r--r--  Documentation/vm/idle_page_tracking.txt | 98
-rw-r--r--  Documentation/vm/pagemap.txt | 28
-rw-r--r--  Documentation/vm/userfaultfd.txt | 144
-rw-r--r--  Documentation/w1/masters/omap-hdq | 6
-rw-r--r--  MAINTAINERS | 3
-rw-r--r--  arch/Kconfig | 3
-rw-r--r--  arch/alpha/include/uapi/asm/mman.h | 4
-rw-r--r--  arch/arm/Kconfig | 1
-rw-r--r--  arch/arm/boot/compressed/decompress.c | 2
-rw-r--r--  arch/arm/include/asm/pgtable-3level.h | 1
-rw-r--r--  arch/arm/mach-at91/pm.c | 2
-rw-r--r--  arch/arm/mach-imx/pm-imx5.c | 2
-rw-r--r--  arch/arm/mach-imx/pm-imx6.c | 2
-rw-r--r--  arch/arm/mach-socfpga/pm.c | 2
-rw-r--r--  arch/arm/mach-vexpress/spc.c | 2
-rw-r--r--  arch/arm64/include/asm/pgtable.h | 2
-rw-r--r--  arch/arm64/kernel/setup.c | 62
-rw-r--r--  arch/h8300/boot/compressed/misc.c | 2
-rw-r--r--  arch/ia64/Kconfig | 1
-rw-r--r--  arch/m32r/boot/compressed/misc.c | 3
-rw-r--r--  arch/m68k/Kconfig | 1
-rw-r--r--  arch/mips/Kconfig | 1
-rw-r--r--  arch/mips/boot/compressed/decompress.c | 4
-rw-r--r--  arch/mips/include/uapi/asm/mman.h | 7
-rw-r--r--  arch/mips/include/uapi/asm/unistd.h | 15
-rw-r--r--  arch/mips/kernel/scall32-o32.S | 1
-rw-r--r--  arch/mips/kernel/scall64-64.S | 1
-rw-r--r--  arch/mips/kernel/scall64-n32.S | 1
-rw-r--r--  arch/mips/kernel/scall64-o32.S | 1
-rw-r--r--  arch/parisc/include/uapi/asm/mman.h | 4
-rw-r--r--  arch/powerpc/Kconfig | 1
-rw-r--r--  arch/powerpc/include/asm/pgtable-ppc64.h | 2
-rw-r--r--  arch/powerpc/include/asm/systbl.h | 1
-rw-r--r--  arch/powerpc/include/asm/unistd.h | 2
-rw-r--r--  arch/powerpc/include/uapi/asm/mman.h | 1
-rw-r--r--  arch/powerpc/include/uapi/asm/unistd.h | 1
-rw-r--r--  arch/powerpc/platforms/512x/clock-commonclk.c | 4
-rw-r--r--  arch/s390/Kconfig | 1
-rw-r--r--  arch/s390/boot/compressed/misc.c | 2
-rw-r--r--  arch/sh/Kconfig | 1
-rw-r--r--  arch/sh/boot/compressed/misc.c | 2
-rw-r--r--  arch/sh/mm/init.c | 4
-rw-r--r--  arch/sh/mm/numa.c | 4
-rw-r--r--  arch/sparc/include/asm/pgtable_32.h | 2
-rw-r--r--  arch/sparc/include/asm/pgtable_64.h | 9
-rw-r--r--  arch/sparc/include/uapi/asm/mman.h | 1
-rw-r--r--  arch/tile/Kconfig | 1
-rw-r--r--  arch/tile/include/uapi/asm/mman.h | 1
-rw-r--r--  arch/unicore32/boot/compressed/misc.c | 4
-rw-r--r--  arch/x86/Kconfig | 4
-rw-r--r--  arch/x86/boot/compressed/misc.c | 3
-rw-r--r--  arch/x86/boot/header.S | 2
-rw-r--r--  arch/x86/entry/syscalls/syscall_32.tbl | 2
-rw-r--r--  arch/x86/entry/syscalls/syscall_64.tbl | 2
-rw-r--r--  arch/x86/include/asm/kdebug.h | 2
-rw-r--r--  arch/x86/include/asm/pgtable.h | 5
-rw-r--r--  arch/x86/include/asm/tlbflush.h | 6
-rw-r--r--  arch/x86/kernel/Makefile | 4
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c | 9
-rw-r--r--  arch/x86/kernel/kvmclock.c | 4
-rw-r--r--  arch/x86/kernel/machine_kexec_64.c | 1
-rw-r--r--  arch/x86/kernel/reboot.c | 4
-rw-r--r--  arch/x86/kernel/setup.c | 24
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 2
-rw-r--r--  arch/x86/kvm/vmx.c | 8
-rw-r--r--  arch/x86/mm/numa.c | 6
-rw-r--r--  arch/x86/mm/tlb.c | 1
-rw-r--r--  arch/x86/platform/efi/efi.c | 4
-rw-r--r--  arch/x86/platform/uv/uv_nmi.c | 6
-rw-r--r--  arch/xtensa/include/uapi/asm/mman.h | 7
-rw-r--r--  block/genhd.c | 2
-rw-r--r--  drivers/acpi/acpi_apd.c | 3
-rw-r--r--  drivers/acpi/acpi_lpss.c | 2
-rw-r--r--  drivers/block/zram/zram_drv.c | 30
-rw-r--r--  drivers/block/zram/zram_drv.h | 1
-rw-r--r--  drivers/clk/clk-mb86s7x.c | 2
-rw-r--r--  drivers/clk/clk-moxart.c | 4
-rw-r--r--  drivers/clk/samsung/clk-pll.c | 2
-rw-r--r--  drivers/clk/samsung/clk.c | 9
-rw-r--r--  drivers/clk/tegra/clk-tegra-pmc.c | 2
-rw-r--r--  drivers/clk/tegra/clk.c | 2
-rw-r--r--  drivers/crypto/qat/qat_common/adf_transport_debug.c | 16
-rw-r--r--  drivers/firmware/efi/Kconfig | 2
-rw-r--r--  drivers/gpu/drm/drm_vm.c | 8
-rw-r--r--  drivers/media/platform/coda/coda-common.c | 2
-rw-r--r--  drivers/misc/sram.c | 8
-rw-r--r--  drivers/net/wireless/ath/wil6210/debugfs.c | 35
-rw-r--r--  drivers/parisc/ccio-dma.c | 13
-rw-r--r--  drivers/parisc/sba_iommu.c | 9
-rw-r--r--  drivers/pci/pci-driver.c | 2
-rw-r--r--  drivers/s390/crypto/zcrypt_api.c | 10
-rw-r--r--  drivers/tty/sysrq.c | 11
-rw-r--r--  drivers/video/console/Kconfig | 2
-rw-r--r--  drivers/w1/masters/omap_hdq.c | 224
-rw-r--r--  fs/9p/v9fs.c | 8
-rw-r--r--  fs/9p/vfs_file.c | 3
-rw-r--r--  fs/Makefile | 1
-rw-r--r--  fs/affs/super.c | 8
-rw-r--r--  fs/aio.c | 27
-rw-r--r--  fs/binfmt_misc.c | 12
-rw-r--r--  fs/block_dev.c | 1
-rw-r--r--  fs/cachefiles/daemon.c | 84
-rw-r--r--  fs/ceph/super.c | 2
-rw-r--r--  fs/cifs/cifsfs.c | 6
-rw-r--r--  fs/cifs/file.c | 8
-rw-r--r--  fs/coda/upcall.c | 6
-rw-r--r--  fs/coredump.c | 46
-rw-r--r--  fs/dax.c | 197
-rw-r--r--  fs/dcache.c | 2
-rw-r--r--  fs/exofs/super.c | 8
-rw-r--r--  fs/ext2/file.c | 10
-rw-r--r--  fs/ext2/inode.c | 1
-rw-r--r--  fs/ext2/super.c | 6
-rw-r--r--  fs/ext4/ext4.h | 2
-rw-r--r--  fs/ext4/file.c | 68
-rw-r--r--  fs/ext4/fsync.c | 5
-rw-r--r--  fs/ext4/indirect.c | 1
-rw-r--r--  fs/ext4/inode.c | 12
-rw-r--r--  fs/ext4/super.c | 19
-rw-r--r--  fs/fat/cache.c | 79
-rw-r--r--  fs/fat/dir.c | 2
-rw-r--r--  fs/fat/fat.h | 6
-rw-r--r--  fs/fat/file.c | 61
-rw-r--r--  fs/fat/inode.c | 75
-rw-r--r--  fs/gfs2/super.c | 6
-rw-r--r--  fs/hfs/bnode.c | 9
-rw-r--r--  fs/hfs/brec.c | 20
-rw-r--r--  fs/hfs/super.c | 4
-rw-r--r--  fs/hfsplus/bnode.c | 3
-rw-r--r--  fs/hfsplus/options.c | 4
-rw-r--r--  fs/hostfs/hostfs_kern.c | 2
-rw-r--r--  fs/hugetlbfs/inode.c | 302
-rw-r--r--  fs/inode.c | 2
-rw-r--r--  fs/libfs.c | 26
-rw-r--r--  fs/mpage.c | 23
-rw-r--r--  fs/namespace.c | 4
-rw-r--r--  fs/notify/dnotify/dnotify.c | 14
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 8
-rw-r--r--  fs/notify/fdinfo.c | 3
-rw-r--r--  fs/notify/fsnotify.c | 11
-rw-r--r--  fs/notify/fsnotify.h | 21
-rw-r--r--  fs/notify/inode_mark.c | 20
-rw-r--r--  fs/notify/mark.c | 113
-rw-r--r--  fs/notify/vfsmount_mark.c | 19
-rw-r--r--  fs/ntfs/super.c | 21
-rw-r--r--  fs/ocfs2/Makefile | 3
-rw-r--r--  fs/ocfs2/acl.c | 26
-rw-r--r--  fs/ocfs2/alloc.c | 245
-rw-r--r--  fs/ocfs2/aops.c | 54
-rw-r--r--  fs/ocfs2/buffer_head_io.c | 6
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 123
-rw-r--r--  fs/ocfs2/cluster/nodemanager.c | 50
-rw-r--r--  fs/ocfs2/dir.c | 70
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 22
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 10
-rw-r--r--  fs/ocfs2/dlmglue.c | 2
-rw-r--r--  fs/ocfs2/extent_map.c | 22
-rw-r--r--  fs/ocfs2/file.c | 25
-rw-r--r--  fs/ocfs2/filecheck.c | 571
-rw-r--r--  fs/ocfs2/filecheck.h | 48
-rw-r--r--  fs/ocfs2/inode.c | 245
-rw-r--r--  fs/ocfs2/inode.h | 5
-rw-r--r--  fs/ocfs2/journal.c | 32
-rw-r--r--  fs/ocfs2/localalloc.c | 3
-rw-r--r--  fs/ocfs2/move_extents.c | 8
-rw-r--r--  fs/ocfs2/namei.c | 100
-rw-r--r--  fs/ocfs2/ocfs2.h | 2
-rw-r--r--  fs/ocfs2/ocfs2_trace.h | 2
-rw-r--r--  fs/ocfs2/quota_local.c | 3
-rw-r--r--  fs/ocfs2/refcounttree.c | 81
-rw-r--r--  fs/ocfs2/stack_user.c | 50
-rw-r--r--  fs/ocfs2/stackglue.c | 3
-rw-r--r--  fs/ocfs2/stackglue.h | 2
-rw-r--r--  fs/ocfs2/suballoc.c | 90
-rw-r--r--  fs/ocfs2/super.c | 78
-rw-r--r--  fs/ocfs2/super.h | 8
-rw-r--r--  fs/ocfs2/xattr.c | 51
-rw-r--r--  fs/overlayfs/super.c | 6
-rw-r--r--  fs/proc/array.c | 5
-rw-r--r--  fs/proc/base.c | 113
-rw-r--r--  fs/proc/generic.c | 44
-rw-r--r--  fs/proc/page.c | 65
-rw-r--r--  fs/proc/task_mmu.c | 323
-rw-r--r--  fs/reiserfs/super.c | 8
-rw-r--r--  fs/seq_file.c | 42
-rw-r--r--  fs/super.c | 8
-rw-r--r--  fs/userfaultfd.c | 1330
-rw-r--r--  fs/xfs/xfs_buf.h | 1
-rw-r--r--  fs/xfs/xfs_file.c | 30
-rw-r--r--  fs/xfs/xfs_super.c | 4
-rw-r--r--  fs/xfs/xfs_trace.h | 1
-rw-r--r--  include/asm-generic/early_ioremap.h | 6
-rw-r--r--  include/linux/crc64_ecma.h | 56
-rw-r--r--  include/linux/cred.h | 8
-rw-r--r--  include/linux/dax.h | 39
-rw-r--r--  include/linux/dmapool.h | 6
-rw-r--r--  include/linux/fs.h | 15
-rw-r--r--  include/linux/fsnotify_backend.h | 55
-rw-r--r--  include/linux/genalloc.h | 6
-rw-r--r--  include/linux/gfp.h | 5
-rw-r--r--  include/linux/huge_mm.h | 24
-rw-r--r--  include/linux/hugetlb.h | 41
-rw-r--r--  include/linux/kernel.h | 129
-rw-r--r--  include/linux/kexec.h | 19
-rw-r--r--  include/linux/kmod.h | 2
-rw-r--r--  include/linux/kthread.h | 2
-rw-r--r--  include/linux/memblock.h | 4
-rw-r--r--  include/linux/memcontrol.h | 392
-rw-r--r--  include/linux/mm.h | 79
-rw-r--r--  include/linux/mm_types.h | 28
-rw-r--r--  include/linux/mmu_notifier.h | 46
-rw-r--r--  include/linux/mmzone.h | 8
-rw-r--r--  include/linux/nmi.h | 15
-rw-r--r--  include/linux/oom.h | 38
-rw-r--r--  include/linux/page-flags.h | 248
-rw-r--r--  include/linux/page-isolation.h | 5
-rw-r--r--  include/linux/page_ext.h | 4
-rw-r--r--  include/linux/page_idle.h | 110
-rw-r--r--  include/linux/pagemap.h | 25
-rw-r--r--  include/linux/parse-integer.h | 188
-rw-r--r--  include/linux/pci.h | 2
-rw-r--r--  include/linux/poison.h | 17
-rw-r--r--  include/linux/printk.h | 14
-rw-r--r--  include/linux/rmap.h | 12
-rw-r--r--  include/linux/sched.h | 23
-rw-r--r--  include/linux/seq_file.h | 39
-rw-r--r--  include/linux/slab.h | 10
-rw-r--r--  include/linux/smpboot.h | 11
-rw-r--r--  include/linux/string.h | 1
-rw-r--r--  include/linux/string_helpers.h | 14
-rw-r--r--  include/linux/swap.h | 22
-rw-r--r--  include/linux/swapops.h | 37
-rw-r--r--  include/linux/syscalls.h | 3
-rw-r--r--  include/linux/userfaultfd_k.h | 85
-rw-r--r--  include/linux/vm_event_item.h | 1
-rw-r--r--  include/linux/wait.h | 5
-rw-r--r--  include/linux/watchdog.h | 8
-rw-r--r--  include/linux/zbud.h | 2
-rw-r--r--  include/linux/zpool.h | 6
-rw-r--r--  include/linux/zsmalloc.h | 6
-rw-r--r--  include/net/sock.h | 33
-rw-r--r--  include/trace/events/tlb.h | 3
-rw-r--r--  include/uapi/asm-generic/mman-common.h | 6
-rw-r--r--  include/uapi/asm-generic/mman.h | 1
-rw-r--r--  include/uapi/asm-generic/unistd.h | 4
-rw-r--r--  include/uapi/linux/Kbuild | 1
-rw-r--r--  include/uapi/linux/kernel-page-flags.h | 1
-rw-r--r--  include/uapi/linux/prctl.h | 7
-rw-r--r--  include/uapi/linux/securebits.h | 11
-rw-r--r--  include/uapi/linux/userfaultfd.h | 169
-rw-r--r--  init/Kconfig | 22
-rw-r--r--  init/initramfs.c | 4
-rw-r--r--  init/main.c | 1
-rw-r--r--  ipc/msg.c | 5
-rw-r--r--  ipc/msgutil.c | 2
-rw-r--r--  ipc/shm.c | 4
-rw-r--r--  kernel/Makefile | 2
-rw-r--r--  kernel/cgroup.c | 7
-rw-r--r--  kernel/events/core.c | 2
-rw-r--r--  kernel/extable.c | 1
-rw-r--r--  kernel/fork.c | 4
-rw-r--r--  kernel/kexec.c | 2531
-rw-r--r--  kernel/kexec_core.c | 1534
-rw-r--r--  kernel/kexec_file.c | 1045
-rw-r--r--  kernel/kexec_internal.h | 22
-rw-r--r--  kernel/kmod.c | 100
-rw-r--r--  kernel/ksysfs.c | 6
-rw-r--r--  kernel/kthread.c | 7
-rw-r--r--  kernel/printk/printk.c | 2
-rw-r--r--  kernel/reboot.c | 2
-rw-r--r--  kernel/sched/wait.c | 7
-rw-r--r--  kernel/smpboot.c | 27
-rw-r--r--  kernel/sys_ni.c | 2
-rw-r--r--  kernel/sysctl.c | 12
-rw-r--r--  kernel/user_namespace.c | 1
-rw-r--r--  kernel/watchdog.c | 189
-rw-r--r--  lib/Kconfig | 7
-rw-r--r--  lib/Kconfig.debug | 3
-rw-r--r--  lib/Makefile | 3
-rw-r--r--  lib/bitmap.c | 43
-rw-r--r--  lib/cmdline.c | 44
-rw-r--r--  lib/crc64_ecma.c | 341
-rw-r--r--  lib/decompress_bunzip2.c | 6
-rw-r--r--  lib/decompress_inflate.c | 31
-rw-r--r--  lib/decompress_unlz4.c | 6
-rw-r--r--  lib/decompress_unlzma.c | 9
-rw-r--r--  lib/decompress_unlzo.c | 13
-rw-r--r--  lib/decompress_unxz.c | 12
-rw-r--r--  lib/genalloc.c | 110
-rw-r--r--  lib/kstrtox.c | 254
-rw-r--r--  lib/kstrtox.h | 1
-rw-r--r--  lib/parse-integer.c | 222
-rw-r--r--  lib/parser.c | 33
-rw-r--r--  lib/show_mem.c | 4
-rw-r--r--  lib/string_helpers.c | 20
-rw-r--r--  lib/swiotlb.c | 2
-rw-r--r--  lib/test-kstrtox.c | 6
-rw-r--r--  lib/test-parse-integer.c | 563
-rw-r--r--  lib/vsprintf.c | 101
-rw-r--r--  lib/zlib_deflate/deftree.c | 6
-rw-r--r--  lib/zlib_deflate/defutil.h | 16
-rw-r--r--  mm/Kconfig | 12
-rw-r--r--  mm/Makefile | 2
-rw-r--r--  mm/bootmem.c | 7
-rw-r--r--  mm/compaction.c | 145
-rw-r--r--  mm/debug.c | 5
-rw-r--r--  mm/dmapool.c | 14
-rw-r--r--  mm/early_ioremap.c | 22
-rw-r--r--  mm/filemap.c | 49
-rw-r--r--  mm/gup.c | 70
-rw-r--r--  mm/huge_memory.c | 265
-rw-r--r--  mm/hugetlb.c | 475
-rw-r--r--  mm/hwpoison-inject.c | 7
-rw-r--r--  mm/internal.h | 15
-rw-r--r--  mm/kmemleak.c | 21
-rw-r--r--  mm/ksm.c | 2
-rw-r--r--  mm/madvise.c | 187
-rw-r--r--  mm/memblock.c | 24
-rw-r--r--  mm/memcontrol.c | 497
-rw-r--r--  mm/memory-failure.c | 117
-rw-r--r--  mm/memory.c | 76
-rw-r--r--  mm/memory_hotplug.c | 36
-rw-r--r--  mm/mempolicy.c | 7
-rw-r--r--  mm/mempool.c | 3
-rw-r--r--  mm/memtest.c | 23
-rw-r--r--  mm/migrate.c | 35
-rw-r--r--  mm/mlock.c | 90
-rw-r--r--  mm/mmap.c | 56
-rw-r--r--  mm/mmu_notifier.c | 17
-rw-r--r--  mm/mprotect.c | 3
-rw-r--r--  mm/mremap.c | 50
-rw-r--r--  mm/oom_kill.c | 142
-rw-r--r--  mm/page_alloc.c | 74
-rw-r--r--  mm/page_ext.c | 4
-rw-r--r--  mm/page_idle.c | 232
-rw-r--r--  mm/page_isolation.c | 42
-rw-r--r--  mm/rmap.c | 180
-rw-r--r--  mm/shmem.c | 18
-rw-r--r--  mm/slab.c | 15
-rw-r--r--  mm/slab.h | 9
-rw-r--r--  mm/slab_common.c | 28
-rw-r--r--  mm/slob.c | 13
-rw-r--r--  mm/slub.c | 206
-rw-r--r--  mm/swap.c | 47
-rw-r--r--  mm/swap_state.c | 41
-rw-r--r--  mm/swapfile.c | 42
-rw-r--r--  mm/userfaultfd.c | 308
-rw-r--r--  mm/util.c | 40
-rw-r--r--  mm/vmscan.c | 190
-rw-r--r--  mm/vmstat.c | 1
-rw-r--r--  mm/zbud.c | 10
-rw-r--r--  mm/zpool.c | 37
-rw-r--r--  mm/zsmalloc.c | 235
-rw-r--r--  mm/zswap.c | 761
-rw-r--r--  net/ceph/ceph_common.c | 7
-rw-r--r--  net/sunrpc/sched.c | 2
-rwxr-xr-x  scripts/Lindent | 3
-rwxr-xr-x  scripts/checkpatch.pl | 162
-rw-r--r--  scripts/coccinelle/api/alloc/pool_zalloc-simple.cocci | 84
-rwxr-xr-x  scripts/decode_stacktrace.sh | 5
-rwxr-xr-x  scripts/kernel-doc | 38
-rw-r--r--  scripts/spelling.txt | 5
-rw-r--r--  security/commoncap.c | 103
-rw-r--r--  security/keys/process_keys.c | 1
-rw-r--r--  security/selinux/hooks.c | 2
-rw-r--r--  sound/core/oss/mixer_oss.c | 7
-rw-r--r--  sound/core/oss/pcm_oss.c | 13
-rw-r--r--  sound/core/pcm.c | 13
-rw-r--r--  sound/core/pcm_memory.c | 11
-rw-r--r--  sound/pci/ac97/ac97_codec.c | 9
-rw-r--r--  sound/soc/soc-core.c | 9
-rw-r--r--  tools/testing/selftests/vm/Makefile | 8
-rw-r--r--  tools/testing/selftests/vm/hugetlbfstest.c | 86
-rw-r--r--  tools/testing/selftests/vm/mlock2-tests.c | 657
-rw-r--r--  tools/testing/selftests/vm/on-fault-limit.c | 47
-rwxr-xr-x  tools/testing/selftests/vm/run_vmtests | 30
-rw-r--r--  tools/testing/selftests/vm/userfaultfd.c | 638
-rw-r--r--  tools/vm/page-types.c | 35
-rw-r--r--  virt/kvm/kvm_main.c | 31
393 files changed, 16477 insertions, 6716 deletions
diff --git a/CREDITS b/CREDITS
index bcb8efaa9459..8207cc62ee9d 100644
--- a/CREDITS
+++ b/CREDITS
@@ -2992,6 +2992,10 @@ S: 2200 Mission College Blvd
S: Santa Clara, CA 95052
S: USA
+N: Anil Ravindranath
+E: anil_ravindranath@pmc-sierra.com
+D: PMC-Sierra MaxRAID driver
+
N: Eric S. Raymond
E: esr@thyrsus.com
W: http://www.tuxedo.org/~esr/
diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index 7eba542eff7c..edccacd4f048 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -104,6 +104,13 @@ crossing restrictions, pass 0 for alloc; passing 4096 says memory allocated
from this pool must not cross 4KByte boundaries.
+ void *dma_pool_zalloc(struct dma_pool *pool, gfp_t mem_flags,
+ dma_addr_t *handle)
+
+Wraps dma_pool_alloc() and also zeroes the returned memory if the
+allocation attempt succeeded.
+
+
void *dma_pool_alloc(struct dma_pool *pool, gfp_t gfp_flags,
dma_addr_t *dma_handle);
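
A minimal in-kernel sketch (not taken from this patch) of how the helper documented above is typically used; the pool name, the device pointer "dev" and the block size are illustrative only:

    /* Hypothetical driver fragment; 'dev' is assumed to be a valid struct device *. */
    struct dma_pool *pool;
    dma_addr_t handle;
    void *vaddr;

    pool = dma_pool_create("example_pool", dev, 64, 64, 0);
    if (!pool)
        return -ENOMEM;

    /* Zeroed on success, so no separate memset() is needed. */
    vaddr = dma_pool_zalloc(pool, GFP_KERNEL, &handle);
    if (!vaddr) {
        dma_pool_destroy(pool);
        return -ENOMEM;
    }

    /* ... hand 'handle' to the device, touch the buffer through 'vaddr' ... */

    dma_pool_free(pool, vaddr, handle);
    dma_pool_destroy(pool);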
diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index c4de576093af..62435bb25266 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -144,7 +144,8 @@ mem_used_max RW the maximum amount memory zram have consumed to
store compressed data
mem_limit RW the maximum amount of memory ZRAM can use to store
the compressed data
-num_migrated RO the number of objects migrated migrated by compaction
+pages_compacted RO the number of pages freed during compaction
+ (available only via zram<id>/mm_stat node)
compact WO trigger memory compaction
WARNING
diff --git a/Documentation/devicetree/bindings/w1/omap-hdq.txt b/Documentation/devicetree/bindings/w1/omap-hdq.txt
index fef794741bd1..913c5f91a0f9 100644
--- a/Documentation/devicetree/bindings/w1/omap-hdq.txt
+++ b/Documentation/devicetree/bindings/w1/omap-hdq.txt
@@ -1,11 +1,15 @@
* OMAP HDQ One wire bus master controller
Required properties:
-- compatible : should be "ti,omap3-1w"
+- compatible : should be "ti,omap3-1w" or "ti,am4372-hdq"
- reg : Address and length of the register set for the device
- interrupts : interrupt line.
- ti,hwmods : "hdq1w"
+Optional properties:
+- ti,mode: should be "hdq" for HDQ mode or "1w" for one-wire mode.
+ If not specified, HDQ mode is implied.
+
Example:
- From omap3.dtsi
@@ -14,4 +18,5 @@ Example:
reg = <0x480b2000 0x1000>;
interrupts = <58>;
ti,hwmods = "hdq1w";
+ ti,mode = "hdq";
};
diff --git a/Documentation/features/vm/TLB/arch-support.txt b/Documentation/features/vm/TLB/arch-support.txt
new file mode 100644
index 000000000000..261b92e2fb1a
--- /dev/null
+++ b/Documentation/features/vm/TLB/arch-support.txt
@@ -0,0 +1,40 @@
+#
+# Feature name: batch-unmap-tlb-flush
+# Kconfig: ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+# description: arch supports deferral of TLB flush until multiple pages are unmapped
+#
+ -----------------------
+ | arch |status|
+ -----------------------
+ | alpha: | TODO |
+ | arc: | TODO |
+ | arm: | TODO |
+ | arm64: | TODO |
+ | avr32: | .. |
+ | blackfin: | TODO |
+ | c6x: | .. |
+ | cris: | .. |
+ | frv: | .. |
+ | h8300: | .. |
+ | hexagon: | TODO |
+ | ia64: | TODO |
+ | m32r: | TODO |
+ | m68k: | .. |
+ | metag: | TODO |
+ | microblaze: | .. |
+ | mips: | TODO |
+ | mn10300: | TODO |
+ | nios2: | .. |
+ | openrisc: | .. |
+ | parisc: | TODO |
+ | powerpc: | TODO |
+ | s390: | TODO |
+ | score: | .. |
+ | sh: | TODO |
+ | sparc: | TODO |
+ | tile: | TODO |
+ | um: | .. |
+ | unicore32: | .. |
+ | x86: | ok |
+ | xtensa: | TODO |
+ -----------------------
diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt
index 7af2851d667c..7bde64014a89 100644
--- a/Documentation/filesystems/dax.txt
+++ b/Documentation/filesystems/dax.txt
@@ -60,9 +60,10 @@ Filesystem support consists of
- implementing the direct_IO address space operation, and calling
dax_do_io() instead of blockdev_direct_IO() if S_DAX is set
- implementing an mmap file operation for DAX files which sets the
- VM_MIXEDMAP flag on the VMA, and setting the vm_ops to include handlers
- for fault and page_mkwrite (which should probably call dax_fault() and
- dax_mkwrite(), passing the appropriate get_block() callback)
+ VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to
+ include handlers for fault, pmd_fault and page_mkwrite (which should
+ probably call dax_fault(), dax_pmd_fault() and dax_mkwrite(), passing the
+ appropriate get_block() callback)
- calling dax_truncate_page() instead of block_truncate_page() for DAX files
- calling dax_zero_page_range() instead of zero_user() for DAX files
- ensuring that there is sufficient locking between reads, writes,
diff --git a/Documentation/filesystems/ocfs2-online-filecheck.txt b/Documentation/filesystems/ocfs2-online-filecheck.txt
new file mode 100644
index 000000000000..d3192372e117
--- /dev/null
+++ b/Documentation/filesystems/ocfs2-online-filecheck.txt
@@ -0,0 +1,95 @@
+ OCFS2 online file check
+ -----------------------
+
+This document describes the OCFS2 online file check feature.
+
+Introduction
+============
+OCFS2 is often used in high-availability systems. However, OCFS2 usually
+converts the filesystem to read-only on errors. This may not be necessary, since
+turning the filesystem read-only would affect other running processes as well,
+decreasing availability. To address this, a mount option (errors=continue) was
+introduced, which returns EIO to the calling process and terminates further
+processing so that the filesystem is not corrupted further. The filesystem is
+not converted to read-only, and the problematic file's inode number is reported
+in the kernel log. The user can then try to check/fix this file via the online
+filecheck feature.
+
+Scope
+=====
+This effort is to check/fix small issues which may hinder day-to-day operations
+of a cluster filesystem by turning the filesystem read-only. The scope of
+checking/fixing is at the file level, initially for regular files and eventually
+for all files (including system files) of the filesystem.
+
+In case a directory-to-file link is incorrect, the directory inode is
+reported as erroneous.
+
+This feature is not suited for extravagant checks which involve dependencies on
+other components of the filesystem, such as, but not limited to, checking if the
+bits for file blocks in the allocation have been set. In case of such an error,
+the offline fsck is recommended.
+
+Finally, such an operation/feature should not be automated lest the filesystem
+end up with more damage than before the repair attempt. So, this has to
+be performed with user interaction and consent.
+
+User interface
+==============
+When there are errors in the OCFS2 filesystem, they are usually accompanied
+by the inode number which caused the error. This inode number would be the
+input to check/fix the file.
+
+There is a sysfs file for each OCFS2 file system mounting:
+
+ /sys/fs/ocfs2/<devname>/filecheck
+
+Here, <devname> indicates the name of the OCFS2 volume device which has already
+been mounted. The file above accepts inode numbers. This is used to
+communicate with kernel space, telling it which file (inode number) will be
+checked or fixed. Currently, three operations are supported: checking an inode,
+fixing an inode, and setting the size of the result record history.
+
+1. If you want to know what error exactly happened to <inode> before fixing, do
+
+ # echo "CHECK <inode>" > /sys/fs/ocfs2/<devname>/filecheck
+ # cat /sys/fs/ocfs2/<devname>/filecheck
+
+The output is like this:
+ INO TYPE DONE ERROR
+39502 0 1 GENERATION
+
+<INO> lists the inode numbers.
+<TYPE> indicates the type of operation performed: 0 for inode check, 1 for inode fix.
+<DONE> indicates whether the operation has been finished.
+<ERROR> says what kind of error was found. For the detailed error numbers,
+please refer to the file linux/fs/ocfs2/filecheck.h.
+
+2. If you decide to fix this inode, do
+
+ # echo "FIX <inode>" > /sys/fs/ocfs2/<devname>/filecheck
+ # cat /sys/fs/ocfs2/<devname>/filecheck
+
+The output is like this:
+ INO TYPE DONE ERROR
+39502 1 1 SUCCESS
+
+This time, the <ERROR> column indicates whether the fix was successful or not.
+
+3. The record cache is used to store the history of check/fix results. Its
+default size is 10, and it can be adjusted within the range of 10 ~ 100. You can
+adjust the size like this:
+
+ # echo "SET <size>" > /sys/fs/ocfs2/<devname>/filecheck
+
+Fixing stuff
+============
+On receiving the inode number, the filesystem reads the inode and the
+file metadata. In case of errors, the filesystem fixes the errors
+and reports the problems it fixed in the kernel log. As a precautionary measure,
+the inode must first be checked for errors before performing a final fix.
+
+The inode and the result history will be maintained temporarily in a
+small linked-list buffer which contains the last (N) inodes
+fixed/checked. The detailed errors which were fixed/checked are printed in the
+kernel log.
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 6f7fafde0884..3e75cabb28ce 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -174,6 +174,7 @@ read the file /proc/PID/status:
VmLib: 1412 kB
VmPTE: 20 kb
VmSwap: 0 kB
+ HugetlbPages: 0 kB (0*2048kB)
Threads: 1
SigQ: 0/28578
SigPnd: 0000000000000000
@@ -237,6 +238,8 @@ Table 1-2: Contents of the status files (as of 4.1)
VmPTE size of page table entries
VmPMD size of second level page tables
VmSwap size of swap usage (the number of referred swapents)
+ HugetlbPages size of hugetlb memory portions (with additional info
+ about number of mapped hugepages for each page size)
Threads number of threads
SigQ number of signals queued/max. number for queue
SigPnd bitmap of pending signals for the thread
@@ -423,7 +426,10 @@ Private_Clean: 0 kB
Private_Dirty: 0 kB
Referenced: 892 kB
Anonymous: 0 kB
+AnonHugePages: 0 kB
+HugetlbPages: 0 kB
Swap: 0 kB
+SwapPss: 0 kB
KernelPageSize: 4 kB
MMUPageSize: 4 kB
Locked: 374 kB
@@ -433,19 +439,36 @@ the first of these lines shows the same information as is displayed for the
mapping in /proc/PID/maps. The remaining lines show the size of the mapping
(size), the amount of the mapping that is currently resident in RAM (RSS), the
process' proportional share of this mapping (PSS), the number of clean and
-dirty private pages in the mapping. Note that even a page which is part of a
-MAP_SHARED mapping, but has only a single pte mapped, i.e. is currently used
-by only one process, is accounted as private and not as shared. "Referenced"
-indicates the amount of memory currently marked as referenced or accessed.
-"Anonymous" shows the amount of memory that does not belong to any file. Even
-a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE
-and a page is modified, the file page is replaced by a private anonymous copy.
+dirty private pages in the mapping.
+
+The "proportional set size" (PSS) of a process is the count of pages it has
+in memory, where each page is divided by the number of processes sharing it.
+So if a process has 1000 pages all to itself, and 1000 shared with one other
+process, its PSS will be 1500.
+
+Note that even a page which is part of a MAP_SHARED mapping, but has only
+a single pte mapped, i.e. is currently used by only one process, is
+accounted as private and not as shared.
+
+"Referenced" indicates the amount of memory currently marked as referenced or
+accessed.
+
+"Anonymous" shows the amount of memory that does not belong to any file.
+Even a mapping associated with a file may contain anonymous pages: when
+MAP_PRIVATE and a page is modified, the file page is replaced by a private
+anonymous copy.
+
+"AnonHugePages" shows the ammount of memory backed by transparent hugepage.
+
+"HugetlbPages" shows the ammount of memory backed by hugetlbfs page.
+
"Swap" shows how much would-be-anonymous memory is also used, but out on
-swap.
+swap. SwapPss" shows proportional swap share of this mapping.
+
+"VmFlags" field deserves a separate description. This member represents
+the kernel flags associated with the particular virtual memory area in two
+letter encoded manner. The codes are the following:
-"VmFlags" field deserves a separate description. This member represents the kernel
-flags associated with the particular virtual memory area in two letter encoded
-manner. The codes are the following:
rd - readable
wr - writeable
ex - executable
@@ -463,6 +486,7 @@ manner. The codes are the following:
rr - random read advise provided
dc - do not copy area on fork
de - do not expand area on remapping
+ lf - mark area to lock pages when faulted in, do not pre-populate
ac - area is accountable
nr - swap space is not reserved for the area
ht - area uses huge tlb pages
diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt
index ce1126aceed8..223c32171dcc 100644
--- a/Documentation/filesystems/vfat.txt
+++ b/Documentation/filesystems/vfat.txt
@@ -180,6 +180,16 @@ dos1xfloppy -- If set, use a fallback default BIOS Parameter Block
<bool>: 0,1,yes,no,true,false
+LIMITATION
+---------------------------------------------------------------------
+* The fallocated region of a file is discarded at umount/evict time
+ when using fallocate with FALLOC_FL_KEEP_SIZE.
+ So, the user should assume that the fallocated region can be discarded at
+ the last close if there is memory pressure resulting in eviction of
+ the inode from memory. As a result, for any dependency on
+ the fallocated region, the user should make sure to recheck fallocate
+ after reopening the file.
+
TODO
----------------------------------------------------------------------
* Need to get rid of the raw scanning stuff. Instead, always use
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index ab288fbad3d3..4285e0746f34 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -304,6 +304,7 @@ Code Seq#(hex) Include File Comments
0xA3 80-8F Port ACL in development:
<mailto:tlewis@mindspring.com>
0xA3 90-9F linux/dtlk.h
+0xAA 00-3F linux/uapi/linux/userfaultfd.h
0xAB 00-1F linux/nbd.h
0xAC 00-1F linux/raw.h
0xAD 00 Netfilter device in development:
diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt
index 2216eb187c21..2ec6d84f391c 100644
--- a/Documentation/printk-formats.txt
+++ b/Documentation/printk-formats.txt
@@ -244,6 +244,14 @@ dentry names:
Passed by reference.
+task_struct comm name:
+
+ %pT
+
+ For printing task_struct->comm.
+
+ Passed by reference (NULL for "current").
+
struct va_format:
%pV
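
A hedged usage example of the new specifier, assuming this patch is applied; "tsk" is an illustrative struct task_struct pointer:

    pr_info("scanning task %pT (pid %d)\n", tsk, task_pid_nr(tsk));
    pr_info("triggered from %pT\n", NULL);  /* NULL prints current->comm */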
diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
index 267f39386f99..13f5619b2203 100644
--- a/Documentation/sysrq.txt
+++ b/Documentation/sysrq.txt
@@ -75,7 +75,8 @@ On all - write a character to /proc/sysrq-trigger. e.g.:
'e' - Send a SIGTERM to all processes, except for init.
-'f' - Will call oom_kill to kill a memory hog process.
+'f' - Will call the oom killer to kill a memory hog process, but do not
+ panic if nothing can be killed.
'g' - Used by kgdb (kernel debugger)
diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX
index 081c49777abb..6a5e2a102a45 100644
--- a/Documentation/vm/00-INDEX
+++ b/Documentation/vm/00-INDEX
@@ -14,6 +14,8 @@ hugetlbpage.txt
- a brief summary of hugetlbpage support in the Linux kernel.
hwpoison.txt
- explains what hwpoison is
+idle_page_tracking.txt
+ - description of the idle page tracking feature.
ksm.txt
- how to use the Kernel Samepage Merging feature.
numa
diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt
index 030977fb8d2d..54dd9b9c6c31 100644
--- a/Documentation/vm/hugetlbpage.txt
+++ b/Documentation/vm/hugetlbpage.txt
@@ -329,7 +329,14 @@ Examples
3) hugepage-mmap: see tools/testing/selftests/vm/hugepage-mmap.c
-4) The libhugetlbfs (http://libhugetlbfs.sourceforge.net) library provides a
- wide range of userspace tools to help with huge page usability, environment
- setup, and control. Furthermore it provides useful test cases that should be
- used when modifying code to ensure no regressions are introduced.
+4) The libhugetlbfs (https://github.com/libhugetlbfs/libhugetlbfs) library
+ provides a wide range of userspace tools to help with huge page usability,
+ environment setup, and control.
+
+Kernel development regression testing
+=====================================
+
+The most complete set of hugetlb tests are in the libhugetlbfs repository.
+If you modify any hugetlb related code, use the libhugetlbfs test suite
+to check for regressions. In addition, if you add any new hugetlb
+functionality, please add appropriate tests to libhugetlbfs.
diff --git a/Documentation/vm/idle_page_tracking.txt b/Documentation/vm/idle_page_tracking.txt
new file mode 100644
index 000000000000..85dcc3bb85dc
--- /dev/null
+++ b/Documentation/vm/idle_page_tracking.txt
@@ -0,0 +1,98 @@
+MOTIVATION
+
+The idle page tracking feature allows tracking which memory pages are being
+accessed by a workload and which are idle. This information can be useful for
+estimating the workload's working set size, which, in turn, can be taken into
+account when configuring the workload parameters, setting memory cgroup limits,
+or deciding where to place the workload within a compute cluster.
+
+It is enabled by CONFIG_IDLE_PAGE_TRACKING=y.
+
+USER API
+
+The idle page tracking API is located at /sys/kernel/mm/page_idle. Currently,
+it consists of a single read-write file, /sys/kernel/mm/page_idle/bitmap.
+
+The file implements a bitmap where each bit corresponds to a memory page. The
+bitmap is represented by an array of 8-byte integers, and the page at PFN #i is
+mapped to bit #i%64 of array element #i/64; the byte order is native. When a bit is
+set, the corresponding page is idle.
+
+A page is considered idle if it has not been accessed since it was marked idle
+(for more details on what "accessed" actually means see the IMPLEMENTATION
+DETAILS section). To mark a page idle one has to set the bit corresponding to
+the page by writing to the file. A value written to the file is OR-ed with the
+current bitmap value.
+
+Only accesses to user memory pages are tracked. These are pages mapped into a
+process address space, page cache and buffer pages, and swap cache pages. For other
+page types (e.g. SLAB pages) an attempt to mark a page idle is silently ignored,
+and hence such pages are never reported idle.
+
+For huge pages the idle flag is set only on the head page, so one has to read
+/proc/kpageflags in order to correctly count idle huge pages.
+
+Reading from or writing to /sys/kernel/mm/page_idle/bitmap will return
+-EINVAL if you are not starting the read/write on an 8-byte boundary, or
+if the size of the read/write is not a multiple of 8 bytes. Writing to
+this file beyond max PFN will return -ENXIO.
+
+That said, in order to estimate the number of pages that are not used by a
+workload, one should:
+
+ 1. Mark all the workload's pages as idle by setting corresponding bits in
+ /sys/kernel/mm/page_idle/bitmap. The pages can be found by reading
+ /proc/pid/pagemap if the workload is represented by a process, or by
+ filtering out alien pages using /proc/kpagecgroup in case the workload is
+ placed in a memory cgroup.
+
+ 2. Wait until the workload accesses its working set.
+
+ 3. Read /sys/kernel/mm/page_idle/bitmap and count the number of bits set. If
+ one wants to ignore certain types of pages, e.g. mlocked pages since they
+ are not reclaimable, he or she can filter them out using /proc/kpageflags.
+
+See Documentation/vm/pagemap.txt for more information about /proc/pid/pagemap,
+/proc/kpageflags, and /proc/kpagecgroup.
+
+IMPLEMENTATION DETAILS
+
+The kernel internally keeps track of accesses to user memory pages in order to
+reclaim unreferenced pages first on memory shortage conditions. A page is
+considered referenced if it has been recently accessed via a process address
+space, in which case one or more PTEs it is mapped to will have the Accessed bit
+set, or marked accessed explicitly by the kernel (see mark_page_accessed()). The
+latter happens when:
+
+ - a userspace process reads or writes a page using a system call (e.g. read(2)
+ or write(2))
+
+ - a page that is used for storing filesystem buffers is read or written,
+ because a process needs filesystem metadata stored in it (e.g. lists a
+ directory tree)
+
+ - a page is accessed by a device driver using get_user_pages()
+
+When a dirty page is written to swap or disk as a result of memory reclaim or
+exceeding the dirty memory limit, it is not marked referenced.
+
+The idle memory tracking feature adds a new page flag, the Idle flag. This flag
+is set manually, by writing to /sys/kernel/mm/page_idle/bitmap (see the USER API
+section), and cleared automatically whenever a page is referenced as defined
+above.
+
+When a page is marked idle, the Accessed bit must be cleared in all PTEs it is
+mapped to, otherwise we will not be able to detect accesses to the page coming
+from a process address space. To avoid interference with the reclaimer, which,
+as noted above, uses the Accessed bit to promote actively referenced pages, one
+more page flag is introduced, the Young flag. When the PTE Accessed bit is
+cleared as a result of setting or updating a page's Idle flag, the Young flag
+is set on the page. The reclaimer treats the Young flag as an extra PTE
+Accessed bit and therefore will consider such a page as referenced.
+
+Since the idle memory tracking feature is based on the memory reclaimer logic,
+it only works with pages that are on an LRU list; other pages are silently
+ignored. That means it will ignore a user memory page if it is isolated, but
+since there are usually not many of them, it should not affect the overall
+result noticeably. In order not to stall scanning of the idle page bitmap,
+locked pages may be skipped too.
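
A small userspace sketch in C (not part of this patch) of the bitmap protocol described above; it assumes CAP_SYS_ADMIN and takes the PFN to probe as a parameter:

    #include <fcntl.h>
    #include <stdint.h>
    #include <unistd.h>

    /*
     * Mark PFN 'pfn' idle, then (after the workload has run) report whether
     * it is still idle.  PFN #i is bit (i % 64) of 64-bit word (i / 64), and
     * every read/write must be 8-byte aligned and a multiple of 8 bytes.
     */
    static int mark_and_test_idle(uint64_t pfn)
    {
        uint64_t word = 1ULL << (pfn % 64);
        off_t off = (pfn / 64) * 8;
        int fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR);

        if (fd < 0)
            return -1;
        /* The written value is OR-ed into the current bitmap. */
        if (pwrite(fd, &word, sizeof(word), off) != sizeof(word))
            goto fail;

        /* ... wait here for the workload to touch its working set ... */

        if (pread(fd, &word, sizeof(word), off) != sizeof(word))
            goto fail;
        close(fd);
        return !!(word & (1ULL << (pfn % 64)));  /* 1: still idle, 0: accessed */
    fail:
        close(fd);
        return -1;
    }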
diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt
index 6bfbc172cdb9..0e1e55588b59 100644
--- a/Documentation/vm/pagemap.txt
+++ b/Documentation/vm/pagemap.txt
@@ -5,7 +5,7 @@ pagemap is a new (as of 2.6.25) set of interfaces in the kernel that allow
userspace programs to examine the page tables and related information by
reading files in /proc.
-There are three components to pagemap:
+There are four components to pagemap:
* /proc/pid/pagemap. This file lets a userspace process find out which
physical frame each virtual page is mapped to. It contains one 64-bit
@@ -16,11 +16,17 @@ There are three components to pagemap:
* Bits 0-4 swap type if swapped
* Bits 5-54 swap offset if swapped
* Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt)
- * Bits 56-60 zero
- * Bit 61 page is file-page or shared-anon
+ * Bit 56 page exclusively mapped (since 4.2)
+ * Bits 57-60 zero
+ * Bit 61 page is file-page or shared-anon (since 3.5)
* Bit 62 page swapped
* Bit 63 page present
+ Since Linux 4.0 only users with the CAP_SYS_ADMIN capability can get PFNs.
+ In 4.0 and 4.1 opens by unprivileged users fail with -EPERM. Starting from
+ 4.2 the PFN field is zeroed if the user does not have CAP_SYS_ADMIN.
+ Reason: information about PFNs helps in exploiting the Rowhammer vulnerability.
+
If the page is not present but in swap, then the PFN contains an
encoding of the swap file number and the page's offset into the
swap. Unmapped pages return a null PFN. This allows determining
@@ -64,6 +70,11 @@ There are three components to pagemap:
22. THP
23. BALLOON
24. ZERO_PAGE
+ 25. IDLE
+
+ * /proc/kpagecgroup. This file contains a 64-bit inode number of the
+ memory cgroup each page is charged to, indexed by PFN. Only available when
+ CONFIG_MEMCG is set.
Short descriptions to the page flags:
@@ -110,6 +121,12 @@ Short descriptions to the page flags:
24. ZERO_PAGE
zero page for pfn_zero or huge_zero page
+25. IDLE
+ page has not been accessed since it was marked idle (see
+ Documentation/vm/idle_page_tracking.txt). Note that this flag may be
+ stale in case the page was accessed via a PTE. To make sure the flag
+ is up-to-date one has to read /sys/kernel/mm/page_idle/bitmap first.
+
[IO related page flags]
1. ERROR IO error occurred
3. UPTODATE page has up-to-date data
@@ -159,3 +176,8 @@ Other notes:
Reading from any of the files will return -EINVAL if you are not starting
the read on an 8-byte boundary (e.g., if you sought an odd number of bytes
into the file), or if the size of the read is not a multiple of 8 bytes.
+
+Before Linux 3.11 pagemap bits 55-60 were used for "page-shift" (which is
+always 12 on most architectures). Since Linux 3.11 their meaning changes
+after the first clear of soft-dirty bits. Since Linux 4.2 they are used for
+flags unconditionally.
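
A hedged userspace sketch in C (not from this patch) that decodes one pagemap entry according to the bit layout listed above, including the new exclusively-mapped bit 56; on 4.2+ the PFN field reads back as zero without CAP_SYS_ADMIN:

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        unsigned long vaddr = (unsigned long)&vaddr;  /* any mapped address */
        long page_size = sysconf(_SC_PAGESIZE);
        uint64_t entry;
        int fd = open("/proc/self/pagemap", O_RDONLY);

        /* One 64-bit entry per virtual page, indexed by vaddr / page_size. */
        if (fd < 0 || pread(fd, &entry, sizeof(entry),
                            (vaddr / page_size) * 8) != sizeof(entry))
            return 1;

        printf("present=%d swapped=%d file/shared-anon=%d exclusive=%d "
               "soft-dirty=%d pfn=0x%llx\n",
               (int)(entry >> 63) & 1,   /* bit 63 */
               (int)(entry >> 62) & 1,   /* bit 62 */
               (int)(entry >> 61) & 1,   /* bit 61 */
               (int)(entry >> 56) & 1,   /* bit 56 */
               (int)(entry >> 55) & 1,   /* bit 55 */
               (unsigned long long)(entry & ((1ULL << 55) - 1)));  /* bits 0-54 */
        close(fd);
        return 0;
    }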
diff --git a/Documentation/vm/userfaultfd.txt b/Documentation/vm/userfaultfd.txt
new file mode 100644
index 000000000000..70a3c94d1941
--- /dev/null
+++ b/Documentation/vm/userfaultfd.txt
@@ -0,0 +1,144 @@
+= Userfaultfd =
+
+== Objective ==
+
+Userfaults allow the implementation of on-demand paging from userland
+and more generally they allow userland to take control of various
+memory page faults, something otherwise only the kernel code could do.
+
+For example, userfaults allow a proper and more optimal implementation
+of the PROT_NONE+SIGSEGV trick.
+
+== Design ==
+
+Userfaults are delivered and resolved through the userfaultfd syscall.
+
+The userfaultfd (aside from registering and unregistering virtual
+memory ranges) provides two primary functionalities:
+
+1) read/POLLIN protocol to notify a userland thread of the faults
+ happening
+
+2) various UFFDIO_* ioctls that can manage the virtual memory regions
+ registered in the userfaultfd that allows userland to efficiently
+ resolve the userfaults it receives via 1) or to manage the virtual
+ memory in the background
+
+The real advantage of userfaults compared to regular virtual memory
+management with mremap/mprotect is that userfaults in all their
+operations never involve heavyweight structures like vmas (in fact the
+userfaultfd runtime load never takes the mmap_sem for writing).
+
+Vmas are not suitable for page- (or hugepage) granular fault tracking
+when dealing with virtual address spaces that could span
+Terabytes. Too many vmas would be needed for that.
+
+The userfaultfd, once opened by invoking the syscall, can also be
+passed using unix domain sockets to a manager process, so the same
+manager process could handle the userfaults of a multitude of
+different processes without them being aware of what is going on
+(well of course unless they later try to use the userfaultfd
+themselves on the same region the manager is already tracking, which
+is a corner case that would currently return -EBUSY).
+
+== API ==
+
+When first opened the userfaultfd must be enabled invoking the
+UFFDIO_API ioctl specifying a uffdio_api.api value set to UFFD_API (or
+a later API version) which will specify the read/POLLIN protocol
+userland intends to speak on the UFFD and the uffdio_api.features
+userland requires. The UFFDIO_API ioctl if successful (i.e. if the
+requested uffdio_api.api is spoken also by the running kernel and the
+requested features are going to be enabled) will return into
+uffdio_api.features and uffdio_api.ioctls two 64bit bitmasks of
+respectively all the available features of the read(2) protocol and
+the generic ioctl available.
+
+Once the userfaultfd has been enabled the UFFDIO_REGISTER ioctl should
+be invoked (if present in the returned uffdio_api.ioctls bitmask) to
+register a memory range in the userfaultfd by setting the
+uffdio_register structure accordingly. The uffdio_register.mode
+bitmask will specify to the kernel which kind of faults to track for
+the range (UFFDIO_REGISTER_MODE_MISSING would track missing
+pages). The UFFDIO_REGISTER ioctl will return the
+uffdio_register.ioctls bitmask of ioctls that are suitable to resolve
+userfaults on the range registered. Not all ioctls will necessarily be
+supported for all memory types depending on the underlying virtual
+memory backend (anonymous memory vs tmpfs vs real filebacked
+mappings).
+
+Userland can use the uffdio_register.ioctls to manage the virtual
+address space in the background (to add or potentially also remove
+memory from the userfaultfd registered range). This means a userfault
+could be triggered just before userland maps the user-faulted page in
+the background.
+
+The primary ioctl to resolve userfaults is UFFDIO_COPY. That
+atomically copies a page into the userfault registered range and wakes
+up the blocked userfaults (unless uffdio_copy.mode &
+UFFDIO_COPY_MODE_DONTWAKE is set). Other ioctls work similarly to
+UFFDIO_COPY. They're atomic, in the sense that nothing can see a
+half-copied page, since it'll keep userfaulting until the copy has
+finished.
+
+== QEMU/KVM ==
+
+QEMU/KVM is using the userfaultfd syscall to implement postcopy live
+migration. Postcopy live migration is one form of memory
+externalization consisting of a virtual machine running with part or
+all of its memory residing on a different node in the cloud. The
+userfaultfd abstraction is generic enough that not a single line of
+KVM kernel code had to be modified in order to add postcopy live
+migration to QEMU.
+
+Guest async page faults, FOLL_NOWAIT and all other GUP features work
+just fine in combination with userfaults. Userfaults trigger async
+page faults in the guest scheduler so those guest processes that
+aren't waiting for userfaults (i.e. network bound) can keep running in
+the guest vcpus.
+
+It is generally beneficial to run one pass of precopy live migration
+just before starting postcopy live migration, in order to avoid
+generating userfaults for readonly guest regions.
+
+The implementation of postcopy live migration currently uses one
+single bidirectional socket but in the future two different sockets
+will be used (to reduce the latency of the userfaults to the minimum
+possible without having to decrease /proc/sys/net/ipv4/tcp_wmem).
+
+The QEMU in the source node writes all pages that it knows are missing
+in the destination node, into the socket, and the migration thread of
+the QEMU running in the destination node runs UFFDIO_COPY|ZEROPAGE
+ioctls on the userfaultfd in order to map the received pages into the
+guest (UFFDIO_ZEROPAGE is used if the source page was a zero page).
+
+A different postcopy thread in the destination node listens with
+poll() to the userfaultfd in parallel. When a POLLIN event is
+generated after a userfault triggers, the postcopy thread reads from
+the userfaultfd and receives the fault address (or -EAGAIN in case the
+userfault was already resolved and woken by a UFFDIO_COPY|ZEROPAGE run
+by the parallel QEMU migration thread).
+
+After the QEMU postcopy thread (running in the destination node) gets
+the userfault address it writes the information about the missing page
+into the socket. The QEMU source node receives the information and
+roughly "seeks" to that page address and continues sending all
+remaining missing pages from that new page offset. Soon after that
+(just the time to flush the tcp_wmem queue through the network) the
+migration thread in the QEMU running in the destination node will
+receive the page that triggered the userfault and it'll map it as
+usual with the UFFDIO_COPY|ZEROPAGE (without actually knowing if it
+was spontaneously sent by the source or if it was an urgent page
+requested through a userfault).
+
+By the time the userfaults start, the QEMU in the destination node
+doesn't need to keep any per-page state bitmap relative to the live
+migration around and a single per-page bitmap has to be maintained in
+the QEMU running in the source node to know which pages are still
+missing in the destination node. The bitmap in the source node is
+checked to find which missing pages to send in round robin and we seek
+over it when receiving incoming userfaults. After sending each page of
+course the bitmap is updated accordingly. It's also useful to avoid
+sending the same page twice (in case the userfault is read by the
+postcopy thread just before UFFDIO_COPY|ZEROPAGE runs in the migration
+thread).
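
A compressed userspace sketch in C (not part of this patch) of the API flow described above: enable the API, register an address range for missing-page faults, then resolve one fault with UFFDIO_COPY. Error handling is minimal, and __NR_userfaultfd is assumed to be provided by uapi headers built with this series:

    #include <fcntl.h>
    #include <linux/userfaultfd.h>
    #include <sys/ioctl.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Open a userfaultfd and register [area, area + len) for missing pages. */
    static int uffd_register_missing(void *area, unsigned long len)
    {
        struct uffdio_api api = { .api = UFFD_API };
        struct uffdio_register reg = {
            .range = { .start = (unsigned long)area, .len = len },
            .mode  = UFFDIO_REGISTER_MODE_MISSING,
        };
        int uffd = syscall(__NR_userfaultfd, O_CLOEXEC);

        if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api) ||
            ioctl(uffd, UFFDIO_REGISTER, &reg))
            return -1;
        return uffd;  /* poll()/read() this fd to receive userfaults */
    }

    /* Resolve one fault by atomically copying a prepared page into place. */
    static int uffd_resolve_one(int uffd, void *src_page, unsigned long page_size)
    {
        struct uffd_msg msg;
        struct uffdio_copy copy;

        if (read(uffd, &msg, sizeof(msg)) != sizeof(msg) ||
            msg.event != UFFD_EVENT_PAGEFAULT)
            return -1;

        copy.dst  = msg.arg.pagefault.address & ~(__u64)(page_size - 1);
        copy.src  = (unsigned long)src_page;
        copy.len  = page_size;
        copy.mode = 0;  /* no DONTWAKE: wake up the blocked faulting thread */
        return ioctl(uffd, UFFDIO_COPY, &copy);
    }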
diff --git a/Documentation/w1/masters/omap-hdq b/Documentation/w1/masters/omap-hdq
index 884dc284b215..234522709a5f 100644
--- a/Documentation/w1/masters/omap-hdq
+++ b/Documentation/w1/masters/omap-hdq
@@ -44,3 +44,9 @@ e.g:
insmod omap_hdq.ko W1_ID=2
inamod w1_bq27000.ko F_ID=2
+The driver also supports 1-wire mode. In this mode, there is no need to
+pass the slave ID as a parameter. The driver will auto-detect slaves connected
+to the bus using the SEARCH_ROM procedure. 1-wire mode can be selected by
+setting the "ti,mode" property to "1w" in DT (see
+Documentation/devicetree/bindings/w1/omap-hdq.txt for more details).
+By default the driver is in HDQ mode.
diff --git a/MAINTAINERS b/MAINTAINERS
index bd32f3a63826..1f25f3c168f4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8190,10 +8190,9 @@ F: drivers/hwmon/pmbus/
F: include/linux/i2c/pmbus.h
PMC SIERRA MaxRAID DRIVER
-M: Anil Ravindranath <anil_ravindranath@pmc-sierra.com>
L: linux-scsi@vger.kernel.org
W: http://www.pmc-sierra.com/
-S: Supported
+S: Orphan
F: drivers/scsi/pmcraid.*
PMC SIERRA PM8001 DRIVER
diff --git a/arch/Kconfig b/arch/Kconfig
index 8f3564930580..4e949e58b192 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -2,6 +2,9 @@
# General architecture dependent options
#
+config KEXEC_CORE
+ bool
+
config OPROFILE
tristate "OProfile system profiling"
depends on PROFILING
diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
index 0086b472bc2b..d828beb5e69b 100644
--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -37,6 +37,9 @@
#define MCL_CURRENT 8192 /* lock all currently mapped pages */
#define MCL_FUTURE 16384 /* lock all additions to address space */
+#define MCL_ONFAULT 32768 /* lock all pages that are faulted in */
+
+#define MLOCK_ONFAULT 0x01 /* Lock pages in range after they are faulted in, do not prefault */
#define MADV_NORMAL 0 /* no further special treatment */
#define MADV_RANDOM 1 /* expect random page references */
@@ -44,6 +47,7 @@
#define MADV_WILLNEED 3 /* will need these pages */
#define MADV_SPACEAVAIL 5 /* ensure resources are available */
#define MADV_DONTNEED 6 /* don't need these pages */
+#define MADV_FREE 7 /* free pages only if memory pressure */
/* common/generic parameters */
#define MADV_REMOVE 9 /* remove these pages & resources */
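
A hedged userspace sketch in C (not part of this patch) of the lock-on-fault semantics these new flags enable; __NR_mlock2 is assumed to come from headers generated with this series, and MLOCK_ONFAULT is 0x01 in the per-arch headers shown here:

    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    #ifndef MLOCK_ONFAULT
    #define MLOCK_ONFAULT 0x01  /* matches the value added above */
    #endif

    /*
     * Lock the pages of [addr, addr + len) as they are faulted in, without
     * pre-faulting the whole range the way plain mlock() would.  There is
     * no libc wrapper at this point, so go through syscall(2).
     */
    static int mlock_onfault(void *addr, size_t len)
    {
        return syscall(__NR_mlock2, addr, len, MLOCK_ONFAULT);
    }

The equivalent whole-address-space request is mlockall(MCL_CURRENT | MCL_ONFAULT).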
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 681e49f50403..74993509dc3b 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -2005,6 +2005,7 @@ config KEXEC
bool "Kexec system call (EXPERIMENTAL)"
depends on (!SMP || PM_SLEEP_SMP)
depends on !CPU_V7M
+ select KEXEC_CORE
help
kexec is a system call that implements the ability to shutdown your
current kernel, and to start another kernel. It is like a reboot
diff --git a/arch/arm/boot/compressed/decompress.c b/arch/arm/boot/compressed/decompress.c
index bd245d34952d..a0765e7ed6c7 100644
--- a/arch/arm/boot/compressed/decompress.c
+++ b/arch/arm/boot/compressed/decompress.c
@@ -57,5 +57,5 @@ extern char * strstr(const char * s1, const char *s2);
int do_decompress(u8 *input, int len, u8 *output, void (*error)(char *x))
{
- return decompress(input, len, NULL, NULL, output, NULL, error);
+ return __decompress(input, len, NULL, NULL, output, 0, NULL, error);
}
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index a745a2a53853..6d6012a320b2 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -249,6 +249,7 @@ PMD_BIT_FUNC(mkold, &= ~PMD_SECT_AF);
PMD_BIT_FUNC(mksplitting, |= L_PMD_SECT_SPLITTING);
PMD_BIT_FUNC(mkwrite, &= ~L_PMD_SECT_RDONLY);
PMD_BIT_FUNC(mkdirty, |= L_PMD_SECT_DIRTY);
+PMD_BIT_FUNC(mkclean, &= ~L_PMD_SECT_DIRTY);
PMD_BIT_FUNC(mkyoung, |= PMD_SECT_AF);
#define pmd_mkhuge(pmd) (__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT))
diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c
index 265ffeb2037e..80e277cfcc8b 100644
--- a/arch/arm/mach-at91/pm.c
+++ b/arch/arm/mach-at91/pm.c
@@ -369,7 +369,7 @@ static void __init at91_pm_sram_init(void)
return;
}
- sram_pool = gen_pool_get(&pdev->dev);
+ sram_pool = gen_pool_get(&pdev->dev, NULL);
if (!sram_pool) {
pr_warn("%s: sram pool unavailable!\n", __func__);
return;
diff --git a/arch/arm/mach-imx/pm-imx5.c b/arch/arm/mach-imx/pm-imx5.c
index 1885676c23c0..532d4b08276d 100644
--- a/arch/arm/mach-imx/pm-imx5.c
+++ b/arch/arm/mach-imx/pm-imx5.c
@@ -297,7 +297,7 @@ static int __init imx_suspend_alloc_ocram(
goto put_node;
}
- ocram_pool = gen_pool_get(&pdev->dev);
+ ocram_pool = gen_pool_get(&pdev->dev, NULL);
if (!ocram_pool) {
pr_warn("%s: ocram pool unavailable!\n", __func__);
ret = -ENODEV;
diff --git a/arch/arm/mach-imx/pm-imx6.c b/arch/arm/mach-imx/pm-imx6.c
index 93ecf559d06d..8ff8fc0b261c 100644
--- a/arch/arm/mach-imx/pm-imx6.c
+++ b/arch/arm/mach-imx/pm-imx6.c
@@ -451,7 +451,7 @@ static int __init imx6q_suspend_init(const struct imx6_pm_socdata *socdata)
goto put_node;
}
- ocram_pool = gen_pool_get(&pdev->dev);
+ ocram_pool = gen_pool_get(&pdev->dev, NULL);
if (!ocram_pool) {
pr_warn("%s: ocram pool unavailable!\n", __func__);
ret = -ENODEV;
diff --git a/arch/arm/mach-socfpga/pm.c b/arch/arm/mach-socfpga/pm.c
index 6a4199f2bffb..c378ab0c2431 100644
--- a/arch/arm/mach-socfpga/pm.c
+++ b/arch/arm/mach-socfpga/pm.c
@@ -56,7 +56,7 @@ static int socfpga_setup_ocram_self_refresh(void)
goto put_node;
}
- ocram_pool = gen_pool_get(&pdev->dev);
+ ocram_pool = gen_pool_get(&pdev->dev, NULL);
if (!ocram_pool) {
pr_warn("%s: ocram pool unavailable!\n", __func__);
ret = -ENODEV;
diff --git a/arch/arm/mach-vexpress/spc.c b/arch/arm/mach-vexpress/spc.c
index 5766ce2be32b..b9e953824775 100644
--- a/arch/arm/mach-vexpress/spc.c
+++ b/arch/arm/mach-vexpress/spc.c
@@ -577,7 +577,7 @@ static int __init ve_spc_clk_init(void)
pr_warn("failed to register cpu%d clock\n", cpu);
continue;
}
- if (clk_register_clkdev(clk, NULL, dev_name(cpu_dev))) {
+ if (clk_register_clkdev(clk, NULL, "%s", dev_name(cpu_dev))) {
pr_warn("failed to register cpu%d clock lookup\n", cpu);
continue;
}
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 6900b2d95371..d374191fd413 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -325,10 +325,12 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
#define pmd_young(pmd) pte_young(pmd_pte(pmd))
+#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd)))
#define pmd_mksplitting(pmd) pte_pmd(pte_mkspecial(pmd_pte(pmd)))
#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd)))
#define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd)))
+#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd)))
#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd)))
#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd)))
#define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_TYPE_MASK))
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index 888478881243..6bab21f84a9f 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -339,6 +339,67 @@ static void __init request_standard_resources(void)
}
}
+#ifdef CONFIG_BLK_DEV_INITRD
+/*
+ * Relocate initrd if it is not completely within the linear mapping.
+ * This would be the case if mem= cuts out all or part of it.
+ */
+static void __init relocate_initrd(void)
+{
+ phys_addr_t orig_start = __virt_to_phys(initrd_start);
+ phys_addr_t orig_end = __virt_to_phys(initrd_end);
+ phys_addr_t ram_end = memblock_end_of_DRAM();
+ phys_addr_t new_start;
+ unsigned long size, to_free = 0;
+ void *dest;
+
+ if (orig_end <= ram_end)
+ return;
+
+ /*
+ * Any of the original initrd which overlaps the linear map should
+ * be freed after relocating.
+ */
+ if (orig_start < ram_end)
+ to_free = ram_end - orig_start;
+
+ size = orig_end - orig_start;
+
+ /* initrd needs to be relocated completely inside linear mapping */
+ new_start = memblock_find_in_range(0, PFN_PHYS(max_pfn),
+ size, PAGE_SIZE);
+ if (!new_start)
+ panic("Cannot relocate initrd of size %ld\n", size);
+ memblock_reserve(new_start, size);
+
+ initrd_start = __phys_to_virt(new_start);
+ initrd_end = initrd_start + size;
+
+ pr_info("Moving initrd from [%llx-%llx] to [%llx-%llx]\n",
+ orig_start, orig_start + size - 1,
+ new_start, new_start + size - 1);
+
+ dest = (void *)initrd_start;
+
+ if (to_free) {
+ memcpy(dest, (void *)__phys_to_virt(orig_start), to_free);
+ dest += to_free;
+ }
+
+ copy_from_early_mem(dest, orig_start + to_free, size - to_free);
+
+ if (to_free) {
+ pr_info("Freeing original RAMDISK from [%llx-%llx]\n",
+ orig_start, orig_start + to_free - 1);
+ memblock_free(orig_start, to_free);
+ }
+}
+#else
+static inline void __init relocate_initrd(void)
+{
+}
+#endif
+
u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID };
void __init setup_arch(char **cmdline_p)
@@ -372,6 +433,7 @@ void __init setup_arch(char **cmdline_p)
acpi_boot_table_init();
paging_init();
+ relocate_initrd();
request_standard_resources();
early_ioremap_reset();
diff --git a/arch/h8300/boot/compressed/misc.c b/arch/h8300/boot/compressed/misc.c
index 704274127c07..c4f2cfcb117b 100644
--- a/arch/h8300/boot/compressed/misc.c
+++ b/arch/h8300/boot/compressed/misc.c
@@ -70,5 +70,5 @@ void decompress_kernel(void)
free_mem_ptr = (unsigned long)&_end;
free_mem_end_ptr = free_mem_ptr + HEAP_SIZE;
- decompress(input_data, input_len, NULL, NULL, output, NULL, error);
+ __decompress(input_data, input_len, NULL, NULL, output, 0, NULL, error);
}
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 42a91a7aa2b0..eb0249e37981 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -518,6 +518,7 @@ source "drivers/sn/Kconfig"
config KEXEC
bool "kexec system call"
depends on !IA64_HP_SIM && (!SMP || HOTPLUG_CPU)
+ select KEXEC_CORE
help
kexec is a system call that implements the ability to shutdown your
current kernel, and to start another kernel. It is like a reboot
diff --git a/arch/m32r/boot/compressed/misc.c b/arch/m32r/boot/compressed/misc.c
index 28a09529f206..3a7692745868 100644
--- a/arch/m32r/boot/compressed/misc.c
+++ b/arch/m32r/boot/compressed/misc.c
@@ -86,6 +86,7 @@ decompress_kernel(int mmu_on, unsigned char *zimage_data,
free_mem_end_ptr = free_mem_ptr + BOOT_HEAP_SIZE;
puts("\nDecompressing Linux... ");
- decompress(input_data, input_len, NULL, NULL, output_data, NULL, error);
+ __decompress(input_data, input_len, NULL, NULL, output_data, 0,
+ NULL, error);
puts("done.\nBooting the kernel.\n");
}
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 2dd8f63bfbbb..498b567f007b 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -95,6 +95,7 @@ config MMU_SUN3
config KEXEC
bool "kexec system call"
depends on M68KCLASSIC
+ select KEXEC_CORE
help
kexec is a system call that implements the ability to shutdown your
current kernel, and to start another kernel. It is like a reboot
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 9c90ce5fcd84..03e9180320b8 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -2595,6 +2595,7 @@ source "kernel/Kconfig.preempt"
config KEXEC
bool "Kexec system call"
+ select KEXEC_CORE
help
kexec is a system call that implements the ability to shutdown your
current kernel, and to start another kernel. It is like a reboot
diff --git a/arch/mips/boot/compressed/decompress.c b/arch/mips/boot/compressed/decompress.c
index 54831069a206..080cd53bac36 100644
--- a/arch/mips/boot/compressed/decompress.c
+++ b/arch/mips/boot/compressed/decompress.c
@@ -111,8 +111,8 @@ void decompress_kernel(unsigned long boot_heap_start)
puts("\n");
/* Decompress the kernel with the appropriate algorithm */
- decompress((char *)zimage_start, zimage_size, 0, 0,
- (void *)VMLINUX_LOAD_ADDRESS_ULL, 0, error);
+ __decompress((char *)zimage_start, zimage_size, 0, 0,
+ (void *)VMLINUX_LOAD_ADDRESS_ULL, 0, 0, error);
/* FIXME: should we flush cache here? */
puts("Now, booting the kernel...\n");
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index cfcb876cae6b..a6f8daff8e3b 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -61,12 +61,19 @@
*/
#define MCL_CURRENT 1 /* lock all current mappings */
#define MCL_FUTURE 2 /* lock all future mappings */
+#define MCL_ONFAULT 4 /* lock all pages that are faulted in */
+
+/*
+ * Flags for mlock
+ */
+#define MLOCK_ONFAULT 0x01 /* Lock pages in range after they are faulted in, do not prefault */
#define MADV_NORMAL 0 /* no further special treatment */
#define MADV_RANDOM 1 /* expect random page references */
#define MADV_SEQUENTIAL 2 /* expect sequential page references */
#define MADV_WILLNEED 3 /* will need these pages */
#define MADV_DONTNEED 4 /* don't need these pages */
+#define MADV_FREE 5 /* free pages only if memory pressure */
/* common parameters: try to keep these consistent across architectures */
#define MADV_REMOVE 9 /* remove these pages & resources */
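
The mman.h hunks in this merge (mips here, then parisc, powerpc, sparc, tile and xtensa) expose the MLOCK_ONFAULT and MCL_ONFAULT lock-on-fault flags backing the mlock2() syscall wired up further down. A minimal user-space sketch of exercising them, assuming the freshly installed uapi headers provide __NR_mlock2 and the flag values; there is no libc wrapper at this point:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef MLOCK_ONFAULT
#define MLOCK_ONFAULT	0x01	/* value from the uapi hunks above */
#endif
#ifndef MCL_ONFAULT
#define MCL_ONFAULT	4	/* 0x8000 on powerpc/sparc, see those hunks */
#endif

int main(void)
{
	size_t len = 16 * sysconf(_SC_PAGESIZE);
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;

	/* Lock pages only as they are faulted in; nothing is prefaulted. */
	if (syscall(__NR_mlock2, buf, len, MLOCK_ONFAULT))
		perror("mlock2");

	/* Or apply lock-on-fault to every current and future mapping. */
	if (mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT))
		perror("mlockall");

	memset(buf, 0, len);	/* pages get locked as this faults them in */
	munmap(buf, len);
	return 0;
}
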
diff --git a/arch/mips/include/uapi/asm/unistd.h b/arch/mips/include/uapi/asm/unistd.h
index c03088f9f514..d0bdfaa13ee9 100644
--- a/arch/mips/include/uapi/asm/unistd.h
+++ b/arch/mips/include/uapi/asm/unistd.h
@@ -377,16 +377,17 @@
#define __NR_memfd_create (__NR_Linux + 354)
#define __NR_bpf (__NR_Linux + 355)
#define __NR_execveat (__NR_Linux + 356)
+#define __NR_mlock2 (__NR_Linux + 357)
/*
* Offset of the last Linux o32 flavoured syscall
*/
-#define __NR_Linux_syscalls 356
+#define __NR_Linux_syscalls 357
#endif /* _MIPS_SIM == _MIPS_SIM_ABI32 */
#define __NR_O32_Linux 4000
-#define __NR_O32_Linux_syscalls 356
+#define __NR_O32_Linux_syscalls 357
#if _MIPS_SIM == _MIPS_SIM_ABI64
@@ -711,16 +712,17 @@
#define __NR_memfd_create (__NR_Linux + 314)
#define __NR_bpf (__NR_Linux + 315)
#define __NR_execveat (__NR_Linux + 316)
+#define __NR_mlock2 (__NR_Linux + 317)
/*
* Offset of the last Linux 64-bit flavoured syscall
*/
-#define __NR_Linux_syscalls 316
+#define __NR_Linux_syscalls 317
#endif /* _MIPS_SIM == _MIPS_SIM_ABI64 */
#define __NR_64_Linux 5000
-#define __NR_64_Linux_syscalls 316
+#define __NR_64_Linux_syscalls 317
#if _MIPS_SIM == _MIPS_SIM_NABI32
@@ -1049,15 +1051,16 @@
#define __NR_memfd_create (__NR_Linux + 318)
#define __NR_bpf (__NR_Linux + 319)
#define __NR_execveat (__NR_Linux + 320)
+#define __NR_mlock2 (__NR_Linux + 321)
/*
* Offset of the last N32 flavoured syscall
*/
-#define __NR_Linux_syscalls 320
+#define __NR_Linux_syscalls 321
#endif /* _MIPS_SIM == _MIPS_SIM_NABI32 */
#define __NR_N32_Linux 6000
-#define __NR_N32_Linux_syscalls 320
+#define __NR_N32_Linux_syscalls 321
#endif /* _UAPI_ASM_UNISTD_H */
diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S
index 4cc13508d967..b0b377aa1d5d 100644
--- a/arch/mips/kernel/scall32-o32.S
+++ b/arch/mips/kernel/scall32-o32.S
@@ -599,3 +599,4 @@ EXPORT(sys_call_table)
PTR sys_memfd_create
PTR sys_bpf /* 4355 */
PTR sys_execveat
+ PTR sys_mlock2
diff --git a/arch/mips/kernel/scall64-64.S b/arch/mips/kernel/scall64-64.S
index a6f6b762c47a..f12eb03c0961 100644
--- a/arch/mips/kernel/scall64-64.S
+++ b/arch/mips/kernel/scall64-64.S
@@ -436,4 +436,5 @@ EXPORT(sys_call_table)
PTR sys_memfd_create
PTR sys_bpf /* 5315 */
PTR sys_execveat
+ PTR sys_mlock2
.size sys_call_table,.-sys_call_table
diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S
index 4b2010654c46..ecdd65a2b02c 100644
--- a/arch/mips/kernel/scall64-n32.S
+++ b/arch/mips/kernel/scall64-n32.S
@@ -429,4 +429,5 @@ EXPORT(sysn32_call_table)
PTR sys_memfd_create
PTR sys_bpf
PTR compat_sys_execveat /* 6320 */
+ PTR sys_mlock2
.size sysn32_call_table,.-sysn32_call_table
diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S
index f543ff4feef9..7a8b2dff16d3 100644
--- a/arch/mips/kernel/scall64-o32.S
+++ b/arch/mips/kernel/scall64-o32.S
@@ -584,4 +584,5 @@ EXPORT(sys32_call_table)
PTR sys_memfd_create
PTR sys_bpf /* 4355 */
PTR compat_sys_execveat
+ PTR sys_mlock2
.size sys32_call_table,.-sys32_call_table
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h
index 294d251ca7b2..84f6bd365c7c 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -31,6 +31,9 @@
#define MCL_CURRENT 1 /* lock all current mappings */
#define MCL_FUTURE 2 /* lock all future mappings */
+#define MCL_ONFAULT 4 /* lock all pages that are faulted in */
+
+#define MLOCK_ONFAULT 0x01 /* Lock pages in range after they are faulted in, do not prefault */
#define MADV_NORMAL 0 /* no further special treatment */
#define MADV_RANDOM 1 /* expect random page references */
@@ -40,6 +43,7 @@
#define MADV_SPACEAVAIL 5 /* ensure that resources are reserved */
#define MADV_VPS_PURGE 6 /* Purge pages from VM page cache */
#define MADV_VPS_INHERIT 7 /* Inherit parent's page size */
+#define MADV_FREE 8 /* free pages only if memory pressure */
/* common/generic parameters */
#define MADV_REMOVE 9 /* remove these pages & resources */
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index b447918b9e2c..9a7057ec2154 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -420,6 +420,7 @@ config PPC64_SUPPORTS_MEMORY_FAILURE
config KEXEC
bool "kexec system call"
depends on (PPC_BOOK3S || FSL_BOOKE || (44x && !SMP))
+ select KEXEC_CORE
help
kexec is a system call that implements the ability to shutdown your
current kernel, and to start another kernel. It is like a reboot
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index fa1dfb7f7b48..85e15c8067be 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -507,9 +507,11 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd)
#define pmd_pfn(pmd) pte_pfn(pmd_pte(pmd))
#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
#define pmd_young(pmd) pte_young(pmd_pte(pmd))
+#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd)))
#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd)))
#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd)))
+#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd)))
#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd)))
#define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd)))
diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h
index 71f2b3f02cf8..4d65499ee1c1 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -368,3 +368,4 @@ SYSCALL_SPU(memfd_create)
SYSCALL_SPU(bpf)
COMPAT_SYS(execveat)
PPC64ONLY(switch_endian)
+SYSCALL_SPU(userfaultfd)
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index f4f8b667d75b..4a055b6c2a64 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -12,7 +12,7 @@
#include <uapi/asm/unistd.h>
-#define __NR_syscalls 364
+#define __NR_syscalls 365
#define __NR__exit __NR_exit
#define NR_syscalls __NR_syscalls
diff --git a/arch/powerpc/include/uapi/asm/mman.h b/arch/powerpc/include/uapi/asm/mman.h
index 6ea26df0a73c..03c06ba7464f 100644
--- a/arch/powerpc/include/uapi/asm/mman.h
+++ b/arch/powerpc/include/uapi/asm/mman.h
@@ -22,6 +22,7 @@
#define MCL_CURRENT 0x2000 /* lock all currently mapped pages */
#define MCL_FUTURE 0x4000 /* lock all additions to address space */
+#define MCL_ONFAULT 0x8000 /* lock all pages that are faulted in */
#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */
#define MAP_NONBLOCK 0x10000 /* do not block on IO */
diff --git a/arch/powerpc/include/uapi/asm/unistd.h b/arch/powerpc/include/uapi/asm/unistd.h
index e4aa173dae62..6ad58d4c879b 100644
--- a/arch/powerpc/include/uapi/asm/unistd.h
+++ b/arch/powerpc/include/uapi/asm/unistd.h
@@ -386,5 +386,6 @@
#define __NR_bpf 361
#define __NR_execveat 362
#define __NR_switch_endian 363
+#define __NR_userfaultfd 364
#endif /* _UAPI_ASM_POWERPC_UNISTD_H_ */
diff --git a/arch/powerpc/platforms/512x/clock-commonclk.c b/arch/powerpc/platforms/512x/clock-commonclk.c
index c50ea76ba66c..4ff5539b34f9 100644
--- a/arch/powerpc/platforms/512x/clock-commonclk.c
+++ b/arch/powerpc/platforms/512x/clock-commonclk.c
@@ -993,9 +993,9 @@ static void mpc5121_clk_provide_migration_support(void)
clk = of_clk_get_by_name(np, clkname); \
if (IS_ERR(clk)) { \
clk = clkitem; \
- clk_register_clkdev(clk, clkname, devname); \
+ clk_register_clkdev(clk, clkname, "%s", devname); \
if (regnode) \
- clk_register_clkdev(clk, clkname, np->name); \
+ clk_register_clkdev(clk, clkname, "%s", np->name); \
did_register |= DID_REG_ ## regflag; \
pr_debug("clock alias name '%s' for dev '%s' pointer %p\n", \
clkname, devname, clk); \
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 4827870f7a6d..1d57000b1b24 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -48,6 +48,7 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC
config KEXEC
def_bool y
+ select KEXEC_CORE
config AUDIT_ARCH
def_bool y
diff --git a/arch/s390/boot/compressed/misc.c b/arch/s390/boot/compressed/misc.c
index 42506b371b74..4da604ebf6fd 100644
--- a/arch/s390/boot/compressed/misc.c
+++ b/arch/s390/boot/compressed/misc.c
@@ -167,7 +167,7 @@ unsigned long decompress_kernel(void)
#endif
puts("Uncompressing Linux... ");
- decompress(input_data, input_len, NULL, NULL, output, NULL, error);
+ __decompress(input_data, input_len, NULL, NULL, output, 0, NULL, error);
puts("Ok, booting the kernel.\n");
return (unsigned long) output;
}
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 50057fed819d..d514df7e04dd 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -602,6 +602,7 @@ source kernel/Kconfig.hz
config KEXEC
bool "kexec system call (EXPERIMENTAL)"
depends on SUPERH32 && MMU
+ select KEXEC_CORE
help
kexec is a system call that implements the ability to shutdown your
current kernel, and to start another kernel. It is like a reboot
diff --git a/arch/sh/boot/compressed/misc.c b/arch/sh/boot/compressed/misc.c
index 95470a472d2c..208a9753ab38 100644
--- a/arch/sh/boot/compressed/misc.c
+++ b/arch/sh/boot/compressed/misc.c
@@ -132,7 +132,7 @@ void decompress_kernel(void)
puts("Uncompressing Linux... ");
cache_control(CACHE_ENABLE);
- decompress(input_data, input_len, NULL, NULL, output, NULL, error);
+ __decompress(input_data, input_len, NULL, NULL, output, 0, NULL, error);
cache_control(CACHE_DISABLE);
puts("Ok, booting the kernel.\n");
}
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 2790b6a64157..17f486233db0 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -488,7 +488,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
int arch_add_memory(int nid, u64 start, u64 size)
{
pg_data_t *pgdat;
- unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long start_pfn = PFN_DOWN(start);
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
@@ -517,7 +517,7 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#ifdef CONFIG_MEMORY_HOTREMOVE
int arch_remove_memory(u64 start, u64 size)
{
- unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long start_pfn = PFN_DOWN(start);
unsigned long nr_pages = size >> PAGE_SHIFT;
struct zone *zone;
int ret;
diff --git a/arch/sh/mm/numa.c b/arch/sh/mm/numa.c
index bce52ba66206..05713d190247 100644
--- a/arch/sh/mm/numa.c
+++ b/arch/sh/mm/numa.c
@@ -33,8 +33,8 @@ void __init setup_bootmem_node(int nid, unsigned long start, unsigned long end)
/* Don't allow bogus node assignment */
BUG_ON(nid >= MAX_NUMNODES || nid <= 0);
- start_pfn = start >> PAGE_SHIFT;
- end_pfn = end >> PAGE_SHIFT;
+ start_pfn = PFN_DOWN(start);
+ end_pfn = PFN_DOWN(end);
pmb_bolt_mapping((unsigned long)__va(start), start, end - start,
PAGE_KERNEL);
diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h
index f06b36a00a3b..91b963a887b7 100644
--- a/arch/sparc/include/asm/pgtable_32.h
+++ b/arch/sparc/include/asm/pgtable_32.h
@@ -14,7 +14,7 @@
#include <asm-generic/4level-fixup.h>
#include <linux/spinlock.h>
-#include <linux/swap.h>
+#include <linux/mm_types.h>
#include <asm/types.h>
#include <asm/pgtsrmmu.h>
#include <asm/vaddrs.h>
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 131d36fcd07a..5833dc5ee7d7 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -717,6 +717,15 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd)
return __pmd(pte_val(pte));
}
+static inline pmd_t pmd_mkclean(pmd_t pmd)
+{
+ pte_t pte = __pte(pmd_val(pmd));
+
+ pte = pte_mkclean(pte);
+
+ return __pmd(pte_val(pte));
+}
+
static inline pmd_t pmd_mkyoung(pmd_t pmd)
{
pte_t pte = __pte(pmd_val(pmd));
diff --git a/arch/sparc/include/uapi/asm/mman.h b/arch/sparc/include/uapi/asm/mman.h
index 0b14df33cffa..9765896ecb2c 100644
--- a/arch/sparc/include/uapi/asm/mman.h
+++ b/arch/sparc/include/uapi/asm/mman.h
@@ -17,6 +17,7 @@
#define MCL_CURRENT 0x2000 /* lock all currently mapped pages */
#define MCL_FUTURE 0x4000 /* lock all additions to address space */
+#define MCL_ONFAULT 0x8000 /* lock all pages that are faulted in */
#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */
#define MAP_NONBLOCK 0x10000 /* do not block on IO */
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index 2ba12d761723..106c21bd7f44 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -205,6 +205,7 @@ source "kernel/Kconfig.hz"
config KEXEC
bool "kexec system call"
+ select KEXEC_CORE
---help---
kexec is a system call that implements the ability to shutdown your
current kernel, and to start another kernel. It is like a reboot
diff --git a/arch/tile/include/uapi/asm/mman.h b/arch/tile/include/uapi/asm/mman.h
index 81b8fc348d63..63ee13faf17d 100644
--- a/arch/tile/include/uapi/asm/mman.h
+++ b/arch/tile/include/uapi/asm/mman.h
@@ -36,6 +36,7 @@
*/
#define MCL_CURRENT 1 /* lock all current mappings */
#define MCL_FUTURE 2 /* lock all future mappings */
+#define MCL_ONFAULT 4 /* lock all pages that are faulted in */
#endif /* _ASM_TILE_MMAN_H */
diff --git a/arch/unicore32/boot/compressed/misc.c b/arch/unicore32/boot/compressed/misc.c
index 176d5bda3559..5c65dfee278c 100644
--- a/arch/unicore32/boot/compressed/misc.c
+++ b/arch/unicore32/boot/compressed/misc.c
@@ -119,8 +119,8 @@ unsigned long decompress_kernel(unsigned long output_start,
output_ptr = get_unaligned_le32(tmp);
arch_decomp_puts("Uncompressing Linux...");
- decompress(input_data, input_data_end - input_data, NULL, NULL,
- output_data, NULL, error);
+ __decompress(input_data, input_data_end - input_data, NULL, NULL,
+ output_data, 0, NULL, error);
arch_decomp_puts(" done, booting the kernel.\n");
return output_ptr;
}
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a15635758360..923f1d3581e9 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -41,6 +41,7 @@ config X86
select ARCH_USE_CMPXCHG_LOCKREF if X86_64
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
+ select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
select ARCH_WANT_FRAME_POINTERS
select ARCH_WANT_IPC_PARSE_VERSION if X86_32
@@ -1752,6 +1753,7 @@ source kernel/Kconfig.hz
config KEXEC
bool "kexec system call"
+ select KEXEC_CORE
---help---
kexec is a system call that implements the ability to shutdown your
current kernel, and to start another kernel. It is like a reboot
@@ -1768,8 +1770,8 @@ config KEXEC
config KEXEC_FILE
bool "kexec file based system call"
+ select KEXEC_CORE
select BUILD_BIN2C
- depends on KEXEC
depends on X86_64
depends on CRYPTO=y
depends on CRYPTO_SHA256=y
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index f63797942bb5..79dac1758e7c 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -448,7 +448,8 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
#endif
debug_putstr("\nDecompressing Linux... ");
- decompress(input_data, input_len, NULL, NULL, output, NULL, error);
+ __decompress(input_data, input_len, NULL, NULL, output, output_len,
+ NULL, error);
parse_elf(output);
/*
* 32-bit always performs relocations. 64-bit relocations are only
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 16ef02596db2..2d6b309c8e9a 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -414,7 +414,7 @@ xloadflags:
# define XLF23 0
#endif
-#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC)
+#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC_CORE)
# define XLF4 XLF_EFI_KEXEC
#else
# define XLF4 0
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 25e3cf1cd8fd..41e72a50c2ed 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -380,3 +380,5 @@
371 i386 recvfrom sys_recvfrom compat_sys_recvfrom
372 i386 recvmsg sys_recvmsg compat_sys_recvmsg
373 i386 shutdown sys_shutdown
+374 i386 userfaultfd sys_userfaultfd
+375 i386 mlock2 sys_mlock2
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 9ef32d5f1b19..23669007b85d 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -329,6 +329,8 @@
320 common kexec_file_load sys_kexec_file_load
321 common bpf sys_bpf
322 64 execveat stub_execveat
+323 common userfaultfd sys_userfaultfd
+324 common mlock2 sys_mlock2
#
# x32-specific system call numbers start at 512 to avoid cache impact
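
Both new syscalls land in the x86 tables here (userfaultfd also appears in the powerpc table earlier). A minimal sketch of the userfaultfd API handshake, assuming the uapi header from this series is installed and calling through syscall() since no libc wrapper exists yet; real fault handling would continue with UFFDIO_REGISTER and a read loop on the descriptor:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

int main(void)
{
	struct uffdio_api api;
	int uffd;

	/* O_CLOEXEC and O_NONBLOCK are the only flags accepted at creation. */
	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	if (uffd < 0) {
		perror("userfaultfd");
		return 1;
	}

	/* Handshake: negotiate the API version before any other ioctl. */
	memset(&api, 0, sizeof(api));
	api.api = UFFD_API;
	if (ioctl(uffd, UFFDIO_API, &api)) {
		perror("UFFDIO_API");
		return 1;
	}
	printf("supported userfaultfd ioctls: 0x%llx\n",
	       (unsigned long long)api.ioctls);

	close(uffd);
	return 0;
}
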
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index 32ce71375b21..b130d59406fb 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -29,7 +29,7 @@ extern void show_trace(struct task_struct *t, struct pt_regs *regs,
extern void __show_regs(struct pt_regs *regs, int all);
extern unsigned long oops_begin(void);
extern void oops_end(unsigned long, struct pt_regs *, int signr);
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
extern int in_crash_kexec;
#else
/* no crash dump is ever in progress if no crash kernel can be kexec'd */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 867da5bbb4a3..b964d54300e1 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -267,6 +267,11 @@ static inline pmd_t pmd_mkold(pmd_t pmd)
return pmd_clear_flags(pmd, _PAGE_ACCESSED);
}
+static inline pmd_t pmd_mkclean(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_DIRTY);
+}
+
static inline pmd_t pmd_wrprotect(pmd_t pmd)
{
return pmd_clear_flags(pmd, _PAGE_RW);
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index cd791948b286..6df2029405a3 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -261,6 +261,12 @@ static inline void reset_lazy_tlbstate(void)
#endif /* SMP */
+/* Not inlined due to inc_irq_stat not being defined yet */
+#define flush_tlb_local() { \
+ inc_irq_stat(irq_tlb_count); \
+ local_flush_tlb(); \
+}
+
#ifndef CONFIG_PARAVIRT
#define flush_tlb_others(mask, mm, start, end) \
native_flush_tlb_others(mask, mm, start, end)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 9ffdf25e5b86..b1b78ffe01d0 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -71,8 +71,8 @@ obj-$(CONFIG_LIVEPATCH) += livepatch.o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
obj-$(CONFIG_X86_TSC) += trace_clock.o
-obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
-obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
+obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o
+obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o
obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
obj-y += kprobes/
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 3f124d553c5a..cd9b6d0b10bf 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -12,7 +12,7 @@
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/export.h>
-#include <linux/watchdog.h>
+#include <linux/nmi.h>
#include <asm/cpufeature.h>
#include <asm/hardirq.h>
@@ -3627,7 +3627,10 @@ static __init int fixup_ht_bug(void)
return 0;
}
- watchdog_nmi_disable_all();
+ if (lockup_detector_suspend() != 0) {
+ pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n");
+ return 0;
+ }
x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
@@ -3635,7 +3638,7 @@ static __init int fixup_ht_bug(void)
x86_pmu.commit_scheduling = NULL;
x86_pmu.stop_scheduling = NULL;
- watchdog_nmi_enable_all();
+ lockup_detector_resume();
get_online_cpus();
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 49487b488061..2c7aafa70702 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -200,7 +200,7 @@ static void kvm_setup_secondary_clock(void)
* kind of shutdown from our side, we unregister the clock by writing anything
* that does not have the 'enable' bit set in the msr
*/
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
static void kvm_crash_shutdown(struct pt_regs *regs)
{
native_write_msr(msr_kvm_system_time, 0, 0);
@@ -259,7 +259,7 @@ void __init kvmclock_init(void)
x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
machine_ops.shutdown = kvm_shutdown;
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif
kvm_get_preset_lpj();
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 819ab3f9c9c7..22db575a2fec 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -337,6 +337,7 @@ void arch_crash_save_vmcoreinfo(void)
#endif
vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
kaslr_offset());
+ VMCOREINFO_PHYS_BASE(phys_base);
}
/* arch-dependent functionality related to kexec file-based syscall */
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 86db4bcd7ce5..02693dd9a079 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -673,7 +673,7 @@ struct machine_ops machine_ops = {
.emergency_restart = native_machine_emergency_restart,
.restart = native_machine_restart,
.halt = native_machine_halt,
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
.crash_shutdown = native_machine_crash_shutdown,
#endif
};
@@ -703,7 +703,7 @@ void machine_halt(void)
machine_ops.halt();
}
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
void machine_crash_shutdown(struct pt_regs *regs)
{
machine_ops.crash_shutdown(regs);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b143c2d04420..fdb7f2a2d328 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -317,15 +317,12 @@ static u64 __init get_ramdisk_size(void)
return ramdisk_size;
}
-#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
static void __init relocate_initrd(void)
{
/* Assume only end is not page aligned */
u64 ramdisk_image = get_ramdisk_image();
u64 ramdisk_size = get_ramdisk_size();
u64 area_size = PAGE_ALIGN(ramdisk_size);
- unsigned long slop, clen, mapaddr;
- char *p, *q;
/* We need to move the initrd down into directly mapped mem */
relocated_ramdisk = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
@@ -343,25 +340,8 @@ static void __init relocate_initrd(void)
printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
- q = (char *)initrd_start;
-
- /* Copy the initrd */
- while (ramdisk_size) {
- slop = ramdisk_image & ~PAGE_MASK;
- clen = ramdisk_size;
- if (clen > MAX_MAP_CHUNK-slop)
- clen = MAX_MAP_CHUNK-slop;
- mapaddr = ramdisk_image & PAGE_MASK;
- p = early_memremap(mapaddr, clen+slop);
- memcpy(q, p+slop, clen);
- early_memunmap(p, clen+slop);
- q += clen;
- ramdisk_image += clen;
- ramdisk_size -= clen;
- }
+ copy_from_early_mem((void *)initrd_start, ramdisk_image, ramdisk_size);
- ramdisk_image = get_ramdisk_image();
- ramdisk_size = get_ramdisk_size();
printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
" [mem %#010llx-%#010llx]\n",
ramdisk_image, ramdisk_image + ramdisk_size - 1,
@@ -498,7 +478,7 @@ static void __init memblock_x86_reserve_range_setup_data(void)
* --------- Crashkernel reservation ------------------------------
*/
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
/*
* Keep the crash kernel below this limit. On 32 bits earlier kernels
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 00bf300fd846..74e4bf11f562 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -364,7 +364,7 @@ INIT_PER_CPU(irq_stack_union);
#endif /* CONFIG_X86_32 */
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
#include <asm/kexec.h>
. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4a4eec30cc08..5477ab8c1c90 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1264,7 +1264,7 @@ static void vmcs_load(struct vmcs *vmcs)
vmcs, phys_addr);
}
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
/*
* This bitmap is used to indicate whether the vmclear
* operation is enabled on all cpus. All disabled by
@@ -1302,7 +1302,7 @@ static void crash_vmclear_local_loaded_vmcss(void)
#else
static inline void crash_enable_local_vmclear(int cpu) { }
static inline void crash_disable_local_vmclear(int cpu) { }
-#endif /* CONFIG_KEXEC */
+#endif /* CONFIG_KEXEC_CORE */
static void __loaded_vmcs_clear(void *arg)
{
@@ -10411,7 +10411,7 @@ static int __init vmx_init(void)
if (r)
return r;
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
rcu_assign_pointer(crash_vmclear_loaded_vmcss,
crash_vmclear_local_loaded_vmcss);
#endif
@@ -10421,7 +10421,7 @@ static int __init vmx_init(void)
static void __exit vmx_exit(void)
{
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
synchronize_rcu();
#endif
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 4053bb58bf92..c3b3f653ed0c 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -246,8 +246,10 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
bi->start = max(bi->start, low);
bi->end = min(bi->end, high);
- /* and there's no empty block */
- if (bi->start >= bi->end)
+ /* and there's no empty or non-existent block */
+ if (bi->start >= bi->end ||
+ !memblock_overlaps_region(&memblock.memory,
+ bi->start, bi->end - bi->start))
numa_remove_memblk_from(i--, mi);
}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 90b924acd982..8ddb5d0d66fb 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -140,6 +140,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
info.flush_end = end;
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
+ trace_tlb_flush(TLB_REMOTE_SEND_IPI, end - start);
if (is_uv_system()) {
unsigned int cpu;
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index e4308fe6afe8..1db84c0758b7 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -650,7 +650,7 @@ static void __init get_systab_virt_addr(efi_memory_desc_t *md)
static void __init save_runtime_map(void)
{
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
efi_memory_desc_t *md;
void *tmp, *p, *q = NULL;
int count = 0;
@@ -748,7 +748,7 @@ static void * __init efi_map_regions(int *count, int *pg_shift)
static void __init kexec_enter_virtual_mode(void)
{
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
efi_memory_desc_t *md;
void *p;
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c
index 020c101c255f..5c9f63fa6abf 100644
--- a/arch/x86/platform/uv/uv_nmi.c
+++ b/arch/x86/platform/uv/uv_nmi.c
@@ -492,7 +492,7 @@ static void uv_nmi_touch_watchdogs(void)
touch_nmi_watchdog();
}
-#if defined(CONFIG_KEXEC)
+#if defined(CONFIG_KEXEC_CORE)
static atomic_t uv_nmi_kexec_failed;
static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
{
@@ -519,13 +519,13 @@ static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
uv_nmi_sync_exit(0);
}
-#else /* !CONFIG_KEXEC */
+#else /* !CONFIG_KEXEC_CORE */
static inline void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
{
if (master)
pr_err("UV: NMI kdump: KEXEC not supported in this kernel\n");
}
-#endif /* !CONFIG_KEXEC */
+#endif /* !CONFIG_KEXEC_CORE */
#ifdef CONFIG_KGDB
#ifdef CONFIG_KGDB_KDB
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h
index 201aec0e0446..83c5150b06f9 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -74,12 +74,19 @@
*/
#define MCL_CURRENT 1 /* lock all current mappings */
#define MCL_FUTURE 2 /* lock all future mappings */
+#define MCL_ONFAULT 4 /* lock all pages that are faulted in */
+
+/*
+ * Flags for mlock
+ */
+#define MLOCK_ONFAULT 0x01 /* Lock pages in range after they are faulted in, do not prefault */
#define MADV_NORMAL 0 /* no further special treatment */
#define MADV_RANDOM 1 /* expect random page references */
#define MADV_SEQUENTIAL 2 /* expect sequential page references */
#define MADV_WILLNEED 3 /* will need these pages */
#define MADV_DONTNEED 4 /* don't need these pages */
+#define MADV_FREE 5 /* free pages only if memory pressure */
/* common parameters: try to keep these consistent across architectures */
#define MADV_REMOVE 9 /* remove these pages & resources */
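
mips, parisc and xtensa carry their own uapi mman.h, so each spells out MADV_FREE explicitly (5 here and on mips, 8 on parisc where values 5-7 are already taken). A small sketch of the intended use, assuming the constant is visible to user space; the hint is cancelled for any page the caller writes to afterwards:

#include <string.h>
#include <sys/mman.h>

#ifndef MADV_FREE
#define MADV_FREE	5	/* per-arch value; take it from <asm/mman.h> */
#endif

/*
 * Mark a scratch buffer as disposable: the kernel may reclaim the pages
 * lazily under memory pressure instead of swapping them out, and the
 * contents stay intact until that actually happens.
 */
static void release_scratch(void *buf, size_t len)
{
	madvise(buf, len, MADV_FREE);
}
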
diff --git a/block/genhd.c b/block/genhd.c
index 0c706f33a599..3213b66515f0 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -850,7 +850,7 @@ static int show_partition(struct seq_file *seqf, void *v)
char buf[BDEVNAME_SIZE];
/* Don't show non-partitionable removable devices or empty devices */
- if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
+ if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) &&
(sgp->flags & GENHD_FL_REMOVABLE)))
return 0;
if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
diff --git a/drivers/acpi/acpi_apd.c b/drivers/acpi/acpi_apd.c
index a450e7af877c..1a49c2f382b8 100644
--- a/drivers/acpi/acpi_apd.c
+++ b/drivers/acpi/acpi_apd.c
@@ -64,7 +64,8 @@ static int acpi_apd_setup(struct apd_private_data *pdata)
dev_name(&pdata->adev->dev),
NULL, CLK_IS_ROOT,
dev_desc->fixed_clk_rate);
- clk_register_clkdev(clk, NULL, dev_name(&pdata->adev->dev));
+ clk_register_clkdev(clk, NULL, "%s",
+ dev_name(&pdata->adev->dev));
pdata->clk = clk;
}
diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c
index 502454c24e69..b045d3eee3b2 100644
--- a/drivers/acpi/acpi_lpss.c
+++ b/drivers/acpi/acpi_lpss.c
@@ -315,7 +315,7 @@ out:
return PTR_ERR(clk);
pdata->clk = clk;
- clk_register_clkdev(clk, dev_desc->clk_con_id, devname);
+ clk_register_clkdev(clk, dev_desc->clk_con_id, "%s", devname);
return 0;
}
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 9c01f5bfa33f..9fa15bb9d118 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -388,7 +388,6 @@ static ssize_t comp_algorithm_store(struct device *dev,
static ssize_t compact_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
{
- unsigned long nr_migrated;
struct zram *zram = dev_to_zram(dev);
struct zram_meta *meta;
@@ -399,8 +398,7 @@ static ssize_t compact_store(struct device *dev,
}
meta = zram->meta;
- nr_migrated = zs_compact(meta->mem_pool);
- atomic64_add(nr_migrated, &zram->stats.num_migrated);
+ zs_compact(meta->mem_pool);
up_read(&zram->init_lock);
return len;
@@ -428,26 +426,31 @@ static ssize_t mm_stat_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct zram *zram = dev_to_zram(dev);
+ struct zs_pool_stats pool_stats;
u64 orig_size, mem_used = 0;
long max_used;
ssize_t ret;
+ memset(&pool_stats, 0x00, sizeof(struct zs_pool_stats));
+
down_read(&zram->init_lock);
- if (init_done(zram))
+ if (init_done(zram)) {
mem_used = zs_get_total_pages(zram->meta->mem_pool);
+ zs_pool_stats(zram->meta->mem_pool, &pool_stats);
+ }
orig_size = atomic64_read(&zram->stats.pages_stored);
max_used = atomic_long_read(&zram->stats.max_used_pages);
ret = scnprintf(buf, PAGE_SIZE,
- "%8llu %8llu %8llu %8lu %8ld %8llu %8llu\n",
+ "%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n",
orig_size << PAGE_SHIFT,
(u64)atomic64_read(&zram->stats.compr_data_size),
mem_used << PAGE_SHIFT,
zram->limit_pages << PAGE_SHIFT,
max_used << PAGE_SHIFT,
(u64)atomic64_read(&zram->stats.zero_pages),
- (u64)atomic64_read(&zram->stats.num_migrated));
+ pool_stats.pages_compacted);
up_read(&zram->init_lock);
return ret;
@@ -619,7 +622,7 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
uncmem = user_mem;
if (!uncmem) {
- pr_info("Unable to allocate temp memory\n");
+ pr_err("Unable to allocate temp memory\n");
ret = -ENOMEM;
goto out_cleanup;
}
@@ -716,7 +719,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
handle = zs_malloc(meta->mem_pool, clen);
if (!handle) {
- pr_info("Error allocating memory for compressed page: %u, size=%zu\n",
+ pr_err("Error allocating memory for compressed page: %u, size=%zu\n",
index, clen);
ret = -ENOMEM;
goto out;
@@ -1036,7 +1039,7 @@ static ssize_t disksize_store(struct device *dev,
comp = zcomp_create(zram->compressor, zram->max_comp_streams);
if (IS_ERR(comp)) {
- pr_info("Cannot initialise %s compressing backend\n",
+ pr_err("Cannot initialise %s compressing backend\n",
zram->compressor);
err = PTR_ERR(comp);
goto out_free_meta;
@@ -1214,7 +1217,7 @@ static int zram_add(void)
/* gendisk structure */
zram->disk = alloc_disk(1);
if (!zram->disk) {
- pr_warn("Error allocating disk structure for device %d\n",
+ pr_err("Error allocating disk structure for device %d\n",
device_id);
ret = -ENOMEM;
goto out_free_queue;
@@ -1263,7 +1266,8 @@ static int zram_add(void)
ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj,
&zram_disk_attr_group);
if (ret < 0) {
- pr_warn("Error creating sysfs group");
+ pr_err("Error creating sysfs group for device %d\n",
+ device_id);
goto out_free_disk;
}
strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor));
@@ -1403,13 +1407,13 @@ static int __init zram_init(void)
ret = class_register(&zram_control_class);
if (ret) {
- pr_warn("Unable to register zram-control class\n");
+ pr_err("Unable to register zram-control class\n");
return ret;
}
zram_major = register_blkdev(0, "zram");
if (zram_major <= 0) {
- pr_warn("Unable to get major number\n");
+ pr_err("Unable to get major number\n");
class_unregister(&zram_control_class);
return -EBUSY;
}
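
With zram's private num_migrated counter gone, the last column of mm_stat now reports pages_compacted taken from zs_pool_stats. A hedged sketch of reading it back from user space, assuming a configured /dev/zram0; the field order mirrors the scnprintf() in mm_stat_show() above:

#include <stdio.h>

int main(void)
{
	unsigned long long orig, compr, used, limit, max, zero, compacted;
	FILE *f = fopen("/sys/block/zram0/mm_stat", "r");

	if (!f)
		return 1;
	/* Field order matches the scnprintf() in mm_stat_show(). */
	if (fscanf(f, "%llu %llu %llu %llu %llu %llu %llu",
		   &orig, &compr, &used, &limit, &max, &zero,
		   &compacted) == 7)
		printf("pages compacted so far: %llu\n", compacted);
	fclose(f);
	return 0;
}
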
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 6dbe2df506bf..8e92339686d7 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -78,7 +78,6 @@ struct zram_stats {
atomic64_t compr_data_size; /* compressed size of pages stored */
atomic64_t num_reads; /* failed + successful */
atomic64_t num_writes; /* --do-- */
- atomic64_t num_migrated; /* no. of migrated object */
atomic64_t failed_reads; /* can happen when memory is too low */
atomic64_t failed_writes; /* can happen when memory is too low */
atomic64_t invalid_io; /* non-page-aligned I/O requests */
diff --git a/drivers/clk/clk-mb86s7x.c b/drivers/clk/clk-mb86s7x.c
index f39c25a22f43..1e2d8fca3daf 100644
--- a/drivers/clk/clk-mb86s7x.c
+++ b/drivers/clk/clk-mb86s7x.c
@@ -370,7 +370,7 @@ static int mb86s7x_clclk_of_init(void)
pr_err("failed to register cpu%d clock\n", cpu);
continue;
}
- if (clk_register_clkdev(clk, NULL, dev_name(cpu_dev))) {
+ if (clk_register_clkdev(clk, NULL, "%s", dev_name(cpu_dev))) {
pr_err("failed to register cpu%d clock lookup\n", cpu);
continue;
}
diff --git a/drivers/clk/clk-moxart.c b/drivers/clk/clk-moxart.c
index f37f719643ec..2d28d5c9b856 100644
--- a/drivers/clk/clk-moxart.c
+++ b/drivers/clk/clk-moxart.c
@@ -48,7 +48,7 @@ static void __init moxart_of_pll_clk_init(struct device_node *node)
return;
}
- clk_register_clkdev(clk, NULL, name);
+ clk_register_clkdev(clk, NULL, "%s", name);
of_clk_add_provider(node, of_clk_src_simple_get, clk);
}
CLK_OF_DECLARE(moxart_pll_clock, "moxa,moxart-pll-clock",
@@ -91,7 +91,7 @@ static void __init moxart_of_apb_clk_init(struct device_node *node)
return;
}
- clk_register_clkdev(clk, NULL, name);
+ clk_register_clkdev(clk, NULL, "%s", name);
of_clk_add_provider(node, of_clk_src_simple_get, clk);
}
CLK_OF_DECLARE(moxart_apb_clock, "moxa,moxart-apb-clock",
diff --git a/drivers/clk/samsung/clk-pll.c b/drivers/clk/samsung/clk-pll.c
index b7dd396100d8..9a504fa76864 100644
--- a/drivers/clk/samsung/clk-pll.c
+++ b/drivers/clk/samsung/clk-pll.c
@@ -1298,7 +1298,7 @@ static void __init _samsung_clk_register_pll(struct samsung_clk_provider *ctx,
if (!pll_clk->alias)
return;
- ret = clk_register_clkdev(clk, pll_clk->alias, pll_clk->dev_name);
+ ret = clk_register_clkdev(clk, pll_clk->alias, "%s", pll_clk->dev_name);
if (ret)
pr_err("%s: failed to register lookup for %s : %d",
__func__, pll_clk->name, ret);
diff --git a/drivers/clk/samsung/clk.c b/drivers/clk/samsung/clk.c
index f38a6c49f744..19d37abdca79 100644
--- a/drivers/clk/samsung/clk.c
+++ b/drivers/clk/samsung/clk.c
@@ -127,7 +127,8 @@ void __init samsung_clk_register_alias(struct samsung_clk_provider *ctx,
continue;
}
- ret = clk_register_clkdev(clk, list->alias, list->dev_name);
+ ret = clk_register_clkdev(clk, list->alias, "%s",
+ list->dev_name);
if (ret)
pr_err("%s: failed to register lookup %s\n",
__func__, list->alias);
@@ -207,7 +208,7 @@ void __init samsung_clk_register_mux(struct samsung_clk_provider *ctx,
/* register a clock lookup only if a clock alias is specified */
if (list->alias) {
- ret = clk_register_clkdev(clk, list->alias,
+ ret = clk_register_clkdev(clk, list->alias, "%s",
list->dev_name);
if (ret)
pr_err("%s: failed to register lookup %s\n",
@@ -246,7 +247,7 @@ void __init samsung_clk_register_div(struct samsung_clk_provider *ctx,
/* register a clock lookup only if a clock alias is specified */
if (list->alias) {
- ret = clk_register_clkdev(clk, list->alias,
+ ret = clk_register_clkdev(clk, list->alias, "%s",
list->dev_name);
if (ret)
pr_err("%s: failed to register lookup %s\n",
@@ -275,7 +276,7 @@ void __init samsung_clk_register_gate(struct samsung_clk_provider *ctx,
/* register a clock lookup only if a clock alias is specified */
if (list->alias) {
- ret = clk_register_clkdev(clk, list->alias,
+ ret = clk_register_clkdev(clk, list->alias, "%s",
list->dev_name);
if (ret)
pr_err("%s: failed to register lookup %s\n",
diff --git a/drivers/clk/tegra/clk-tegra-pmc.c b/drivers/clk/tegra/clk-tegra-pmc.c
index 91377abfefa1..6154dc7edbab 100644
--- a/drivers/clk/tegra/clk-tegra-pmc.c
+++ b/drivers/clk/tegra/clk-tegra-pmc.c
@@ -109,7 +109,7 @@ void __init tegra_pmc_clk_init(void __iomem *pmc_base,
0, pmc_base + PMC_CLK_OUT_CNTRL,
data->gate_shift, 0, &clk_out_lock);
*dt_clk = clk;
- clk_register_clkdev(clk, data->dev_name, data->gate_name);
+ clk_register_clkdev(clk, data->dev_name, "%s", data->gate_name);
}
/* blink */
diff --git a/drivers/clk/tegra/clk.c b/drivers/clk/tegra/clk.c
index 2a3a4fe803d6..80db45615224 100644
--- a/drivers/clk/tegra/clk.c
+++ b/drivers/clk/tegra/clk.c
@@ -320,7 +320,7 @@ void __init tegra_register_devclks(struct tegra_devclk *dev_clks, int num)
for (i = 0; i < num; i++, dev_clks++)
clk_register_clkdev(clks[dev_clks->dt_id], dev_clks->con_id,
- dev_clks->dev_id);
+ "%s", dev_clks->dev_id);
for (i = 0; i < clk_num; i++) {
if (!IS_ERR_OR_NULL(clks[i]))
diff --git a/drivers/crypto/qat/qat_common/adf_transport_debug.c b/drivers/crypto/qat/qat_common/adf_transport_debug.c
index e41986967294..52340b9bb387 100644
--- a/drivers/crypto/qat/qat_common/adf_transport_debug.c
+++ b/drivers/crypto/qat/qat_common/adf_transport_debug.c
@@ -86,9 +86,7 @@ static int adf_ring_show(struct seq_file *sfile, void *v)
{
struct adf_etr_ring_data *ring = sfile->private;
struct adf_etr_bank_data *bank = ring->bank;
- uint32_t *msg = v;
void __iomem *csr = ring->bank->csr_addr;
- int i, x;
if (v == SEQ_START_TOKEN) {
int head, tail, empty;
@@ -113,18 +111,8 @@ static int adf_ring_show(struct seq_file *sfile, void *v)
seq_puts(sfile, "----------- Ring data ------------\n");
return 0;
}
- seq_printf(sfile, "%p:", msg);
- x = 0;
- i = 0;
- for (; i < (ADF_MSG_SIZE_TO_BYTES(ring->msg_size) >> 2); i++) {
- seq_printf(sfile, " %08X", *(msg + i));
- if ((ADF_MSG_SIZE_TO_BYTES(ring->msg_size) >> 2) != i + 1 &&
- (++x == 8)) {
- seq_printf(sfile, "\n%p:", msg + i + 1);
- x = 0;
- }
- }
- seq_puts(sfile, "\n");
+ seq_hex_dump(sfile, "", DUMP_PREFIX_ADDRESS, 32, 4,
+ v, ADF_MSG_SIZE_TO_BYTES(ring->msg_size), false);
return 0;
}
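
The same conversion repeats below for the parisc ccio and sba drivers, s390 zcrypt and wil6210: hand-rolled hex loops over seq_printf() collapse into one seq_hex_dump() call, whose prefix, row size, group size and ASCII-column arguments mirror print_hex_dump(). A minimal sketch of a seq_file show() callback using it; the surrounding struct is hypothetical:

#include <linux/printk.h>
#include <linux/seq_file.h>

struct my_ring {		/* hypothetical container for the example */
	void *data;
	size_t len;
};

static int ring_buf_show(struct seq_file *m, void *v)
{
	struct my_ring *ring = m->private;

	/* 32 bytes per row, grouped as 4-byte words, no ASCII column. */
	seq_hex_dump(m, "  ", DUMP_PREFIX_OFFSET, 32, 4,
		     ring->data, ring->len, false);
	return 0;
}
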
diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig
index 54071c148340..84533e02fbf8 100644
--- a/drivers/firmware/efi/Kconfig
+++ b/drivers/firmware/efi/Kconfig
@@ -43,7 +43,7 @@ config EFI_VARS_PSTORE_DEFAULT_DISABLE
config EFI_RUNTIME_MAP
bool "Export efi runtime maps to sysfs"
- depends on X86 && EFI && KEXEC
+ depends on X86 && EFI && KEXEC_CORE
default y
help
Export efi runtime memory maps to /sys/firmware/efi/runtime-map.
diff --git a/drivers/gpu/drm/drm_vm.c b/drivers/gpu/drm/drm_vm.c
index aab49ee4ed40..103a5f6b969a 100644
--- a/drivers/gpu/drm/drm_vm.c
+++ b/drivers/gpu/drm/drm_vm.c
@@ -699,9 +699,15 @@ int drm_vma_info(struct seq_file *m, void *data)
(void *)(unsigned long)virt_to_phys(high_memory));
list_for_each_entry(pt, &dev->vmalist, head) {
+ char lock_flag = '-';
+
vma = pt->vma;
if (!vma)
continue;
+ if (vma->vm_flags & VM_LOCKONFAULT)
+ lock_flag = 'f';
+ else if (vma->vm_flags & VM_LOCKED)
+ lock_flag = 'l';
seq_printf(m,
"\n%5d 0x%pK-0x%pK %c%c%c%c%c%c 0x%08lx000",
pt->pid,
@@ -710,7 +716,7 @@ int drm_vma_info(struct seq_file *m, void *data)
vma->vm_flags & VM_WRITE ? 'w' : '-',
vma->vm_flags & VM_EXEC ? 'x' : '-',
vma->vm_flags & VM_MAYSHARE ? 's' : 'p',
- vma->vm_flags & VM_LOCKED ? 'l' : '-',
+ lock_flag,
vma->vm_flags & VM_IO ? 'i' : '-',
vma->vm_pgoff);
diff --git a/drivers/media/platform/coda/coda-common.c b/drivers/media/platform/coda/coda-common.c
index 04310cd35bc1..a4654e0c104d 100644
--- a/drivers/media/platform/coda/coda-common.c
+++ b/drivers/media/platform/coda/coda-common.c
@@ -2181,7 +2181,7 @@ static int coda_probe(struct platform_device *pdev)
/* Get IRAM pool from device tree or platform data */
pool = of_gen_pool_get(np, "iram", 0);
if (!pool && pdata)
- pool = gen_pool_get(pdata->iram_dev);
+ pool = gen_pool_get(pdata->iram_dev, NULL);
if (!pool) {
dev_err(&pdev->dev, "iram pool not available\n");
return -ENOMEM;
diff --git a/drivers/misc/sram.c b/drivers/misc/sram.c
index 15c33cc34a80..431e1dd528bc 100644
--- a/drivers/misc/sram.c
+++ b/drivers/misc/sram.c
@@ -186,10 +186,10 @@ static int sram_probe(struct platform_device *pdev)
if (IS_ERR(sram->virt_base))
return PTR_ERR(sram->virt_base);
- sram->pool = devm_gen_pool_create(sram->dev,
- ilog2(SRAM_GRANULARITY), -1);
- if (!sram->pool)
- return -ENOMEM;
+ sram->pool = devm_gen_pool_create(sram->dev, ilog2(SRAM_GRANULARITY),
+ NUMA_NO_NODE, NULL);
+ if (IS_ERR(sram->pool))
+ return PTR_ERR(sram->pool);
ret = sram_reserve_regions(sram, res);
if (ret)
diff --git a/drivers/net/wireless/ath/wil6210/debugfs.c b/drivers/net/wireless/ath/wil6210/debugfs.c
index 613ca2b2527b..d1a1e160ef31 100644
--- a/drivers/net/wireless/ath/wil6210/debugfs.c
+++ b/drivers/net/wireless/ath/wil6210/debugfs.c
@@ -156,6 +156,12 @@ static const struct file_operations fops_vring = {
.llseek = seq_lseek,
};
+static void wil_seq_hexdump(struct seq_file *s, void *p, int len,
+ const char *prefix)
+{
+ seq_hex_dump(s, prefix, DUMP_PREFIX_NONE, 16, 1, p, len, false);
+}
+
static void wil_print_ring(struct seq_file *s, const char *prefix,
void __iomem *off)
{
@@ -212,8 +218,6 @@ static void wil_print_ring(struct seq_file *s, const char *prefix,
le16_to_cpu(hdr.seq), len,
le16_to_cpu(hdr.type), hdr.flags);
if (len <= MAX_MBOXITEM_SIZE) {
- int n = 0;
- char printbuf[16 * 3 + 2];
unsigned char databuf[MAX_MBOXITEM_SIZE];
void __iomem *src = wmi_buffer(wil, d.addr) +
sizeof(struct wil6210_mbox_hdr);
@@ -223,16 +227,7 @@ static void wil_print_ring(struct seq_file *s, const char *prefix,
* reading header
*/
wil_memcpy_fromio_32(databuf, src, len);
- while (n < len) {
- int l = min(len - n, 16);
-
- hex_dump_to_buffer(databuf + n, l,
- 16, 1, printbuf,
- sizeof(printbuf),
- false);
- seq_printf(s, " : %s\n", printbuf);
- n += l;
- }
+ wil_seq_hexdump(s, databuf, len, " : ");
}
} else {
seq_puts(s, "\n");
@@ -867,22 +862,6 @@ static const struct file_operations fops_wmi = {
.open = simple_open,
};
-static void wil_seq_hexdump(struct seq_file *s, void *p, int len,
- const char *prefix)
-{
- char printbuf[16 * 3 + 2];
- int i = 0;
-
- while (i < len) {
- int l = min(len - i, 16);
-
- hex_dump_to_buffer(p + i, l, 16, 1, printbuf,
- sizeof(printbuf), false);
- seq_printf(s, "%s%s\n", prefix, printbuf);
- i += l;
- }
-}
-
static void wil_seq_print_skb(struct seq_file *s, struct sk_buff *skb)
{
int i = 0;
diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c
index 02ff84fcfa61..957b42198328 100644
--- a/drivers/parisc/ccio-dma.c
+++ b/drivers/parisc/ccio-dma.c
@@ -1103,16 +1103,9 @@ static int ccio_proc_bitmap_info(struct seq_file *m, void *p)
struct ioc *ioc = ioc_list;
while (ioc != NULL) {
- u32 *res_ptr = (u32 *)ioc->res_map;
- int j;
-
- for (j = 0; j < (ioc->res_size / sizeof(u32)); j++) {
- if ((j & 7) == 0)
- seq_puts(m, "\n ");
- seq_printf(m, "%08x", *res_ptr);
- res_ptr++;
- }
- seq_puts(m, "\n\n");
+ seq_hex_dump(m, " ", DUMP_PREFIX_NONE, 32, 4, ioc->res_map,
+ ioc->res_size, false);
+ seq_putc(m, '\n');
ioc = ioc->next;
break; /* XXX - remove me */
}
diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c
index f1441e466c06..225049b492e5 100644
--- a/drivers/parisc/sba_iommu.c
+++ b/drivers/parisc/sba_iommu.c
@@ -1854,14 +1854,9 @@ sba_proc_bitmap_info(struct seq_file *m, void *p)
{
struct sba_device *sba_dev = sba_list;
struct ioc *ioc = &sba_dev->ioc[0]; /* FIXME: Multi-IOC support! */
- unsigned int *res_ptr = (unsigned int *)ioc->res_map;
- int i;
- for (i = 0; i < (ioc->res_size/sizeof(unsigned int)); ++i, ++res_ptr) {
- if ((i & 7) == 0)
- seq_puts(m, "\n ");
- seq_printf(m, " %08x", *res_ptr);
- }
+ seq_hex_dump(m, " ", DUMP_PREFIX_NONE, 32, 4, ioc->res_map,
+ ioc->res_size, false);
seq_putc(m, '\n');
return 0;
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index d37b5d19b9d9..0c72d2b689f1 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -465,7 +465,7 @@ static void pci_device_shutdown(struct device *dev)
if (drv && drv->shutdown)
drv->shutdown(pci_dev);
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
/*
* If this is a kexec reboot, turn off Bus Master bit on the
* device to tell it to not continue to do DMA. Don't touch
diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c
index 01bf1f5cf2e9..4eb45546a3aa 100644
--- a/drivers/s390/crypto/zcrypt_api.c
+++ b/drivers/s390/crypto/zcrypt_api.c
@@ -1206,16 +1206,8 @@ static void sprinthx(unsigned char *title, struct seq_file *m,
static void sprinthx4(unsigned char *title, struct seq_file *m,
unsigned int *array, unsigned int len)
{
- int r;
-
seq_printf(m, "\n%s\n", title);
- for (r = 0; r < len; r++) {
- if ((r % 8) == 0)
- seq_printf(m, " ");
- seq_printf(m, "%08X ", array[r]);
- if ((r % 8) == 7)
- seq_putc(m, '\n');
- }
+ seq_hex_dump(m, " ", DUMP_PREFIX_NONE, 32, 4, array, len, false);
seq_putc(m, '\n');
}
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index b5b427888b24..95b330a9ea98 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -353,9 +353,16 @@ static struct sysrq_key_op sysrq_term_op = {
static void moom_callback(struct work_struct *ignored)
{
+ const gfp_t gfp_mask = GFP_KERNEL;
+ struct oom_control oc = {
+ .zonelist = node_zonelist(first_memory_node, gfp_mask),
+ .nodemask = NULL,
+ .gfp_mask = gfp_mask,
+ .order = -1,
+ };
+
mutex_lock(&oom_lock);
- if (!out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL),
- GFP_KERNEL, 0, NULL, true))
+ if (!out_of_memory(&oc))
pr_info("OOM request ignored because killer is disabled\n");
mutex_unlock(&oom_lock);
}
diff --git a/drivers/video/console/Kconfig b/drivers/video/console/Kconfig
index e0dc0fee4ea6..4d115ffa626d 100644
--- a/drivers/video/console/Kconfig
+++ b/drivers/video/console/Kconfig
@@ -9,7 +9,7 @@ config VGA_CONSOLE
depends on !4xx && !8xx && !SPARC && !M68K && !PARISC && !FRV && \
!SUPERH && !BLACKFIN && !AVR32 && !MN10300 && !CRIS && \
(!ARM || ARCH_FOOTBRIDGE || ARCH_INTEGRATOR || ARCH_NETWINDER) && \
- !ARM64 && !MICROBLAZE
+ !ARM64 && !MICROBLAZE && !ARC
default y
help
Saying Y here will allow you to use Linux in text mode through a
diff --git a/drivers/w1/masters/omap_hdq.c b/drivers/w1/masters/omap_hdq.c
index e7d448963a24..0e2f43bccf1f 100644
--- a/drivers/w1/masters/omap_hdq.c
+++ b/drivers/w1/masters/omap_hdq.c
@@ -17,6 +17,7 @@
#include <linux/io.h>
#include <linux/sched.h>
#include <linux/pm_runtime.h>
+#include <linux/of.h>
#include "../w1.h"
#include "../w1_int.h"
@@ -27,21 +28,23 @@
#define OMAP_HDQ_TX_DATA 0x04
#define OMAP_HDQ_RX_DATA 0x08
#define OMAP_HDQ_CTRL_STATUS 0x0c
-#define OMAP_HDQ_CTRL_STATUS_INTERRUPTMASK (1<<6)
-#define OMAP_HDQ_CTRL_STATUS_CLOCKENABLE (1<<5)
-#define OMAP_HDQ_CTRL_STATUS_GO (1<<4)
-#define OMAP_HDQ_CTRL_STATUS_INITIALIZATION (1<<2)
-#define OMAP_HDQ_CTRL_STATUS_DIR (1<<1)
-#define OMAP_HDQ_CTRL_STATUS_MODE (1<<0)
+#define OMAP_HDQ_CTRL_STATUS_SINGLE BIT(7)
+#define OMAP_HDQ_CTRL_STATUS_INTERRUPTMASK BIT(6)
+#define OMAP_HDQ_CTRL_STATUS_CLOCKENABLE BIT(5)
+#define OMAP_HDQ_CTRL_STATUS_GO BIT(4)
+#define OMAP_HDQ_CTRL_STATUS_PRESENCE BIT(3)
+#define OMAP_HDQ_CTRL_STATUS_INITIALIZATION BIT(2)
+#define OMAP_HDQ_CTRL_STATUS_DIR BIT(1)
#define OMAP_HDQ_INT_STATUS 0x10
-#define OMAP_HDQ_INT_STATUS_TXCOMPLETE (1<<2)
-#define OMAP_HDQ_INT_STATUS_RXCOMPLETE (1<<1)
-#define OMAP_HDQ_INT_STATUS_TIMEOUT (1<<0)
+#define OMAP_HDQ_INT_STATUS_TXCOMPLETE BIT(2)
+#define OMAP_HDQ_INT_STATUS_RXCOMPLETE BIT(1)
+#define OMAP_HDQ_INT_STATUS_TIMEOUT BIT(0)
#define OMAP_HDQ_SYSCONFIG 0x14
-#define OMAP_HDQ_SYSCONFIG_SOFTRESET (1<<1)
-#define OMAP_HDQ_SYSCONFIG_AUTOIDLE (1<<0)
+#define OMAP_HDQ_SYSCONFIG_SOFTRESET BIT(1)
+#define OMAP_HDQ_SYSCONFIG_AUTOIDLE BIT(0)
+#define OMAP_HDQ_SYSCONFIG_NOIDLE 0x0
#define OMAP_HDQ_SYSSTATUS 0x18
-#define OMAP_HDQ_SYSSTATUS_RESETDONE (1<<0)
+#define OMAP_HDQ_SYSSTATUS_RESETDONE BIT(0)
#define OMAP_HDQ_FLAG_CLEAR 0
#define OMAP_HDQ_FLAG_SET 1
@@ -67,6 +70,10 @@ struct hdq_data {
* the data write or read.
*/
int init_trans;
+ int rrw;
+ /* mode: 0-HDQ 1-W1 */
+ int mode;
+
};
static int omap_hdq_probe(struct platform_device *pdev);
@@ -74,6 +81,7 @@ static int omap_hdq_remove(struct platform_device *pdev);
static const struct of_device_id omap_hdq_dt_ids[] = {
{ .compatible = "ti,omap3-1w" },
+ { .compatible = "ti,am4372-hdq" },
{}
};
MODULE_DEVICE_TABLE(of, omap_hdq_dt_ids);
@@ -90,15 +98,12 @@ static struct platform_driver omap_hdq_driver = {
static u8 omap_w1_read_byte(void *_hdq);
static void omap_w1_write_byte(void *_hdq, u8 byte);
static u8 omap_w1_reset_bus(void *_hdq);
-static void omap_w1_search_bus(void *_hdq, struct w1_master *master_dev,
- u8 search_type, w1_slave_found_callback slave_found);
static struct w1_bus_master omap_w1_master = {
.read_byte = omap_w1_read_byte,
.write_byte = omap_w1_write_byte,
.reset_bus = omap_w1_reset_bus,
- .search = omap_w1_search_bus,
};
/* HDQ register I/O routines */
@@ -122,6 +127,15 @@ static inline u8 hdq_reg_merge(struct hdq_data *hdq_data, u32 offset,
return new_val;
}
+static void hdq_disable_interrupt(struct hdq_data *hdq_data, u32 offset,
+ u32 mask)
+{
+ u32 ie;
+
+ ie = readl(hdq_data->hdq_base + offset);
+ writel(ie & mask, hdq_data->hdq_base + offset);
+}
+
/*
* Wait for one or more bits in flag change.
* HDQ_FLAG_SET: wait until any bit in the flag is set.
@@ -229,13 +243,7 @@ static irqreturn_t hdq_isr(int irq, void *_hdq)
return IRQ_HANDLED;
}
-/* HDQ Mode: always return success */
-static u8 omap_w1_reset_bus(void *_hdq)
-{
- return 0;
-}
-
-/* W1 search callback function */
+/* W1 search callback function in HDQ mode */
static void omap_w1_search_bus(void *_hdq, struct w1_master *master_dev,
u8 search_type, w1_slave_found_callback slave_found)
{
@@ -262,9 +270,10 @@ static int _omap_hdq_reset(struct hdq_data *hdq_data)
int ret;
u8 tmp_status;
- hdq_reg_out(hdq_data, OMAP_HDQ_SYSCONFIG, OMAP_HDQ_SYSCONFIG_SOFTRESET);
+ hdq_reg_out(hdq_data, OMAP_HDQ_SYSCONFIG,
+ OMAP_HDQ_SYSCONFIG_SOFTRESET);
/*
- * Select HDQ mode & enable clocks.
+ * Select HDQ/1W mode & enable clocks.
* It is observed that INT flags can't be cleared via a read and GO/INIT
* won't return to zero if interrupt is disabled. So we always enable
* interrupt.
@@ -282,7 +291,8 @@ static int _omap_hdq_reset(struct hdq_data *hdq_data)
else {
hdq_reg_out(hdq_data, OMAP_HDQ_CTRL_STATUS,
OMAP_HDQ_CTRL_STATUS_CLOCKENABLE |
- OMAP_HDQ_CTRL_STATUS_INTERRUPTMASK);
+ OMAP_HDQ_CTRL_STATUS_INTERRUPTMASK |
+ hdq_data->mode);
hdq_reg_out(hdq_data, OMAP_HDQ_SYSCONFIG,
OMAP_HDQ_SYSCONFIG_AUTOIDLE);
}
@@ -334,6 +344,18 @@ static int omap_hdq_break(struct hdq_data *hdq_data)
ret = -ETIMEDOUT;
goto out;
}
+
+ /*
+ * check that the presence-detect bit is set, which shows
+ * that the slave is responding
+ */
+ if (!(hdq_reg_in(hdq_data, OMAP_HDQ_CTRL_STATUS) &
+ OMAP_HDQ_CTRL_STATUS_PRESENCE)) {
+ dev_dbg(hdq_data->dev, "Presence bit not set\n");
+ ret = -ETIMEDOUT;
+ goto out;
+ }
+
/*
* wait for both INIT and GO bits to return to zero.
* zero wait time expected for interrupt mode.
@@ -368,6 +390,8 @@ static int hdq_read_byte(struct hdq_data *hdq_data, u8 *val)
goto out;
}
+ hdq_data->hdq_irqstatus = 0;
+
if (!(hdq_data->hdq_irqstatus & OMAP_HDQ_INT_STATUS_RXCOMPLETE)) {
hdq_reg_merge(hdq_data, OMAP_HDQ_CTRL_STATUS,
OMAP_HDQ_CTRL_STATUS_DIR | OMAP_HDQ_CTRL_STATUS_GO,
@@ -400,7 +424,7 @@ rtn:
}
-/* Enable clocks and set the controller to HDQ mode */
+/* Enable clocks and set the controller to HDQ/1W mode */
static int omap_hdq_get(struct hdq_data *hdq_data)
{
int ret = 0;
@@ -422,7 +446,7 @@ static int omap_hdq_get(struct hdq_data *hdq_data)
pm_runtime_get_sync(hdq_data->dev);
- /* make sure HDQ is out of reset */
+ /* make sure HDQ/1W is out of reset */
if (!(hdq_reg_in(hdq_data, OMAP_HDQ_SYSSTATUS) &
OMAP_HDQ_SYSSTATUS_RESETDONE)) {
ret = _omap_hdq_reset(hdq_data);
@@ -430,12 +454,13 @@ static int omap_hdq_get(struct hdq_data *hdq_data)
/* back up the count */
hdq_data->hdq_usecount--;
} else {
- /* select HDQ mode & enable clocks */
+ /* select HDQ/1W mode & enable clocks */
hdq_reg_out(hdq_data, OMAP_HDQ_CTRL_STATUS,
OMAP_HDQ_CTRL_STATUS_CLOCKENABLE |
- OMAP_HDQ_CTRL_STATUS_INTERRUPTMASK);
+ OMAP_HDQ_CTRL_STATUS_INTERRUPTMASK |
+ hdq_data->mode);
hdq_reg_out(hdq_data, OMAP_HDQ_SYSCONFIG,
- OMAP_HDQ_SYSCONFIG_AUTOIDLE);
+ OMAP_HDQ_SYSCONFIG_NOIDLE);
hdq_reg_in(hdq_data, OMAP_HDQ_INT_STATUS);
}
}
@@ -456,6 +481,8 @@ static int omap_hdq_put(struct hdq_data *hdq_data)
if (ret < 0)
return -EINTR;
+ hdq_reg_out(hdq_data, OMAP_HDQ_SYSCONFIG,
+ OMAP_HDQ_SYSCONFIG_AUTOIDLE);
if (0 == hdq_data->hdq_usecount) {
dev_dbg(hdq_data->dev, "attempt to decrement use count"
" when it is zero");
@@ -471,6 +498,100 @@ static int omap_hdq_put(struct hdq_data *hdq_data)
return ret;
}
+/*
+ * W1 triplet callback function - used for searching ROM addresses.
+ * Registered only when controller is in 1-wire mode.
+ */
+static u8 omap_w1_triplet(void *_hdq, u8 bdir)
+{
+ u8 id_bit, comp_bit;
+ int err;
+ u8 ret = 0x3; /* no slaves responded */
+ struct hdq_data *hdq_data = _hdq;
+ u8 ctrl = OMAP_HDQ_CTRL_STATUS_SINGLE | OMAP_HDQ_CTRL_STATUS_GO |
+ OMAP_HDQ_CTRL_STATUS_INTERRUPTMASK;
+ u8 mask = ctrl | OMAP_HDQ_CTRL_STATUS_DIR;
+
+ omap_hdq_get(_hdq);
+
+ err = mutex_lock_interruptible(&hdq_data->hdq_mutex);
+ if (err < 0) {
+ dev_dbg(hdq_data->dev, "Could not acquire mutex\n");
+ goto rtn;
+ }
+
+ hdq_data->hdq_irqstatus = 0;
+ /* read id_bit */
+ hdq_reg_merge(_hdq, OMAP_HDQ_CTRL_STATUS,
+ ctrl | OMAP_HDQ_CTRL_STATUS_DIR, mask);
+ err = wait_event_timeout(hdq_wait_queue,
+ (hdq_data->hdq_irqstatus
+ & OMAP_HDQ_INT_STATUS_RXCOMPLETE),
+ OMAP_HDQ_TIMEOUT);
+ if (err == 0) {
+ dev_dbg(hdq_data->dev, "RX wait elapsed\n");
+ goto out;
+ }
+ id_bit = (hdq_reg_in(_hdq, OMAP_HDQ_RX_DATA) & 0x01);
+
+ hdq_data->hdq_irqstatus = 0;
+ /* read comp_bit */
+ hdq_reg_merge(_hdq, OMAP_HDQ_CTRL_STATUS,
+ ctrl | OMAP_HDQ_CTRL_STATUS_DIR, mask);
+ err = wait_event_timeout(hdq_wait_queue,
+ (hdq_data->hdq_irqstatus
+ & OMAP_HDQ_INT_STATUS_RXCOMPLETE),
+ OMAP_HDQ_TIMEOUT);
+ if (err == 0) {
+ dev_dbg(hdq_data->dev, "RX wait elapsed\n");
+ goto out;
+ }
+ comp_bit = (hdq_reg_in(_hdq, OMAP_HDQ_RX_DATA) & 0x01);
+
+ if (id_bit && comp_bit) {
+ ret = 0x03; /* no slaves responded */
+ goto out;
+ }
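+ /*
+ * Encode the triplet result: bit 0 is the id bit read back,
+ * bit 1 the complement bit, bit 2 the direction written below.
+ */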
+ if (!id_bit && !comp_bit) {
+ /* Both bits are valid, take the direction given */
+ ret = bdir ? 0x04 : 0;
+ } else {
+ /* Only one bit is valid, take that direction */
+ bdir = id_bit;
+ ret = id_bit ? 0x05 : 0x02;
+ }
+
+ /* write bdir bit */
+ hdq_reg_out(_hdq, OMAP_HDQ_TX_DATA, bdir);
+ hdq_reg_merge(_hdq, OMAP_HDQ_CTRL_STATUS, ctrl, mask);
+ err = wait_event_timeout(hdq_wait_queue,
+ (hdq_data->hdq_irqstatus
+ & OMAP_HDQ_INT_STATUS_TXCOMPLETE),
+ OMAP_HDQ_TIMEOUT);
+ if (err == 0) {
+ dev_dbg(hdq_data->dev, "TX wait elapsed\n");
+ goto out;
+ }
+
+ hdq_reg_merge(_hdq, OMAP_HDQ_CTRL_STATUS, 0,
+ OMAP_HDQ_CTRL_STATUS_SINGLE);
+
+out:
+ mutex_unlock(&hdq_data->hdq_mutex);
+rtn:
+ omap_hdq_put(_hdq);
+ return ret;
+}
+
+/* reset callback */
+static u8 omap_w1_reset_bus(void *_hdq)
+{
+ omap_hdq_get(_hdq);
+ omap_hdq_break(_hdq);
+ omap_hdq_put(_hdq);
+ return 0;
+}
+
/* Read a byte of data from the device */
static u8 omap_w1_read_byte(void *_hdq)
{
@@ -478,6 +599,10 @@ static u8 omap_w1_read_byte(void *_hdq)
u8 val = 0;
int ret;
+ /* First write to initialize the transfer */
+ if (hdq_data->init_trans == 0)
+ omap_hdq_get(hdq_data);
+
ret = hdq_read_byte(hdq_data, &val);
if (ret) {
ret = mutex_lock_interruptible(&hdq_data->hdq_mutex);
@@ -491,6 +616,10 @@ static u8 omap_w1_read_byte(void *_hdq)
return -1;
}
+ hdq_disable_interrupt(hdq_data, OMAP_HDQ_CTRL_STATUS,
+ ~OMAP_HDQ_CTRL_STATUS_INTERRUPTMASK);
+ hdq_data->hdq_usecount = 0;
+
/* Write followed by a read, release the module */
if (hdq_data->init_trans) {
ret = mutex_lock_interruptible(&hdq_data->hdq_mutex);
@@ -517,6 +646,14 @@ static void omap_w1_write_byte(void *_hdq, u8 byte)
if (hdq_data->init_trans == 0)
omap_hdq_get(hdq_data);
+ /*
+ * We need to reset the slave before
+ * issuing the SKIP ROM command, else
+ * the slave will not work.
+ */
+ if (byte == W1_SKIP_ROM)
+ omap_hdq_break(hdq_data);
+
ret = mutex_lock_interruptible(&hdq_data->hdq_mutex);
if (ret < 0) {
dev_dbg(hdq_data->dev, "Could not acquire mutex\n");
@@ -551,6 +688,7 @@ static int omap_hdq_probe(struct platform_device *pdev)
struct resource *res;
int ret, irq;
u8 rev;
+ const char *mode;
hdq_data = devm_kzalloc(dev, sizeof(*hdq_data), GFP_KERNEL);
if (!hdq_data) {
@@ -567,10 +705,21 @@ static int omap_hdq_probe(struct platform_device *pdev)
return PTR_ERR(hdq_data->hdq_base);
hdq_data->hdq_usecount = 0;
+ hdq_data->rrw = 0;
mutex_init(&hdq_data->hdq_mutex);
pm_runtime_enable(&pdev->dev);
- pm_runtime_get_sync(&pdev->dev);
+ ret = pm_runtime_get_sync(&pdev->dev);
+ if (ret < 0) {
+ dev_dbg(&pdev->dev, "pm_runtime_get_sync failed\n");
+ goto err_w1;
+ }
+
+ ret = _omap_hdq_reset(hdq_data);
+ if (ret) {
+ dev_dbg(&pdev->dev, "reset failed\n");
+ return -EINVAL;
+ }
rev = hdq_reg_in(hdq_data, OMAP_HDQ_REVISION);
dev_info(&pdev->dev, "OMAP HDQ Hardware Rev %c.%c. Driver in %s mode\n",
@@ -594,6 +743,15 @@ static int omap_hdq_probe(struct platform_device *pdev)
pm_runtime_put_sync(&pdev->dev);
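+ /*
+ * The optional "ti,mode" property selects between the default HDQ
+ * protocol and 1-wire mode, and determines which search/triplet
+ * callbacks get registered on the w1 master.
+ */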
+ ret = of_property_read_string(pdev->dev.of_node, "ti,mode", &mode);
+ if (ret < 0 || !strcmp(mode, "hdq")) {
+ hdq_data->mode = 0;
+ omap_w1_master.search = omap_w1_search_bus;
+ } else {
+ hdq_data->mode = 1;
+ omap_w1_master.triplet = omap_w1_triplet;
+ }
+
omap_w1_master.data = hdq_data;
ret = w1_add_master_device(&omap_w1_master);
@@ -635,8 +793,8 @@ static int omap_hdq_remove(struct platform_device *pdev)
module_platform_driver(omap_hdq_driver);
module_param(w1_id, int, S_IRUSR);
-MODULE_PARM_DESC(w1_id, "1-wire id for the slave detection");
+MODULE_PARM_DESC(w1_id, "1-wire id for the slave detection in HDQ mode");
MODULE_AUTHOR("Texas Instruments");
-MODULE_DESCRIPTION("HDQ driver Library");
+MODULE_DESCRIPTION("HDQ-1W driver Library");
MODULE_LICENSE("GPL");
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 6caca025019d..4204db232fe0 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -116,7 +116,7 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
substring_t args[MAX_OPT_ARGS];
char *p;
int option = 0;
- char *s, *e;
+ char *s;
int ret = 0;
/* setup defaults */
@@ -269,8 +269,10 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
} else {
uid_t uid;
v9ses->flags |= V9FS_ACCESS_SINGLE;
- uid = simple_strtoul(s, &e, 10);
- if (*e != '\0') {
+ ret = parse_integer(s, 10, &uid);
+ if (ret < 0)
+ return ret;
+ if (s[ret] != '\0') {
ret = -EINVAL;
pr_info("Unknown access argument %s\n",
s);
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 3abc447783aa..6b747394f6f5 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -231,7 +231,8 @@ out_unlock:
if (res < 0 && fl->fl_type != F_UNLCK) {
fl_type = fl->fl_type;
fl->fl_type = F_UNLCK;
- res = posix_lock_file_wait(filp, fl);
+ /* Even if this fails we want to return the remote error */
+ posix_lock_file_wait(filp, fl);
fl->fl_type = fl_type;
}
out:
diff --git a/fs/Makefile b/fs/Makefile
index 2c8b181bed3c..b4406d6f7da1 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_ANON_INODES) += anon_inodes.o
obj-$(CONFIG_SIGNALFD) += signalfd.o
obj-$(CONFIG_TIMERFD) += timerfd.o
obj-$(CONFIG_EVENTFD) += eventfd.o
+obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_AIO) += aio.o
obj-$(CONFIG_FS_DAX) += dax.o
obj-$(CONFIG_FILE_LOCKING) += locks.o
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 3f89c9e05b40..5b50c4ca43a7 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -18,6 +18,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/writeback.h>
+#include <linux/blkdev.h>
#include "affs.h"
static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
@@ -352,18 +353,19 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
* blocks, we will have to change it.
*/
- size = sb->s_bdev->bd_inode->i_size >> 9;
+ size = i_size_read(sb->s_bdev->bd_inode) >> 9;
pr_debug("initial blocksize=%d, #blocks=%d\n", 512, size);
affs_set_blocksize(sb, PAGE_SIZE);
/* Try to find root block. Its location depends on the block size. */
- i = 512;
- j = 4096;
+ i = bdev_logical_block_size(sb->s_bdev);
+ j = PAGE_SIZE;
if (blocksize > 0) {
i = j = blocksize;
size = size / (blocksize / 512);
}
+
for (blocksize = i; blocksize <= j; blocksize <<= 1, size >>= 1) {
sbi->s_root_block = root_block;
if (root_block < 0)
diff --git a/fs/aio.c b/fs/aio.c
index 480440f4701f..155f84253f33 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -308,15 +308,9 @@ static void aio_free_ring(struct kioctx *ctx)
}
}
-static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
-{
- vma->vm_flags |= VM_DONTEXPAND;
- vma->vm_ops = &generic_file_vm_ops;
- return 0;
-}
-
-static int aio_ring_remap(struct file *file, struct vm_area_struct *vma)
+static int aio_ring_mremap(struct vm_area_struct *vma)
{
+ struct file *file = vma->vm_file;
struct mm_struct *mm = vma->vm_mm;
struct kioctx_table *table;
int i, res = -EINVAL;
@@ -342,9 +336,24 @@ static int aio_ring_remap(struct file *file, struct vm_area_struct *vma)
return res;
}
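+/*
+ * The ring pages sit in the aio mapping's page cache, so the generic
+ * filemap fault handlers can service faults on MMU systems; the mremap
+ * hook keeps the kioctx's record of the ring's user address current.
+ */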
+static const struct vm_operations_struct aio_ring_vm_ops = {
+ .mremap = aio_ring_mremap,
+#if IS_ENABLED(CONFIG_MMU)
+ .fault = filemap_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = filemap_page_mkwrite,
+#endif
+};
+
+static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ vma->vm_flags |= VM_DONTEXPAND;
+ vma->vm_ops = &aio_ring_vm_ops;
+ return 0;
+}
+
static const struct file_operations aio_ring_fops = {
.mmap = aio_ring_mmap,
- .mremap = aio_ring_remap,
};
#if IS_ENABLED(CONFIG_MIGRATION)
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 78f005f37847..75df4264176d 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -47,7 +47,7 @@ enum {Enabled, Magic};
typedef struct {
struct list_head list;
unsigned long flags; /* type, status, etc. */
- int offset; /* offset of magic */
+ unsigned int offset; /* offset of magic */
int size; /* size of magic/mask */
char *magic; /* magic or filename extension */
char *mask; /* mask, NULL for exact match */
@@ -370,7 +370,13 @@ static Node *create_entry(const char __user *buffer, size_t count)
if (!s)
goto einval;
*s++ = '\0';
- e->offset = simple_strtoul(p, &p, 10);
+ err = parse_integer(p, 10, &e->offset);
+ if (err < 0) {
+ kfree(e);
+ goto out;
+ }
+ p += err;
if (*p++)
goto einval;
pr_debug("register: offset: %#x\n", e->offset);
@@ -548,7 +554,7 @@ static void entry_status(Node *e, char *page)
if (!test_bit(Magic, &e->flags)) {
sprintf(dp, "extension .%s\n", e->magic);
} else {
- dp += sprintf(dp, "offset %i\nmagic ", e->offset);
+ dp += sprintf(dp, "offset %u\nmagic ", e->offset);
dp = bin2hex(dp, e->magic, e->size);
if (e->mask) {
dp += sprintf(dp, "\nmask ");
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 0e2edad4b316..5e9bf3411b70 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -28,6 +28,7 @@
#include <linux/namei.h>
#include <linux/log2.h>
#include <linux/cleancache.h>
+#include <linux/dax.h>
#include <asm/uaccess.h>
#include "internal.h"
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index f601def05bdf..cc1a9210f1a6 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -326,14 +326,15 @@ static int cachefiles_daemon_range_error(struct cachefiles_cache *cache,
*/
static int cachefiles_daemon_frun(struct cachefiles_cache *cache, char *args)
{
- unsigned long frun;
+ unsigned int frun;
+ int rv;
_enter(",%s", args);
- if (!*args)
- return -EINVAL;
-
- frun = simple_strtoul(args, &args, 10);
+ rv = parse_integer(args, 10, &frun);
+ if (rv < 0)
+ return rv;
+ args += rv;
if (args[0] != '%' || args[1] != '\0')
return -EINVAL;
@@ -350,14 +351,15 @@ static int cachefiles_daemon_frun(struct cachefiles_cache *cache, char *args)
*/
static int cachefiles_daemon_fcull(struct cachefiles_cache *cache, char *args)
{
- unsigned long fcull;
+ unsigned int fcull;
+ int rv;
_enter(",%s", args);
- if (!*args)
- return -EINVAL;
-
- fcull = simple_strtoul(args, &args, 10);
+ rv = parse_integer(args, 10, &fcull);
+ if (rv < 0)
+ return rv;
+ args += rv;
if (args[0] != '%' || args[1] != '\0')
return -EINVAL;
@@ -374,14 +376,15 @@ static int cachefiles_daemon_fcull(struct cachefiles_cache *cache, char *args)
*/
static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args)
{
- unsigned long fstop;
+ unsigned int fstop;
+ int rv;
_enter(",%s", args);
- if (!*args)
- return -EINVAL;
-
- fstop = simple_strtoul(args, &args, 10);
+ rv = parse_integer(args, 10, &fstop);
+ if (rv < 0)
+ return rv;
+ args += rv;
if (args[0] != '%' || args[1] != '\0')
return -EINVAL;
@@ -398,14 +401,15 @@ static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args)
*/
static int cachefiles_daemon_brun(struct cachefiles_cache *cache, char *args)
{
- unsigned long brun;
+ unsigned int brun;
+ int rv;
_enter(",%s", args);
- if (!*args)
- return -EINVAL;
-
- brun = simple_strtoul(args, &args, 10);
+ rv = parse_integer(args, 10, &brun);
+ if (rv < 0)
+ return rv;
+ args += rv;
if (args[0] != '%' || args[1] != '\0')
return -EINVAL;
@@ -422,14 +426,15 @@ static int cachefiles_daemon_brun(struct cachefiles_cache *cache, char *args)
*/
static int cachefiles_daemon_bcull(struct cachefiles_cache *cache, char *args)
{
- unsigned long bcull;
+ unsigned int bcull;
+ int rv;
_enter(",%s", args);
- if (!*args)
- return -EINVAL;
-
- bcull = simple_strtoul(args, &args, 10);
+ rv = parse_integer(args, 10, &bcull);
+ if (rv < 0)
+ return rv;
+ args += rv;
if (args[0] != '%' || args[1] != '\0')
return -EINVAL;
@@ -446,14 +451,15 @@ static int cachefiles_daemon_bcull(struct cachefiles_cache *cache, char *args)
*/
static int cachefiles_daemon_bstop(struct cachefiles_cache *cache, char *args)
{
- unsigned long bstop;
+ unsigned int bstop;
+ int rv;
_enter(",%s", args);
- if (!*args)
- return -EINVAL;
-
- bstop = simple_strtoul(args, &args, 10);
+ rv = parse_integer(args, 10, &bstop);
+ if (rv < 0)
+ return rv;
+ args += rv;
if (args[0] != '%' || args[1] != '\0')
return -EINVAL;
@@ -601,21 +607,21 @@ inval:
*/
static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args)
{
- unsigned long mask;
+ unsigned int mask;
+ int rv;
_enter(",%s", args);
- mask = simple_strtoul(args, &args, 0);
- if (args[0] != '\0')
- goto inval;
-
+ rv = parse_integer(args, 0, &mask);
+ if (rv < 0)
+ return rv;
+ if (args[rv] != '\0') {
+ pr_err("debug command requires mask\n");
+ return -EINVAL;
+ }
cachefiles_debug = mask;
_leave(" = 0");
return 0;
-
-inval:
- pr_err("debug command requires mask\n");
- return -EINVAL;
}
/*
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index d1c833c321b9..7b6bfcbf801c 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -479,7 +479,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
- seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name);
+ seq_show_option(m, "snapdirname", fsopt->snapdir_name);
return 0;
}
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 0a9fb6b53126..6a1119e87fbb 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -394,17 +394,17 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
struct sockaddr *srcaddr;
srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
- seq_printf(s, ",vers=%s", tcon->ses->server->vals->version_string);
+ seq_show_option(s, "vers", tcon->ses->server->vals->version_string);
cifs_show_security(s, tcon->ses);
cifs_show_cache_flavor(s, cifs_sb);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
seq_puts(s, ",multiuser");
else if (tcon->ses->user_name)
- seq_printf(s, ",username=%s", tcon->ses->user_name);
+ seq_show_option(s, "username", tcon->ses->user_name);
if (tcon->ses->domainName)
- seq_printf(s, ",domain=%s", tcon->ses->domainName);
+ seq_show_option(s, "domain", tcon->ses->domainName);
if (srcaddr->sa_family != AF_UNSPEC) {
struct sockaddr_in *saddr4;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 3f50cee79df9..2ac2d8471393 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3390,13 +3390,13 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
* should have access to this page, we're safe to simply set
* PG_locked without checking it first.
*/
- __set_page_locked(page);
+ __SetPageLocked(page);
rc = add_to_page_cache_locked(page, mapping,
page->index, GFP_KERNEL);
/* give up if we can't stick it in the cache */
if (rc) {
- __clear_page_locked(page);
+ __ClearPageLocked(page);
return rc;
}
@@ -3417,10 +3417,10 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
if (*bytes + PAGE_CACHE_SIZE > rsize)
break;
- __set_page_locked(page);
+ __SetPageLocked(page);
if (add_to_page_cache_locked(page, mapping, page->index,
GFP_KERNEL)) {
- __clear_page_locked(page);
+ __ClearPageLocked(page);
break;
}
list_move_tail(&page->lru, tmplist);
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index 9b1ffaa0572e..f6c6c8adbc01 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -353,7 +353,7 @@ int venus_readlink(struct super_block *sb, struct CodaFid *fid,
char *result;
insize = max_t(unsigned int,
- INSIZE(readlink), OUTSIZE(readlink)+ *length + 1);
+ INSIZE(readlink), OUTSIZE(readlink)+ *length);
UPARG(CODA_READLINK);
inp->coda_readlink.VFid = *fid;
@@ -361,8 +361,8 @@ int venus_readlink(struct super_block *sb, struct CodaFid *fid,
error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
if (!error) {
retlen = outp->coda_readlink.count;
- if ( retlen > *length )
- retlen = *length;
+ if (retlen >= *length)
+ retlen = *length - 1;
*length = retlen;
result = (char *)outp + (long)outp->coda_readlink.data;
memcpy(buffer, result, retlen);
diff --git a/fs/coredump.c b/fs/coredump.c
index bd6bcc68bdd5..53d7d46c55c8 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -515,10 +515,10 @@ void do_coredump(const siginfo_t *siginfo)
const struct cred *old_cred;
struct cred *cred;
int retval = 0;
- int flag = 0;
int ispipe;
struct files_struct *displaced;
- bool need_nonrelative = false;
+ /* require nonrelative corefile path and be extra careful */
+ bool need_suid_safe = false;
bool core_dumped = false;
static atomic_t core_dump_count = ATOMIC_INIT(0);
struct coredump_params cprm = {
@@ -552,9 +552,8 @@ void do_coredump(const siginfo_t *siginfo)
*/
if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) {
/* Setuid core dump mode */
- flag = O_EXCL; /* Stop rewrite attacks */
cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */
- need_nonrelative = true;
+ need_suid_safe = true;
}
retval = coredump_wait(siginfo->si_signo, &core_state);
@@ -635,7 +634,7 @@ void do_coredump(const siginfo_t *siginfo)
if (cprm.limit < binfmt->min_coredump)
goto fail_unlock;
- if (need_nonrelative && cn.corename[0] != '/') {
+ if (need_suid_safe && cn.corename[0] != '/') {
printk(KERN_WARNING "Pid %d(%s) can only dump core "\
"to fully qualified path!\n",
task_tgid_vnr(current), current->comm);
@@ -643,8 +642,35 @@ void do_coredump(const siginfo_t *siginfo)
goto fail_unlock;
}
+ /*
+ * Unlink the file if it exists unless this is a SUID
+ * binary - in that case, we're running around with root
+ * privs and don't want to unlink another user's coredump.
+ */
+ if (!need_suid_safe) {
+ mm_segment_t old_fs;
+
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ /*
+ * If it doesn't exist, that's fine. If there's some
+ * other problem, we'll catch it at the filp_open().
+ */
+ (void) sys_unlink((const char __user *)cn.corename);
+ set_fs(old_fs);
+ }
+
+ /*
+ * There is a race between unlinking and creating the
+ * file, but if that causes an EEXIST here, that's
+ * fine - another process raced with us while creating
+ * the corefile, and the other process won. To userspace,
+ * what matters is that at least one of the two processes
+ * writes its coredump successfully, not which one.
+ */
cprm.file = filp_open(cn.corename,
- O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
+ O_CREAT | 2 | O_NOFOLLOW |
+ O_LARGEFILE | O_EXCL,
0600);
if (IS_ERR(cprm.file))
goto fail_unlock;
@@ -661,11 +687,15 @@ void do_coredump(const siginfo_t *siginfo)
if (!S_ISREG(inode->i_mode))
goto close_fail;
/*
- * Dont allow local users get cute and trick others to coredump
- * into their pre-created files.
+ * Don't dump core if the filesystem changed owner or mode
+ * of the file during file creation. This is an issue when
+ * a process dumps core while its cwd is e.g. on a vfat
+ * filesystem.
*/
if (!uid_eq(inode->i_uid, current_fsuid()))
goto close_fail;
+ if ((inode->i_mode & 0677) != 0600)
+ goto close_fail;
if (!(cprm.file->f_mode & FMODE_CAN_WRITE))
goto close_fail;
if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
diff --git a/fs/dax.c b/fs/dax.c
index 57bb70b4af70..e43389c74bbc 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -283,7 +283,6 @@ static int copy_user_bh(struct page *to, struct buffer_head *bh,
static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
struct vm_area_struct *vma, struct vm_fault *vmf)
{
- struct address_space *mapping = inode->i_mapping;
sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
unsigned long vaddr = (unsigned long)vmf->virtual_address;
void __pmem *addr;
@@ -291,8 +290,6 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
pgoff_t size;
int error;
- i_mmap_lock_read(mapping);
-
/*
* Check truncate didn't happen while we were allocating a block.
* If it did, this block may or may not be still allocated to the
@@ -322,8 +319,6 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
error = vm_insert_mixed(vma, vaddr, pfn);
out:
- i_mmap_unlock_read(mapping);
-
return error;
}
@@ -385,15 +380,17 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
* from a read fault and we've raced with a truncate
*/
error = -EIO;
- goto unlock_page;
+ goto unlock;
}
+ } else {
+ i_mmap_lock_write(mapping);
}
error = get_block(inode, block, &bh, 0);
if (!error && (bh.b_size < PAGE_SIZE))
error = -EIO; /* fs corruption? */
if (error)
- goto unlock_page;
+ goto unlock;
if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
if (vmf->flags & FAULT_FLAG_WRITE) {
@@ -404,8 +401,9 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
if (!error && (bh.b_size < PAGE_SIZE))
error = -EIO;
if (error)
- goto unlock_page;
+ goto unlock;
} else {
+ i_mmap_unlock_write(mapping);
return dax_load_hole(mapping, page, vmf);
}
}
@@ -417,17 +415,15 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
else
clear_user_highpage(new_page, vaddr);
if (error)
- goto unlock_page;
+ goto unlock;
vmf->page = page;
if (!page) {
- i_mmap_lock_read(mapping);
/* Check we didn't race with truncate */
size = (i_size_read(inode) + PAGE_SIZE - 1) >>
PAGE_SHIFT;
if (vmf->pgoff >= size) {
- i_mmap_unlock_read(mapping);
error = -EIO;
- goto out;
+ goto unlock;
}
}
return VM_FAULT_LOCKED;
@@ -463,6 +459,8 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
}
+ if (!page)
+ i_mmap_unlock_write(mapping);
out:
if (error == -ENOMEM)
return VM_FAULT_OOM | major;
@@ -471,11 +469,14 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
return VM_FAULT_SIGBUS | major;
return VM_FAULT_NOPAGE | major;
- unlock_page:
+ unlock:
if (page) {
unlock_page(page);
page_cache_release(page);
+ } else {
+ i_mmap_unlock_write(mapping);
}
+
goto out;
}
EXPORT_SYMBOL(__dax_fault);
@@ -507,6 +508,176 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
}
EXPORT_SYMBOL_GPL(dax_fault);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * The 'colour' (ie low bits) within a PMD of a page offset. This comes up
+ * more often than one might expect in the below function.
+ */
+#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
+
+int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, unsigned int flags, get_block_t get_block,
+ dax_iodone_t complete_unwritten)
+{
+ struct file *file = vma->vm_file;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ struct buffer_head bh;
+ unsigned blkbits = inode->i_blkbits;
+ unsigned long pmd_addr = address & PMD_MASK;
+ bool write = flags & FAULT_FLAG_WRITE;
+ long length;
+ void *kaddr;
+ pgoff_t size, pgoff;
+ sector_t block, sector;
+ unsigned long pfn;
+ int result = 0;
+
+ /* Fall back to PTEs if we're going to COW */
+ if (write && !(vma->vm_flags & VM_SHARED))
+ return VM_FAULT_FALLBACK;
+ /* If the PMD would extend outside the VMA */
+ if (pmd_addr < vma->vm_start)
+ return VM_FAULT_FALLBACK;
+ if ((pmd_addr + PMD_SIZE) > vma->vm_end)
+ return VM_FAULT_FALLBACK;
+
+ pgoff = linear_page_index(vma, pmd_addr);
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (pgoff >= size)
+ return VM_FAULT_SIGBUS;
+ /* If the PMD would cover blocks out of the file */
+ if ((pgoff | PG_PMD_COLOUR) >= size)
+ return VM_FAULT_FALLBACK;
+
+ memset(&bh, 0, sizeof(bh));
+ block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
+
+ bh.b_size = PMD_SIZE;
+ i_mmap_lock_write(mapping);
+ length = get_block(inode, block, &bh, write);
+ if (length)
+ return VM_FAULT_SIGBUS;
+
+ /*
+ * If the filesystem isn't willing to tell us the length of a hole,
+ * just fall back to PTEs. Calling get_block 512 times in a loop
+ * would be silly.
+ */
+ if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE)
+ goto fallback;
+
+ if (buffer_unwritten(&bh) || buffer_new(&bh)) {
+ int i;
+ for (i = 0; i < PTRS_PER_PMD; i++)
+ clear_page(kaddr + i * PAGE_SIZE);
+ count_vm_event(PGMAJFAULT);
+ mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+ result |= VM_FAULT_MAJOR;
+ }
+
+ /*
+ * If we allocated new storage, make sure no process has any
+ * zero pages covering this hole
+ */
+ if (buffer_new(&bh)) {
+ i_mmap_unlock_write(mapping);
+ unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
+ i_mmap_lock_write(mapping);
+ }
+
+ /*
+ * If a truncate happened while we were allocating blocks, we may
+ * leave blocks allocated to the file that are beyond EOF. We can't
+ * take i_mutex here, so just leave them hanging; they'll be freed
+ * when the file is deleted.
+ */
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (pgoff >= size) {
+ result = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ if ((pgoff | PG_PMD_COLOUR) >= size)
+ goto fallback;
+
+ if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
+ spinlock_t *ptl;
+ pmd_t entry;
+ struct page *zero_page = get_huge_zero_page();
+
+ if (unlikely(!zero_page))
+ goto fallback;
+
+ ptl = pmd_lock(vma->vm_mm, pmd);
+ if (!pmd_none(*pmd)) {
+ spin_unlock(ptl);
+ goto fallback;
+ }
+
+ entry = mk_pmd(zero_page, vma->vm_page_prot);
+ entry = pmd_mkhuge(entry);
+ set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
+ result = VM_FAULT_NOPAGE;
+ spin_unlock(ptl);
+ } else {
+ sector = bh.b_blocknr << (blkbits - 9);
+ length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn,
+ bh.b_size);
+ if (length < 0) {
+ result = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR))
+ goto fallback;
+
+ result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write);
+ }
+
+ out:
+ if (buffer_unwritten(&bh))
+ complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
+
+ i_mmap_unlock_write(mapping);
+
+ return result;
+
+ fallback:
+ count_vm_event(THP_FAULT_FALLBACK);
+ result = VM_FAULT_FALLBACK;
+ goto out;
+}
+EXPORT_SYMBOL_GPL(__dax_pmd_fault);
+
+/**
+ * dax_pmd_fault - handle a PMD fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @address: The faulting address
+ * @pmd: The PMD entry to fill in
+ * @flags: The fault flags (FAULT_FLAG_*)
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * When a page fault occurs, filesystems may call this helper in their
+ * pmd_fault handler for DAX files.
+ */
+int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, unsigned int flags, get_block_t get_block,
+ dax_iodone_t complete_unwritten)
+{
+ int result;
+ struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+
+ if (flags & FAULT_FLAG_WRITE) {
+ sb_start_pagefault(sb);
+ file_update_time(vma->vm_file);
+ }
+ result = __dax_pmd_fault(vma, address, pmd, flags, get_block,
+ complete_unwritten);
+ if (flags & FAULT_FLAG_WRITE)
+ sb_end_pagefault(sb);
+
+ return result;
+}
+EXPORT_SYMBOL_GPL(dax_pmd_fault);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
/**
* dax_pfn_mkwrite - handle first write to DAX page
* @vma: The virtual memory area where the fault occurred
diff --git a/fs/dcache.c b/fs/dcache.c
index 5c33aeb0f68f..0001dea1d12b 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -3375,7 +3375,7 @@ static int __init set_dhash_entries(char *str)
{
if (!str)
return 0;
- dhash_entries = simple_strtoul(str, &str, 0);
+ parse_integer(str, 0, &dhash_entries);
return 1;
}
__setup("dhash_entries=", set_dhash_entries);
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index b795c567b5e1..d317441fe809 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -84,6 +84,7 @@ static int parse_options(char *options, struct exofs_mountopt *opts)
substring_t args[MAX_OPT_ARGS];
int option;
bool s_pid = false;
+ int rv;
EXOFS_DBGMSG("parse_options %s\n", options);
/* defaults */
@@ -92,7 +93,6 @@ static int parse_options(char *options, struct exofs_mountopt *opts)
while ((p = strsep(&options, ",")) != NULL) {
int token;
- char str[32];
if (!*p)
continue;
@@ -108,9 +108,11 @@ static int parse_options(char *options, struct exofs_mountopt *opts)
opts->is_osdname = true;
break;
case Opt_pid:
- if (0 == match_strlcpy(str, &args[0], sizeof(str)))
+ rv = parse_integer(args[0].from, 0, &opts->pid);
+ if (rv < 0)
+ return rv;
+ if (args[0].from[rv] != '\0')
return -EINVAL;
- opts->pid = simple_strtoull(str, NULL, 0);
if (opts->pid < EXOFS_MIN_PID) {
EXOFS_ERR("Partition ID must be >= %u",
EXOFS_MIN_PID);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 3b57c9f83c9b..1982c3f11aec 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -20,6 +20,7 @@
#include <linux/time.h>
#include <linux/pagemap.h>
+#include <linux/dax.h>
#include <linux/quotaops.h>
#include "ext2.h"
#include "xattr.h"
@@ -31,6 +32,12 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
return dax_fault(vma, vmf, ext2_get_block, NULL);
}
+static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, unsigned int flags)
+{
+ return dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL);
+}
+
static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
return dax_mkwrite(vma, vmf, ext2_get_block, NULL);
@@ -38,6 +45,7 @@ static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
static const struct vm_operations_struct ext2_dax_vm_ops = {
.fault = ext2_dax_fault,
+ .pmd_fault = ext2_dax_pmd_fault,
.page_mkwrite = ext2_dax_mkwrite,
.pfn_mkwrite = dax_pfn_mkwrite,
};
@@ -49,7 +57,7 @@ static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
file_accessed(file);
vma->vm_ops = &ext2_dax_vm_ops;
- vma->vm_flags |= VM_MIXEDMAP;
+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
return 0;
}
#else
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index a3a404c5df2e..c60a248c640c 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -25,6 +25,7 @@
#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
+#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 900e19cf9ef6..a08ac730a38f 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -383,16 +383,18 @@ static unsigned long get_sb_block(void **data)
{
unsigned long sb_block;
char *options = (char *) *data;
+ int rv;
if (!options || strncmp(options, "sb=", 3) != 0)
return 1; /* Default location */
options += 3;
- sb_block = simple_strtoul(options, &options, 0);
- if (*options && *options != ',') {
+ rv = parse_integer(options, 0, &sb_block);
+ if (rv < 0 || (options[rv] && options[rv] != ',')) {
printk("EXT2-fs: Invalid sb specification: %s\n",
(char *) *data);
return 1;
}
+ options += rv;
if (*options == ',')
options++;
*data = (void *) options;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 32071f5c1c26..fd1f28be5296 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2272,6 +2272,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
int ext4_get_block_write(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
+int ext4_get_block_dax(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
int ext4_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index bc313ac5d3fa..113837e7ba98 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -22,6 +22,7 @@
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/path.h>
+#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/pagevec.h>
#include <linux/uio.h>
@@ -195,7 +196,7 @@ out:
static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
{
struct inode *inode = bh->b_assoc_map->host;
- /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
+ /* XXX: breaks on 32-bit > 16TB. Is that even supported? */
loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
int err;
if (!uptodate)
@@ -206,17 +207,74 @@ static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
- /* Is this the right get_block? */
+ int result;
+ handle_t *handle = NULL;
+ struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
+
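+ /*
+ * Write faults may allocate blocks: take the pagefault freeze
+ * protection and open a journal handle before calling into DAX.
+ */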
+ if (write) {
+ sb_start_pagefault(sb);
+ file_update_time(vma->vm_file);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
+ EXT4_DATA_TRANS_BLOCKS(sb));
+ }
+
+ if (IS_ERR(handle))
+ result = VM_FAULT_SIGBUS;
+ else
+ result = __dax_fault(vma, vmf, ext4_get_block_dax,
+ ext4_end_io_unwritten);
+
+ if (write) {
+ if (!IS_ERR(handle))
+ ext4_journal_stop(handle);
+ sb_end_pagefault(sb);
+ }
+
+ return result;
+}
+
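+/*
+ * Huge page (PMD) DAX fault: same journalling rules as ext4_dax_fault(),
+ * with the transaction sized for a PMD's worth of blocks.
+ */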
+static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, unsigned int flags)
+{
+ int result;
+ handle_t *handle = NULL;
+ struct inode *inode = file_inode(vma->vm_file);
+ struct super_block *sb = inode->i_sb;
+ bool write = flags & FAULT_FLAG_WRITE;
+
+ if (write) {
+ sb_start_pagefault(sb);
+ file_update_time(vma->vm_file);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
+ ext4_chunk_trans_blocks(inode,
+ PMD_SIZE / PAGE_SIZE));
+ }
+
+ if (IS_ERR(handle))
+ result = VM_FAULT_SIGBUS;
+ else
+ result = __dax_pmd_fault(vma, addr, pmd, flags,
+ ext4_get_block_dax, ext4_end_io_unwritten);
+
+ if (write) {
+ if (!IS_ERR(handle))
+ ext4_journal_stop(handle);
+ sb_end_pagefault(sb);
+ }
+
+ return result;
}
static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
+ return dax_mkwrite(vma, vmf, ext4_get_block_dax,
+ ext4_end_io_unwritten);
}
static const struct vm_operations_struct ext4_dax_vm_ops = {
.fault = ext4_dax_fault,
+ .pmd_fault = ext4_dax_pmd_fault,
.page_mkwrite = ext4_dax_mkwrite,
.pfn_mkwrite = dax_pfn_mkwrite,
};
@@ -244,7 +302,7 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
file_accessed(file);
if (IS_DAX(file_inode(file))) {
vma->vm_ops = &ext4_dax_vm_ops;
- vma->vm_flags |= VM_MIXEDMAP;
+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
} else {
vma->vm_ops = &ext4_file_vm_ops;
}
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 8850254136ae..7002467bfbac 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -106,7 +106,10 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
}
if (!journal) {
- ret = generic_file_fsync(file, start, end, datasync);
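+ /*
+ * Honour the barrier mount option: generic_file_fsync() also
+ * flushes the disk cache, __generic_file_fsync() does not.
+ */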
+ if (test_opt(inode->i_sb, BARRIER))
+ ret = generic_file_fsync(file, start, end, datasync);
+ else
+ ret = __generic_file_fsync(file, start, end, datasync);
if (!ret && !hlist_empty(&inode->i_dentry))
ret = ext4_sync_parent(inode);
goto out;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 4f6ac499f09e..2468261748b2 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -22,6 +22,7 @@
#include "ext4_jbd2.h"
#include "truncate.h"
+#include <linux/dax.h>
#include <linux/uio.h>
#include <trace/events/ext4.h>
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 29f1af7c2cab..612fbcf76b5c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -22,6 +22,7 @@
#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
+#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
@@ -3020,6 +3021,17 @@ static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
EXT4_GET_BLOCKS_NO_LOCK);
}
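+/*
+ * get_block callback for DAX faults: newly allocated blocks are returned
+ * as unwritten extents and only converted once the fault handler has
+ * dealt with them (see ext4_end_io_unwritten()).
+ */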
+int ext4_get_block_dax(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int flags = EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_UNWRIT_EXT;
+ if (create)
+ flags |= EXT4_GET_BLOCKS_CREATE;
+ ext4_debug("ext4_get_block_dax: inode %lu, create flag %d\n",
+ inode->i_ino, create);
+ return _ext4_get_block(inode, iblock, bh_result, flags);
+}
+
static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
ssize_t size, void *private)
{
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ee3878262a49..0b9efa905b5f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1251,18 +1251,19 @@ static ext4_fsblk_t get_sb_block(void **data)
{
ext4_fsblk_t sb_block;
char *options = (char *) *data;
+ int rv;
if (!options || strncmp(options, "sb=", 3) != 0)
return 1; /* Default location */
options += 3;
- /* TODO: use simple_strtoll with >32bit ext4 */
- sb_block = simple_strtoul(options, &options, 0);
- if (*options && *options != ',') {
+ rv = parse_integer(options, 0, &sb_block);
+ if (rv < 0 || (options[rv] && options[rv] != ',')) {
printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
(char *) *data);
return 1;
}
+ options += rv;
if (*options == ',')
options++;
*data = (void *) options;
@@ -1776,10 +1777,10 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
}
if (sbi->s_qf_names[USRQUOTA])
- seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
+ seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]);
if (sbi->s_qf_names[GRPQUOTA])
- seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
+ seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]);
#endif
}
@@ -2532,10 +2533,10 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
struct ext4_sb_info *sbi,
const char *buf, size_t count)
{
- unsigned long t;
+ unsigned int t;
int ret;
- ret = kstrtoul(skip_spaces(buf), 0, &t);
+ ret = kstrtouint(skip_spaces(buf), 0, &t);
if (ret)
return ret;
@@ -2559,13 +2560,11 @@ static ssize_t sbi_ui_store(struct ext4_attr *a,
const char *buf, size_t count)
{
unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
- unsigned long t;
int ret;
- ret = kstrtoul(skip_spaces(buf), 0, &t);
+ ret = kstrtouint(skip_spaces(buf), 0, ui);
if (ret)
return ret;
- *ui = t;
return count;
}
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 93fc62232ec2..5d384921524d 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -301,15 +301,59 @@ static int fat_bmap_cluster(struct inode *inode, int cluster)
return dclus;
}
-int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
- unsigned long *mapped_blocks, int create)
+int fat_get_mapped_cluster(struct inode *inode, sector_t sector,
+ sector_t last_block,
+ unsigned long *mapped_blocks, sector_t *bmap)
{
struct super_block *sb = inode->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ int cluster, offset;
+
+ cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits);
+ offset = sector & (sbi->sec_per_clus - 1);
+ cluster = fat_bmap_cluster(inode, cluster);
+ if (cluster < 0)
+ return cluster;
+ else if (cluster) {
+ *bmap = fat_clus_to_blknr(sbi, cluster) + offset;
+ *mapped_blocks = sbi->sec_per_clus - offset;
+ if (*mapped_blocks > last_block - sector)
+ *mapped_blocks = last_block - sector;
+ }
+
+ return 0;
+}
+
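+/*
+ * Return non-zero when @sector lies beyond EOF; on the allocation path
+ * (create != 0) the limit is ->mmu_private rather than i_size.
+ */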
+static int is_exceed_eof(struct inode *inode, sector_t sector,
+ sector_t *last_block, int create)
+{
+ struct super_block *sb = inode->i_sb;
const unsigned long blocksize = sb->s_blocksize;
const unsigned char blocksize_bits = sb->s_blocksize_bits;
+
+ *last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits;
+ if (sector >= *last_block) {
+ if (!create)
+ return 1;
+
+ /*
+ * ->mmu_private can be accessed only on the allocation path.
+ * (caller must hold ->i_mutex)
+ */
+ *last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1))
+ >> blocksize_bits;
+ if (sector >= *last_block)
+ return 1;
+ }
+
+ return 0;
+}
+
+int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
+ unsigned long *mapped_blocks, int create, bool from_bmap)
+{
+ struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
sector_t last_block;
- int cluster, offset;
*phys = 0;
*mapped_blocks = 0;
@@ -321,31 +365,16 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
return 0;
}
- last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits;
- if (sector >= last_block) {
- if (!create)
+ if (!from_bmap) {
+ if (is_exceed_eof(inode, sector, &last_block, create))
return 0;
-
- /*
- * ->mmu_private can access on only allocation path.
- * (caller must hold ->i_mutex)
- */
- last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1))
- >> blocksize_bits;
+ } else {
+ last_block = inode->i_blocks >>
+ (inode->i_sb->s_blocksize_bits - 9);
if (sector >= last_block)
return 0;
}
- cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits);
- offset = sector & (sbi->sec_per_clus - 1);
- cluster = fat_bmap_cluster(inode, cluster);
- if (cluster < 0)
- return cluster;
- else if (cluster) {
- *phys = fat_clus_to_blknr(sbi, cluster) + offset;
- *mapped_blocks = sbi->sec_per_clus - offset;
- if (*mapped_blocks > last_block - sector)
- *mapped_blocks = last_block - sector;
- }
- return 0;
+ return fat_get_mapped_cluster(inode, sector, last_block, mapped_blocks,
+ phys);
}
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 4afc4d9d2e41..4c71c8c76426 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -91,7 +91,7 @@ next:
*bh = NULL;
iblock = *pos >> sb->s_blocksize_bits;
- err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0);
+ err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0, false);
if (err || !phys)
return -1; /* beyond EOF or error */
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index be5e15323bab..4307cd4f8da0 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -285,8 +285,11 @@ static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len)
extern void fat_cache_inval_inode(struct inode *inode);
extern int fat_get_cluster(struct inode *inode, int cluster,
int *fclus, int *dclus);
+extern int fat_get_mapped_cluster(struct inode *inode, sector_t sector,
+ sector_t last_block,
+ unsigned long *mapped_blocks, sector_t *bmap);
extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
- unsigned long *mapped_blocks, int create);
+ unsigned long *mapped_blocks, int create, bool from_bmap);
/* fat/dir.c */
extern const struct file_operations fat_dir_operations;
@@ -384,6 +387,7 @@ static inline unsigned long fat_dir_hash(int logstart)
{
return hash_32(logstart, FAT_HASH_BITS);
}
+extern int fat_add_cluster(struct inode *inode);
/* fat/misc.c */
extern __printf(3, 4) __cold
diff --git a/fs/fat/file.c b/fs/fat/file.c
index a08f1039909a..43d3475da83a 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -14,8 +14,12 @@
#include <linux/backing-dev.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
+#include <linux/falloc.h>
#include "fat.h"
+static long fat_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t len);
+
static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)
{
u32 attr;
@@ -177,6 +181,7 @@ const struct file_operations fat_file_operations = {
#endif
.fsync = fat_file_fsync,
.splice_read = generic_file_splice_read,
+ .fallocate = fat_fallocate,
};
static int fat_cont_expand(struct inode *inode, loff_t size)
@@ -215,6 +220,62 @@ out:
return err;
}
+/*
+ * Preallocate space for a file. This implements fat's fallocate file
+ * operation, which gets called from the sys_fallocate system call. User
+ * space requests len bytes at offset. If FALLOC_FL_KEEP_SIZE is set
+ * we just allocate clusters without zeroing them out. Otherwise we
+ * allocate and zero out clusters via an expanding truncate.
+ */
+static long fat_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t len)
+{
+ int nr_cluster; /* Number of clusters to be allocated */
+ loff_t mm_bytes; /* Number of bytes to be allocated for file */
+ loff_t ondisksize; /* block aligned on-disk size in bytes*/
+ struct inode *inode = file->f_mapping->host;
+ struct super_block *sb = inode->i_sb;
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ int err = 0;
+
+ /* No support for hole punch or other fallocate flags. */
+ if (mode & ~FALLOC_FL_KEEP_SIZE)
+ return -EOPNOTSUPP;
+
+ /* No support for dir */
+ if (!S_ISREG(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ mutex_lock(&inode->i_mutex);
+ if (mode & FALLOC_FL_KEEP_SIZE) {
+ ondisksize = inode->i_blocks << 9;
+ if ((offset + len) <= ondisksize)
+ goto error;
+
+ /* First compute the number of clusters to be allocated */
+ mm_bytes = offset + len - ondisksize;
+ nr_cluster = (mm_bytes + (sbi->cluster_size - 1)) >>
+ sbi->cluster_bits;
+
+ /* Start the allocation. We are not zeroing out the clusters */
+ while (nr_cluster-- > 0) {
+ err = fat_add_cluster(inode);
+ if (err)
+ goto error;
+ }
+ } else {
+ if ((offset + len) <= i_size_read(inode))
+ goto error;
+
+ /* This is just an expanding truncate */
+ err = fat_cont_expand(inode, (offset + len));
+ }
+
+error:
+ mutex_unlock(&inode->i_mutex);
+ return err;
+}
+
/* Free all clusters after the skip'th cluster. */
static int fat_free(struct inode *inode, int skip)
{
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 509411dd3698..d04c87da4255 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -93,7 +93,7 @@ static struct fat_floppy_defaults {
},
};
-static int fat_add_cluster(struct inode *inode)
+int fat_add_cluster(struct inode *inode)
{
int err, cluster;
@@ -115,10 +115,10 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
struct super_block *sb = inode->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
unsigned long mapped_blocks;
- sector_t phys;
+ sector_t phys, last_block;
int err, offset;
- err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
+ err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false);
if (err)
return err;
if (phys) {
@@ -135,8 +135,14 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
return -EIO;
}
+ last_block = inode->i_blocks >> (sb->s_blocksize_bits - 9);
offset = (unsigned long)iblock & (sbi->sec_per_clus - 1);
- if (!offset) {
+ /*
+ * allocate a cluster according to the following.
+ * 1) no more available blocks
+ * 2) not part of fallocate region
+ */
+ if (!offset && !(iblock < last_block)) {
/* TODO: multiple cluster allocation would be desirable. */
err = fat_add_cluster(inode);
if (err)
@@ -148,7 +154,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
*max_blocks = min(mapped_blocks, *max_blocks);
MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits;
- err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
+ err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false);
if (err)
return err;
@@ -273,13 +279,38 @@ static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
return ret;
}
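+/*
+ * get_block helper used only for bmap: it never allocates, and it maps
+ * blocks up to i_blocks so clusters preallocated by fallocate remain
+ * reachable.
+ */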
+static int fat_get_block_bmap(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ struct super_block *sb = inode->i_sb;
+ unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
+ int err;
+ sector_t bmap;
+ unsigned long mapped_blocks;
+
+ BUG_ON(create != 0);
+
+ err = fat_bmap(inode, iblock, &bmap, &mapped_blocks, create, true);
+ if (err)
+ return err;
+
+ if (bmap) {
+ map_bh(bh_result, sb, bmap);
+ max_blocks = min(mapped_blocks, max_blocks);
+ }
+
+ bh_result->b_size = max_blocks << sb->s_blocksize_bits;
+
+ return 0;
+}
+
static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
{
sector_t blocknr;
/* fat_get_cluster() assumes the requested blocknr isn't truncated. */
down_read(&MSDOS_I(mapping->host)->truncate_lock);
- blocknr = generic_block_bmap(mapping, block, fat_get_block);
+ blocknr = generic_block_bmap(mapping, block, fat_get_block_bmap);
up_read(&MSDOS_I(mapping->host)->truncate_lock);
return blocknr;
@@ -553,13 +584,43 @@ out:
EXPORT_SYMBOL_GPL(fat_build_inode);
+static int __fat_write_inode(struct inode *inode, int wait);
+
+static void fat_free_eofblocks(struct inode *inode)
+{
+ /* Release unwritten fallocated blocks on inode eviction. */
+ if ((inode->i_blocks << 9) >
+ round_up(MSDOS_I(inode)->mmu_private,
+ MSDOS_SB(inode->i_sb)->cluster_size)) {
+ int err;
+
+ fat_truncate_blocks(inode, MSDOS_I(inode)->mmu_private);
+ /* Fallocate results in updating the i_start/i_logstart
+ * fields for the zero byte file. So, make it return to the
+ * original state during evict and commit it to avoid
+ * any corruption on the next access to the cluster
+ * chain for the file.
+ */
+ err = __fat_write_inode(inode, inode_needs_sync(inode));
+ if (err) {
+ fat_msg(inode->i_sb, KERN_WARNING, "Failed to "
+ "update on disk inode for unused "
+ "fallocated blocks, inode could be "
+ "corrupted. Please run fsck");
+ }
+
+ }
+}
+
static void fat_evict_inode(struct inode *inode)
{
truncate_inode_pages_final(&inode->i_data);
if (!inode->i_nlink) {
inode->i_size = 0;
fat_truncate_blocks(inode, 0);
- }
+ } else
+ fat_free_eofblocks(inode);
+
invalidate_inode_buffers(inode);
clear_inode(inode);
fat_cache_inval_inode(inode);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 2982445947e1..894fb01a91da 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1334,11 +1334,11 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
if (is_ancestor(root, sdp->sd_master_dir))
seq_puts(s, ",meta");
if (args->ar_lockproto[0])
- seq_printf(s, ",lockproto=%s", args->ar_lockproto);
+ seq_show_option(s, "lockproto", args->ar_lockproto);
if (args->ar_locktable[0])
- seq_printf(s, ",locktable=%s", args->ar_locktable);
+ seq_show_option(s, "locktable", args->ar_locktable);
if (args->ar_hostdata[0])
- seq_printf(s, ",hostdata=%s", args->ar_hostdata);
+ seq_show_option(s, "hostdata", args->ar_hostdata);
if (args->ar_spectator)
seq_puts(s, ",spectator");
if (args->ar_localflocks)
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index d3fa6bd9503e..221719eac5de 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -288,7 +288,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
page_cache_release(page);
goto fail;
}
- page_cache_release(page);
node->page[i] = page;
}
@@ -398,11 +397,11 @@ node_error:
void hfs_bnode_free(struct hfs_bnode *node)
{
- //int i;
+ int i;
- //for (i = 0; i < node->tree->pages_per_bnode; i++)
- // if (node->page[i])
- // page_cache_release(node->page[i]);
+ for (i = 0; i < node->tree->pages_per_bnode; i++)
+ if (node->page[i])
+ page_cache_release(node->page[i]);
kfree(node);
}
diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c
index 9f4ee7f52026..6fc766df0461 100644
--- a/fs/hfs/brec.c
+++ b/fs/hfs/brec.c
@@ -131,13 +131,16 @@ skip:
hfs_bnode_write(node, entry, data_off + key_len, entry_len);
hfs_bnode_dump(node);
- if (new_node) {
- /* update parent key if we inserted a key
- * at the start of the first node
- */
- if (!rec && new_node != node)
- hfs_brec_update_parent(fd);
+ /*
+ * update parent key if we inserted a key
+ * at the start of the node and it is not the new node
+ */
+ if (!rec && new_node != node) {
+ hfs_bnode_read_key(node, fd->search_key, data_off + size);
+ hfs_brec_update_parent(fd);
+ }
+ if (new_node) {
hfs_bnode_put(fd->bnode);
if (!new_node->parent) {
hfs_btree_inc_height(tree);
@@ -166,9 +169,6 @@ skip:
goto again;
}
- if (!rec)
- hfs_brec_update_parent(fd);
-
return 0;
}
@@ -366,6 +366,8 @@ again:
if (IS_ERR(parent))
return PTR_ERR(parent);
__hfs_brec_find(parent, fd);
+ if (fd->record < 0)
+ return -ENOENT;
hfs_bnode_dump(parent);
rec = fd->record;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 55c03b9e9070..4574fdd3d421 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -136,9 +136,9 @@ static int hfs_show_options(struct seq_file *seq, struct dentry *root)
struct hfs_sb_info *sbi = HFS_SB(root->d_sb);
if (sbi->s_creator != cpu_to_be32(0x3f3f3f3f))
- seq_printf(seq, ",creator=%.4s", (char *)&sbi->s_creator);
+ seq_show_option_n(seq, "creator", (char *)&sbi->s_creator, 4);
if (sbi->s_type != cpu_to_be32(0x3f3f3f3f))
- seq_printf(seq, ",type=%.4s", (char *)&sbi->s_type);
+ seq_show_option_n(seq, "type", (char *)&sbi->s_type, 4);
seq_printf(seq, ",uid=%u,gid=%u",
from_kuid_munged(&init_user_ns, sbi->s_uid),
from_kgid_munged(&init_user_ns, sbi->s_gid));
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 759708fd9331..63924662aaf3 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -454,7 +454,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
page_cache_release(page);
goto fail;
}
- page_cache_release(page);
node->page[i] = page;
}
@@ -566,13 +565,11 @@ node_error:
void hfs_bnode_free(struct hfs_bnode *node)
{
-#if 0
int i;
for (i = 0; i < node->tree->pages_per_bnode; i++)
if (node->page[i])
page_cache_release(node->page[i]);
-#endif
kfree(node);
}
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index c90b72ee676d..bb806e58c977 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -218,9 +218,9 @@ int hfsplus_show_options(struct seq_file *seq, struct dentry *root)
struct hfsplus_sb_info *sbi = HFSPLUS_SB(root->d_sb);
if (sbi->creator != HFSPLUS_DEF_CR_TYPE)
- seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator);
+ seq_show_option_n(seq, "creator", (char *)&sbi->creator, 4);
if (sbi->type != HFSPLUS_DEF_CR_TYPE)
- seq_printf(seq, ",type=%.4s", (char *)&sbi->type);
+ seq_show_option_n(seq, "type", (char *)&sbi->type, 4);
seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask,
from_kuid_munged(&init_user_ns, sbi->uid),
from_kgid_munged(&init_user_ns, sbi->gid));
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 059597b23f67..2ac99db3750e 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -260,7 +260,7 @@ static int hostfs_show_options(struct seq_file *seq, struct dentry *root)
size_t offset = strlen(root_ino) + 1;
if (strlen(root_path) > offset)
- seq_printf(seq, ",%s", root_path + offset);
+ seq_show_option(seq, root_path + offset, NULL);
if (append)
seq_puts(seq, ",append");
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 973c24ce59ad..316adb968b65 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -12,6 +12,7 @@
#include <linux/thread_info.h>
#include <asm/current.h>
#include <linux/sched.h> /* remove ASAP */
+#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/file.h>
@@ -84,6 +85,29 @@ static const match_table_t tokens = {
{Opt_err, NULL},
};
+#ifdef CONFIG_NUMA
+static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
+ struct inode *inode, pgoff_t index)
+{
+ vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy,
+ index);
+}
+
+static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
+{
+ mpol_cond_put(vma->vm_policy);
+}
+#else
+static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
+ struct inode *inode, pgoff_t index)
+{
+}
+
+static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
+{
+}
+#endif
+
static void huge_pagevec_release(struct pagevec *pvec)
{
int i;
@@ -293,26 +317,61 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
return -EINVAL;
}
-static void truncate_huge_page(struct page *page)
+static void remove_huge_page(struct page *page)
{
ClearPageDirty(page);
ClearPageUptodate(page);
delete_from_page_cache(page);
}
-static void truncate_hugepages(struct inode *inode, loff_t lstart)
+
+/*
+ * remove_inode_hugepages handles two distinct cases: truncation and hole
+ * punch. There are subtle differences in operation for each case.
+ *
+ * truncation is indicated by end of range being LLONG_MAX
+ * In this case, we first scan the range and release found pages.
+ * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
+ * maps and global counts.
+ * hole punch is indicated if end is not LLONG_MAX
+ * In the hole punch case we scan the range and release found pages.
+ * Only when releasing a page is the associated region/reserv map
+ * deleted. The region/reserv map for ranges without associated
+ * pages are not modified.
+ * Note: If the passed end of range value is beyond the end of file, but
+ * not LLONG_MAX this routine still performs a hole punch operation.
+ */
+static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
+ loff_t lend)
{
struct hstate *h = hstate_inode(inode);
struct address_space *mapping = &inode->i_data;
const pgoff_t start = lstart >> huge_page_shift(h);
+ const pgoff_t end = lend >> huge_page_shift(h);
+ struct vm_area_struct pseudo_vma;
struct pagevec pvec;
pgoff_t next;
int i, freed = 0;
+ long lookup_nr = PAGEVEC_SIZE;
+ bool truncate_op = (lend == LLONG_MAX);
+ memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
+ pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
pagevec_init(&pvec, 0);
next = start;
- while (1) {
- if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+ while (next < end) {
+ /*
+ * Make sure to never grab more pages than we
+ * might possibly need.
+ */
+ if (end - next < lookup_nr)
+ lookup_nr = end - next;
+
+ /*
+ * This pagevec_lookup() may return pages past 'end',
+ * so we must check if page->index >= end.
+ */
+ if (!pagevec_lookup(&pvec, mapping, next, lookup_nr)) {
if (next == start)
break;
next = start;
@@ -321,26 +380,69 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
for (i = 0; i < pagevec_count(&pvec); ++i) {
struct page *page = pvec.pages[i];
+ u32 hash;
+
+ hash = hugetlb_fault_mutex_hash(h, current->mm,
+ &pseudo_vma,
+ mapping, next, 0);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
lock_page(page);
+ if (page->index >= end) {
+ unlock_page(page);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ next = end; /* we are done */
+ break;
+ }
+
+ /*
+ * If page is mapped, it was faulted in after being
+ * unmapped. Do nothing in this race case. In the
+ * normal case page is not mapped.
+ */
+ if (!page_mapped(page)) {
+ bool rsv_on_error = !PagePrivate(page);
+ /*
+ * We must free the huge page and remove
+ * from page cache (remove_huge_page) BEFORE
+ * removing the region/reserve map
+ * (hugetlb_unreserve_pages). In rare out
+ * of memory conditions, removal of the
+ * region/reserve map could fail. Before
+ * freeing the page, note PagePrivate which
+ * is used in case of error.
+ */
+ remove_huge_page(page);
+ freed++;
+ if (!truncate_op) {
+ if (unlikely(hugetlb_unreserve_pages(
+ inode, next,
+ next + 1, 1)))
+ hugetlb_fix_reserve_counts(
+ inode, rsv_on_error);
+ }
+ }
+
if (page->index > next)
next = page->index;
+
++next;
- truncate_huge_page(page);
unlock_page(page);
- freed++;
+
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
}
huge_pagevec_release(&pvec);
}
- BUG_ON(!lstart && mapping->nrpages);
- hugetlb_unreserve_pages(inode, start, freed);
+
+ if (truncate_op)
+ (void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
}
static void hugetlbfs_evict_inode(struct inode *inode)
{
struct resv_map *resv_map;
- truncate_hugepages(inode, 0);
+ remove_inode_hugepages(inode, 0, LLONG_MAX);
resv_map = (struct resv_map *)inode->i_mapping->private_data;
/* root inode doesn't have the resv_map, so we should check it */
if (resv_map)
@@ -349,11 +451,15 @@ static void hugetlbfs_evict_inode(struct inode *inode)
}
static inline void
-hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
+hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
{
struct vm_area_struct *vma;
- vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) {
+ /*
+ * end == 0 indicates that the entire range after
+ * start should be unmapped.
+ */
+ vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
unsigned long v_offset;
/*
@@ -362,13 +468,20 @@ hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
* which overlap the truncated area starting at pgoff,
* and no vma on a 32-bit arch can span beyond the 4GB.
*/
- if (vma->vm_pgoff < pgoff)
- v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT;
+ if (vma->vm_pgoff < start)
+ v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
else
v_offset = 0;
- unmap_hugepage_range(vma, vma->vm_start + v_offset,
- vma->vm_end, NULL);
+ if (end) {
+ end = ((end - start) << PAGE_SHIFT) +
+ vma->vm_start + v_offset;
+ if (end > vma->vm_end)
+ end = vma->vm_end;
+ } else
+ end = vma->vm_end;
+
+ unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
}
}
@@ -384,12 +497,164 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
i_size_write(inode, offset);
i_mmap_lock_write(mapping);
if (!RB_EMPTY_ROOT(&mapping->i_mmap))
- hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
+ hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
i_mmap_unlock_write(mapping);
- truncate_hugepages(inode, offset);
+ remove_inode_hugepages(inode, offset, LLONG_MAX);
return 0;
}
+static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+{
+ struct hstate *h = hstate_inode(inode);
+ loff_t hpage_size = huge_page_size(h);
+ loff_t hole_start, hole_end;
+
+ /*
+ * For hole punch round up the beginning offset of the hole and
+ * round down the end.
+ */
+ hole_start = round_up(offset, hpage_size);
+ hole_end = round_down(offset + len, hpage_size);
+
+ if (hole_end > hole_start) {
+ struct address_space *mapping = inode->i_mapping;
+
+ mutex_lock(&inode->i_mutex);
+ i_mmap_lock_write(mapping);
+ if (!RB_EMPTY_ROOT(&mapping->i_mmap))
+ hugetlb_vmdelete_list(&mapping->i_mmap,
+ hole_start >> PAGE_SHIFT,
+ hole_end >> PAGE_SHIFT);
+ i_mmap_unlock_write(mapping);
+ remove_inode_hugepages(inode, hole_start, hole_end);
+ mutex_unlock(&inode->i_mutex);
+ }
+
+ return 0;
+}
+
+static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ struct address_space *mapping = inode->i_mapping;
+ struct hstate *h = hstate_inode(inode);
+ struct vm_area_struct pseudo_vma;
+ struct mm_struct *mm = current->mm;
+ loff_t hpage_size = huge_page_size(h);
+ unsigned long hpage_shift = huge_page_shift(h);
+ pgoff_t start, index, end;
+ int error;
+ u32 hash;
+
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ return -EOPNOTSUPP;
+
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ return hugetlbfs_punch_hole(inode, offset, len);
+
+ /*
+ * Default preallocate case.
+ * For this range, start is rounded down and end is rounded up
+ * as well as being converted to page offsets.
+ */
+ start = offset >> hpage_shift;
+ end = (offset + len + hpage_size - 1) >> hpage_shift;
+
+ mutex_lock(&inode->i_mutex);
+
+ /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
+ error = inode_newsize_ok(inode, offset + len);
+ if (error)
+ goto out;
+
+ /*
+ * Initialize a pseudo vma as this is required by the huge page
+ * allocation routines. If NUMA is configured, use page index
+ * as input to create an allocation policy.
+ */
+ memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
+ pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
+ pseudo_vma.vm_file = file;
+
+ for (index = start; index < end; index++) {
+ /*
+ * This is supposed to be the vaddr where the page is being
+ * faulted in, but we have no vaddr here.
+ */
+ struct page *page;
+ unsigned long addr;
+ int avoid_reserve = 0;
+
+ cond_resched();
+
+ /*
+ * fallocate(2) manpage permits EINTR; we may have been
+ * interrupted because we are using up too much memory.
+ */
+ if (signal_pending(current)) {
+ error = -EINTR;
+ break;
+ }
+
+ /* Set numa allocation policy based on index */
+ hugetlb_set_vma_policy(&pseudo_vma, inode, index);
+
+ /* addr is the offset within the file (zero based) */
+ addr = index * hpage_size;
+
+ /* mutex taken here, fault path and hole punch */
+ hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
+ index, addr);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+ /* See if already present in mapping to avoid alloc/free */
+ page = find_get_page(mapping, index);
+ if (page) {
+ put_page(page);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ hugetlb_drop_vma_policy(&pseudo_vma);
+ continue;
+ }
+
+ /* Allocate page and add to page cache */
+ page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve);
+ hugetlb_drop_vma_policy(&pseudo_vma);
+ if (IS_ERR(page)) {
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ error = PTR_ERR(page);
+ goto out;
+ }
+ clear_huge_page(page, addr, pages_per_huge_page(h));
+ __SetPageUptodate(page);
+ error = huge_add_to_page_cache(page, mapping, index);
+ if (unlikely(error)) {
+ put_page(page);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ goto out;
+ }
+
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+
+ /*
+ * put_page() due to reference from alloc_huge_page()
+ * unlock_page because locked by add_to_page_cache()
+ */
+ put_page(page);
+ unlock_page(page);
+ }
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
+ i_size_write(inode, offset + len);
+ inode->i_ctime = CURRENT_TIME;
+ spin_lock(&inode->i_lock);
+ inode->i_private = NULL;
+ spin_unlock(&inode->i_lock);
+out:
+ mutex_unlock(&inode->i_mutex);
+ return error;
+}
+
static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
@@ -701,7 +966,8 @@ const struct file_operations hugetlbfs_file_operations = {
.mmap = hugetlbfs_file_mmap,
.fsync = noop_fsync,
.get_unmapped_area = hugetlb_get_unmapped_area,
- .llseek = default_llseek,
+ .llseek = default_llseek,
+ .fallocate = hugetlbfs_fallocate,
};
static const struct inode_operations hugetlbfs_dir_inode_operations = {
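
The new hugetlbfs_fallocate() and hugetlbfs_punch_hole() paths above are driven from userspace through fallocate(2). A minimal userspace sketch follows; the mount point /dev/hugepages, the file name and the 2 MB huge page size are illustrative assumptions, not part of this patch:

/* Sketch: preallocate then hole-punch a hugetlbfs file via fallocate(2). */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Assumed hugetlbfs mount point and 2 MB huge pages (illustrative). */
	const char *path = "/dev/hugepages/fallocate-demo";
	const off_t hpage = 2 * 1024 * 1024;
	int fd = open(path, O_CREAT | O_RDWR, 0600);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Preallocate four huge pages; exercises the "default preallocate case". */
	if (fallocate(fd, 0, 0, 4 * hpage) < 0)
		perror("fallocate(preallocate)");

	/* Punch out the second huge page.  Offsets should be huge-page
	 * aligned; the kernel rounds the hole inward and may do nothing
	 * for a sub-page range. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      hpage, hpage) < 0)
		perror("fallocate(punch hole)");

	close(fd);
	unlink(path);
	return 0;
}
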
diff --git a/fs/inode.c b/fs/inode.c
index 78a17b8859e1..ed616f1250d7 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1840,7 +1840,7 @@ static int __init set_ihash_entries(char *str)
{
if (!str)
return 0;
- ihash_entries = simple_strtoul(str, &str, 0);
+ parse_integer(str, 0, &ihash_entries);
return 1;
}
__setup("ihash_entries=", set_ihash_entries);
diff --git a/fs/libfs.c b/fs/libfs.c
index c7cbfb092e94..b05298d13a86 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -747,7 +747,6 @@ struct simple_attr {
int (*get)(void *, u64 *);
int (*set)(void *, u64);
char get_buf[24]; /* enough to store a u64 and "\n\0" */
- char set_buf[24];
void *data;
const char *fmt; /* format for read operation */
struct mutex mutex; /* protects access to these buffers */
@@ -825,31 +824,26 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf,
size_t len, loff_t *ppos)
{
struct simple_attr *attr;
- u64 val;
- size_t size;
- ssize_t ret;
+ s64 val;
+ int ret;
attr = file->private_data;
if (!attr->set)
return -EACCES;
+ ret = kstrtos64_from_user(buf, len, 0, &val);
+ if (ret < 0)
+ return ret;
+
ret = mutex_lock_interruptible(&attr->mutex);
if (ret)
return ret;
-
- ret = -EFAULT;
- size = min(sizeof(attr->set_buf) - 1, len);
- if (copy_from_user(attr->set_buf, buf, size))
- goto out;
-
- attr->set_buf[size] = '\0';
- val = simple_strtoll(attr->set_buf, NULL, 0);
ret = attr->set(attr->data, val);
- if (ret == 0)
- ret = len; /* on success, claim we got the whole input */
-out:
mutex_unlock(&attr->mutex);
- return ret;
+ if (ret < 0)
+ return ret;
+ /* on success, claim we got the whole input */
+ return len;
}
EXPORT_SYMBOL_GPL(simple_attr_write);
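
The conversion above trades simple_strtoll(), which silently ignores trailing junk, for kstrtos64_from_user(), which copies from userspace, rejects trailing garbage and reports overflow. A rough userspace analogue of that stricter parsing contract, for illustration only; strict_s64() is a made-up helper, not a kernel API:

/* Sketch: strict 64-bit parse mirroring the kstrto*() contract:
 * the whole string (bar one trailing newline) must be a valid number. */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static int strict_s64(const char *s, int base, int64_t *out)
{
	char *end;
	long long v;

	errno = 0;
	v = strtoll(s, &end, base);
	if (errno == ERANGE)
		return -ERANGE;		/* overflow / underflow */
	if (end == s)
		return -EINVAL;		/* no digits at all */
	if (*end == '\n')
		end++;			/* allow a single trailing newline */
	if (*end != '\0')
		return -EINVAL;		/* trailing garbage rejected */
	*out = v;
	return 0;
}

int main(void)
{
	int64_t v;

	printf("\"42\\n\"  -> %d\n", strict_s64("42\n", 0, &v));	/* 0, v == 42 */
	printf("\"42abc\" -> %d\n", strict_s64("42abc", 0, &v));	/* -EINVAL */
	return 0;
}
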
diff --git a/fs/mpage.c b/fs/mpage.c
index 778a4ddef77a..2ebf91652ecb 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -482,6 +482,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
struct buffer_head map_bh;
loff_t i_size = i_size_read(inode);
int ret = 0;
+ int wr = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
if (page_has_buffers(page)) {
struct buffer_head *head = page_buffers(page);
@@ -590,7 +591,7 @@ page_is_mapped:
* This page will go to BIO. Do we need to send this BIO off first?
*/
if (bio && mpd->last_block_in_bio != blocks[0] - 1)
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
alloc_new:
if (bio == NULL) {
@@ -617,7 +618,7 @@ alloc_new:
wbc_account_io(wbc, page, PAGE_SIZE);
length = first_unmapped << blkbits;
if (bio_add_page(bio, page, length, 0) < length) {
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
goto alloc_new;
}
@@ -627,7 +628,7 @@ alloc_new:
set_page_writeback(page);
unlock_page(page);
if (boundary || (first_unmapped != blocks_per_page)) {
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
if (boundary_block) {
write_boundary_block(boundary_bdev,
boundary_block, 1 << blkbits);
@@ -639,7 +640,7 @@ alloc_new:
confused:
if (bio)
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
if (mpd->use_writepage) {
ret = mapping->a_ops->writepage(page, wbc);
@@ -695,8 +696,11 @@ mpage_writepages(struct address_space *mapping,
};
ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
- if (mpd.bio)
- mpage_bio_submit(WRITE, mpd.bio);
+ if (mpd.bio) {
+ int wr = (wbc->sync_mode == WB_SYNC_ALL ?
+ WRITE_SYNC : WRITE);
+ mpage_bio_submit(wr, mpd.bio);
+ }
}
blk_finish_plug(&plug);
return ret;
@@ -713,8 +717,11 @@ int mpage_writepage(struct page *page, get_block_t get_block,
.use_writepage = 0,
};
int ret = __mpage_writepage(page, wbc, &mpd);
- if (mpd.bio)
- mpage_bio_submit(WRITE, mpd.bio);
+ if (mpd.bio) {
+ int wr = (wbc->sync_mode == WB_SYNC_ALL ?
+ WRITE_SYNC : WRITE);
+ mpage_bio_submit(wr, mpd.bio);
+ }
return ret;
}
EXPORT_SYMBOL(mpage_writepage);
diff --git a/fs/namespace.c b/fs/namespace.c
index 0570729c87fd..68d7c07c3c2e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -37,7 +37,7 @@ static int __init set_mhash_entries(char *str)
{
if (!str)
return 0;
- mhash_entries = simple_strtoul(str, &str, 0);
+ parse_integer(str, 0, &mhash_entries);
return 1;
}
__setup("mhash_entries=", set_mhash_entries);
@@ -47,7 +47,7 @@ static int __init set_mphash_entries(char *str)
{
if (!str)
return 0;
- mphash_entries = simple_strtoul(str, &str, 0);
+ parse_integer(str, 0, &mphash_entries);
return 1;
}
__setup("mphash_entries=", set_mphash_entries);
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 44523f4a6084..6faaf710e563 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -154,6 +154,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
struct dnotify_struct *dn;
struct dnotify_struct **prev;
struct inode *inode;
+ bool free = false;
inode = file_inode(filp);
if (!S_ISDIR(inode->i_mode))
@@ -182,11 +183,15 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
/* nothing else could have found us thanks to the dnotify_groups
mark_mutex */
- if (dn_mark->dn == NULL)
- fsnotify_destroy_mark_locked(fsn_mark, dnotify_group);
+ if (dn_mark->dn == NULL) {
+ fsnotify_detach_mark(fsn_mark);
+ free = true;
+ }
mutex_unlock(&dnotify_group->mark_mutex);
+ if (free)
+ fsnotify_free_mark(fsn_mark);
fsnotify_put_mark(fsn_mark);
}
@@ -362,9 +367,10 @@ out:
spin_unlock(&fsn_mark->lock);
if (destroy)
- fsnotify_destroy_mark_locked(fsn_mark, dnotify_group);
-
+ fsnotify_detach_mark(fsn_mark);
mutex_unlock(&dnotify_group->mark_mutex);
+ if (destroy)
+ fsnotify_free_mark(fsn_mark);
fsnotify_put_mark(fsn_mark);
out_err:
if (new_fsn_mark)
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index cf275500a665..8e8e6bcd1d43 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -529,8 +529,10 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
&destroy_mark);
if (destroy_mark)
- fsnotify_destroy_mark_locked(fsn_mark, group);
+ fsnotify_detach_mark(fsn_mark);
mutex_unlock(&group->mark_mutex);
+ if (destroy_mark)
+ fsnotify_free_mark(fsn_mark);
fsnotify_put_mark(fsn_mark);
if (removed & real_mount(mnt)->mnt_fsnotify_mask)
@@ -557,8 +559,10 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
&destroy_mark);
if (destroy_mark)
- fsnotify_destroy_mark_locked(fsn_mark, group);
+ fsnotify_detach_mark(fsn_mark);
mutex_unlock(&group->mark_mutex);
+ if (destroy_mark)
+ fsnotify_free_mark(fsn_mark);
/* matches the fsnotify_find_inode_mark() */
fsnotify_put_mark(fsn_mark);
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 58b7cdb63da9..6b6f0d472ae8 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -76,7 +76,8 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
struct inotify_inode_mark *inode_mark;
struct inode *inode;
- if (!(mark->flags & (FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_INODE)))
+ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE) ||
+ !(mark->flags & FSNOTIFY_MARK_FLAG_INODE))
return;
inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index dd3fb0b17be7..db39de2dd4cb 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -26,7 +26,6 @@
#include <linux/fsnotify_backend.h>
#include "fsnotify.h"
-#include "../mount.h"
/*
* Clear all of the marks on an inode when it is being evicted from core
@@ -205,6 +204,16 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
mnt = NULL;
/*
+ * Optimization: srcu_read_lock() has a memory barrier which can
+ * be expensive. It protects walking the *_fsnotify_marks lists.
+ * However, if we do not walk the lists, we do not have to do
+ * SRCU because we have no references to any objects and do not
+ * need SRCU to keep them "alive".
+ */
+ if (hlist_empty(&to_tell->i_fsnotify_marks) &&
+ (!mnt || hlist_empty(&mnt->mnt_fsnotify_marks)))
+ return 0;
+ /*
* if this is a modify event we may need to clear the ignored masks
* otherwise return if neither the inode nor the vfsmount care about
* this type of event.
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 13a00be516d2..b44c68a857e7 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -6,6 +6,8 @@
#include <linux/srcu.h>
#include <linux/types.h>
+#include "../mount.h"
+
/* destroy all events sitting in this groups notification queue */
extern void fsnotify_flush_notify(struct fsnotify_group *group);
@@ -38,15 +40,22 @@ extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark);
/* inode specific destruction of a mark */
extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark);
-/* Destroy all marks in the given list */
-extern void fsnotify_destroy_marks(struct list_head *to_free);
/* Find mark belonging to given group in the list of marks */
extern struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head,
struct fsnotify_group *group);
-/* run the list of all marks associated with inode and flag them to be freed */
-extern void fsnotify_clear_marks_by_inode(struct inode *inode);
-/* run the list of all marks associated with vfsmount and flag them to be freed */
-extern void fsnotify_clear_marks_by_mount(struct vfsmount *mnt);
+/* Destroy all marks in the given list protected by 'lock' */
+extern void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock);
+/* run the list of all marks associated with inode and destroy them */
+static inline void fsnotify_clear_marks_by_inode(struct inode *inode)
+{
+ fsnotify_destroy_marks(&inode->i_fsnotify_marks, &inode->i_lock);
+}
+/* run the list of all marks associated with vfsmount and destroy them */
+static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
+{
+ fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks,
+ &mnt->mnt_root->d_lock);
+}
/*
* update the dentry->d_flags of all of inode's children to indicate if inode cares
* about events that happen to its children.
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index a4e1a8f6c329..e785fd954c30 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -65,26 +65,6 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark)
}
/*
- * Given an inode, destroy all of the marks associated with that inode.
- */
-void fsnotify_clear_marks_by_inode(struct inode *inode)
-{
- struct fsnotify_mark *mark;
- struct hlist_node *n;
- LIST_HEAD(free_list);
-
- spin_lock(&inode->i_lock);
- hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify_marks, obj_list) {
- list_add(&mark->free_list, &free_list);
- hlist_del_init_rcu(&mark->obj_list);
- fsnotify_get_mark(mark);
- }
- spin_unlock(&inode->i_lock);
-
- fsnotify_destroy_marks(&free_list);
-}
-
-/*
* Given a group clear all of the inode marks associated with that group.
*/
void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group)
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 39ddcaf0918f..fc0df4442f7b 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -122,26 +122,27 @@ u32 fsnotify_recalc_mask(struct hlist_head *head)
}
/*
- * Any time a mark is getting freed we end up here.
- * The caller had better be holding a reference to this mark so we don't actually
- * do the final put under the mark->lock
+ * Remove mark from inode / vfsmount list, group list, drop inode reference
+ * if we got one.
+ *
+ * Must be called with group->mark_mutex held.
*/
-void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
- struct fsnotify_group *group)
+void fsnotify_detach_mark(struct fsnotify_mark *mark)
{
struct inode *inode = NULL;
+ struct fsnotify_group *group = mark->group;
BUG_ON(!mutex_is_locked(&group->mark_mutex));
spin_lock(&mark->lock);
/* something else already called this function on this mark */
- if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
+ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
spin_unlock(&mark->lock);
return;
}
- mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
+ mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED;
if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
inode = mark->inode;
@@ -150,6 +151,12 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
fsnotify_destroy_vfsmount_mark(mark);
else
BUG();
+ /*
+ * Note that we didn't update flags telling whether inode cares about
+ * what's happening with children. We update these flags from
+ * __fsnotify_parent() lazily when next event happens on one of our
+ * children.
+ */
list_del_init(&mark->g_list);
@@ -157,18 +164,32 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED))
iput(inode);
- /* release lock temporarily */
- mutex_unlock(&group->mark_mutex);
+
+ atomic_dec(&group->num_marks);
+}
+
+/*
+ * Free fsnotify mark. The freeing is actually happening from a kthread which
+ * first waits for srcu period end. Caller must have a reference to the mark
+ * or be protected by fsnotify_mark_srcu.
+ */
+void fsnotify_free_mark(struct fsnotify_mark *mark)
+{
+ struct fsnotify_group *group = mark->group;
+
+ spin_lock(&mark->lock);
+ /* something else already called this function on this mark */
+ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
+ spin_unlock(&mark->lock);
+ return;
+ }
+ mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
+ spin_unlock(&mark->lock);
spin_lock(&destroy_lock);
list_add(&mark->g_list, &destroy_list);
spin_unlock(&destroy_lock);
wake_up(&destroy_waitq);
- /*
- * We don't necessarily have a ref on mark from caller so the above destroy
- * may have actually freed it, unless this group provides a 'freeing_mark'
- * function which must be holding a reference.
- */
/*
* Some groups like to know that marks are being freed. This is a
@@ -177,50 +198,45 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
*/
if (group->ops->freeing_mark)
group->ops->freeing_mark(mark, group);
-
- /*
- * __fsnotify_update_child_dentry_flags(inode);
- *
- * I really want to call that, but we can't, we have no idea if the inode
- * still exists the second we drop the mark->lock.
- *
- * The next time an event arrive to this inode from one of it's children
- * __fsnotify_parent will see that the inode doesn't care about it's
- * children and will update all of these flags then. So really this
- * is just a lazy update (and could be a perf win...)
- */
-
- atomic_dec(&group->num_marks);
-
- mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
}
void fsnotify_destroy_mark(struct fsnotify_mark *mark,
struct fsnotify_group *group)
{
mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
- fsnotify_destroy_mark_locked(mark, group);
+ fsnotify_detach_mark(mark);
mutex_unlock(&group->mark_mutex);
+ fsnotify_free_mark(mark);
}
-/*
- * Destroy all marks in the given list. The marks must be already detached from
- * the original inode / vfsmount.
- */
-void fsnotify_destroy_marks(struct list_head *to_free)
+void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock)
{
- struct fsnotify_mark *mark, *lmark;
- struct fsnotify_group *group;
-
- list_for_each_entry_safe(mark, lmark, to_free, free_list) {
- spin_lock(&mark->lock);
- fsnotify_get_group(mark->group);
- group = mark->group;
- spin_unlock(&mark->lock);
+ struct fsnotify_mark *mark;
- fsnotify_destroy_mark(mark, group);
+ while (1) {
+ /*
+ * We have to be careful since we can race with e.g.
+ * fsnotify_clear_marks_by_group() and once we drop 'lock',
+ * mark can get removed from the obj_list and destroyed. But
+ * we are holding mark reference so mark cannot be freed and
+ * calling fsnotify_destroy_mark() more than once is fine.
+ */
+ spin_lock(lock);
+ if (hlist_empty(head)) {
+ spin_unlock(lock);
+ break;
+ }
+ mark = hlist_entry(head->first, struct fsnotify_mark, obj_list);
+ /*
+ * We don't update i_fsnotify_mask / mnt_fsnotify_mask here
+ * since inode / mount is going away anyway. So just remove
+ * mark from the list.
+ */
+ hlist_del_init_rcu(&mark->obj_list);
+ fsnotify_get_mark(mark);
+ spin_unlock(lock);
+ fsnotify_destroy_mark(mark, mark->group);
fsnotify_put_mark(mark);
- fsnotify_put_group(group);
}
}
@@ -332,7 +348,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
* inode->i_lock
*/
spin_lock(&mark->lock);
- mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE;
+ mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED;
fsnotify_get_group(group);
mark->group = group;
@@ -438,8 +454,9 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
}
mark = list_first_entry(&to_free, struct fsnotify_mark, g_list);
fsnotify_get_mark(mark);
- fsnotify_destroy_mark_locked(mark, group);
+ fsnotify_detach_mark(mark);
mutex_unlock(&group->mark_mutex);
+ fsnotify_free_mark(mark);
fsnotify_put_mark(mark);
}
}
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index 326b148e623c..a8fcab68faef 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -28,25 +28,6 @@
#include <linux/fsnotify_backend.h>
#include "fsnotify.h"
-#include "../mount.h"
-
-void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
-{
- struct fsnotify_mark *mark;
- struct hlist_node *n;
- struct mount *m = real_mount(mnt);
- LIST_HEAD(free_list);
-
- spin_lock(&mnt->mnt_root->d_lock);
- hlist_for_each_entry_safe(mark, n, &m->mnt_fsnotify_marks, obj_list) {
- list_add(&mark->free_list, &free_list);
- hlist_del_init_rcu(&mark->obj_list);
- fsnotify_get_mark(mark);
- }
- spin_unlock(&mnt->mnt_root->d_lock);
-
- fsnotify_destroy_marks(&free_list);
-}
void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
{
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index c1128bcbeb5e..d1a853585b53 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2204,17 +2204,12 @@ get_ctx_vol_failed:
return true;
#ifdef NTFS_RW
iput_usnjrnl_err_out:
- if (vol->usnjrnl_j_ino)
- iput(vol->usnjrnl_j_ino);
- if (vol->usnjrnl_max_ino)
- iput(vol->usnjrnl_max_ino);
- if (vol->usnjrnl_ino)
- iput(vol->usnjrnl_ino);
+ iput(vol->usnjrnl_j_ino);
+ iput(vol->usnjrnl_max_ino);
+ iput(vol->usnjrnl_ino);
iput_quota_err_out:
- if (vol->quota_q_ino)
- iput(vol->quota_q_ino);
- if (vol->quota_ino)
- iput(vol->quota_ino);
+ iput(vol->quota_q_ino);
+ iput(vol->quota_ino);
iput(vol->extend_ino);
#endif /* NTFS_RW */
iput_sec_err_out:
@@ -2223,8 +2218,7 @@ iput_root_err_out:
iput(vol->root_ino);
iput_logfile_err_out:
#ifdef NTFS_RW
- if (vol->logfile_ino)
- iput(vol->logfile_ino);
+ iput(vol->logfile_ino);
iput_vol_err_out:
#endif /* NTFS_RW */
iput(vol->vol_ino);
@@ -2254,8 +2248,7 @@ iput_mftbmp_err_out:
iput(vol->mftbmp_ino);
iput_mirr_err_out:
#ifdef NTFS_RW
- if (vol->mftmirr_ino)
- iput(vol->mftmirr_ino);
+ iput(vol->mftmirr_ino);
#endif /* NTFS_RW */
return false;
}
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index ce210d4951a1..e27e6527912b 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -41,7 +41,8 @@ ocfs2-objs := \
quota_local.o \
quota_global.o \
xattr.o \
- acl.o
+ acl.o \
+ filecheck.o
ocfs2_stackglue-objs := stackglue.o
ocfs2_stack_o2cb-objs := stack_o2cb.o
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index c58a1bcfda0f..0cdf497c91ef 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -284,7 +284,19 @@ int ocfs2_set_acl(handle_t *handle,
int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
- return ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
+ struct buffer_head *bh = NULL;
+ int status = 0;
+
+ status = ocfs2_inode_lock(inode, &bh, 1);
+ if (status < 0) {
+ if (status != -ENOENT)
+ mlog_errno(status);
+ return status;
+ }
+ status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL);
+ ocfs2_inode_unlock(inode, 1);
+ brelse(bh);
+ return status;
}
struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
@@ -292,19 +304,21 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
struct ocfs2_super *osb;
struct buffer_head *di_bh = NULL;
struct posix_acl *acl;
- int ret = -EAGAIN;
+ int ret;
osb = OCFS2_SB(inode->i_sb);
if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
return NULL;
-
- ret = ocfs2_read_inode_block(inode, &di_bh);
- if (ret < 0)
+ ret = ocfs2_inode_lock(inode, &di_bh, 0);
+ if (ret < 0) {
+ if (ret != -ENOENT)
+ mlog_errno(ret);
return ERR_PTR(ret);
+ }
acl = ocfs2_get_acl_nolock(inode, type, di_bh);
+ ocfs2_inode_unlock(inode, 0);
brelse(di_bh);
-
return acl;
}
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 5997c00a1515..0afb4cb7ce1b 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -908,32 +908,30 @@ static int ocfs2_validate_extent_block(struct super_block *sb,
*/
if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- ocfs2_error(sb,
- "Extent block #%llu has bad signature %.*s",
- (unsigned long long)bh->b_blocknr, 7,
- eb->h_signature);
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Extent block #%llu has bad signature %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ eb->h_signature);
+ goto bail;
}
if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
- ocfs2_error(sb,
- "Extent block #%llu has an invalid h_blkno "
- "of %llu",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)le64_to_cpu(eb->h_blkno));
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Extent block #%llu has an invalid h_blkno of %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(eb->h_blkno));
+ goto bail;
}
if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
- ocfs2_error(sb,
- "Extent block #%llu has an invalid "
- "h_fs_generation of #%u",
- (unsigned long long)bh->b_blocknr,
- le32_to_cpu(eb->h_fs_generation));
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Extent block #%llu has an invalid h_fs_generation of #%u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(eb->h_fs_generation));
+ goto bail;
}
-
- return 0;
+bail:
+ return rc;
}
int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
@@ -1446,8 +1444,7 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
while(le16_to_cpu(el->l_tree_depth) > 1) {
if (le16_to_cpu(el->l_next_free_rec) == 0) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has empty "
- "extent list (next_free_rec == 0)",
+ "Owner %llu has empty extent list (next_free_rec == 0)\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
status = -EIO;
goto bail;
@@ -1456,9 +1453,7 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
blkno = le64_to_cpu(el->l_recs[i].e_blkno);
if (!blkno) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has extent "
- "list where extent # %d has no physical "
- "block start",
+ "Owner %llu has extent list where extent # %d has no physical block start\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
status = -EIO;
goto bail;
@@ -1788,8 +1783,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
while (el->l_tree_depth) {
if (le16_to_cpu(el->l_next_free_rec) == 0) {
ocfs2_error(ocfs2_metadata_cache_get_super(ci),
- "Owner %llu has empty extent list at "
- "depth %u\n",
+ "Owner %llu has empty extent list at depth %u\n",
(unsigned long long)ocfs2_metadata_cache_owner(ci),
le16_to_cpu(el->l_tree_depth));
ret = -EROFS;
@@ -1814,8 +1808,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
blkno = le64_to_cpu(el->l_recs[i].e_blkno);
if (blkno == 0) {
ocfs2_error(ocfs2_metadata_cache_get_super(ci),
- "Owner %llu has bad blkno in extent list "
- "at depth %u (index %d)\n",
+ "Owner %llu has bad blkno in extent list at depth %u (index %d)\n",
(unsigned long long)ocfs2_metadata_cache_owner(ci),
le16_to_cpu(el->l_tree_depth), i);
ret = -EROFS;
@@ -1836,8 +1829,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
if (le16_to_cpu(el->l_next_free_rec) >
le16_to_cpu(el->l_count)) {
ocfs2_error(ocfs2_metadata_cache_get_super(ci),
- "Owner %llu has bad count in extent list "
- "at block %llu (next free=%u, count=%u)\n",
+ "Owner %llu has bad count in extent list at block %llu (next free=%u, count=%u)\n",
(unsigned long long)ocfs2_metadata_cache_owner(ci),
(unsigned long long)bh->b_blocknr,
le16_to_cpu(el->l_next_free_rec),
@@ -2116,8 +2108,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle,
if (left_el->l_next_free_rec != left_el->l_count) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Inode %llu has non-full interior leaf node %llu"
- "(next free = %u)",
+ "Inode %llu has non-full interior leaf node %llu (next free = %u)\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
(unsigned long long)left_leaf_bh->b_blocknr,
le16_to_cpu(left_el->l_next_free_rec));
@@ -2256,8 +2247,7 @@ int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
* If we got here, we never found a valid node where
* the tree indicated one should be.
*/
- ocfs2_error(sb,
- "Invalid extent tree at extent block %llu\n",
+ ocfs2_error(sb, "Invalid extent tree at extent block %llu\n",
(unsigned long long)blkno);
ret = -EROFS;
goto out;
@@ -2526,21 +2516,6 @@ static int ocfs2_update_edge_lengths(handle_t *handle,
struct ocfs2_extent_block *eb;
u32 range;
- /*
- * In normal tree rotation process, we will never touch the
- * tree branch above subtree_index and ocfs2_extend_rotate_transaction
- * doesn't reserve the credits for them either.
- *
- * But we do have a special case here which will update the rightmost
- * records for all the bh in the path.
- * So we have to allocate extra credits and access them.
- */
- ret = ocfs2_extend_trans(handle, subtree_index);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
ret = ocfs2_journal_access_path(et->et_ci, handle, path);
if (ret) {
mlog_errno(ret);
@@ -2872,8 +2847,7 @@ int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
* If we got here, we never found a valid node where
* the tree indicated one should be.
*/
- ocfs2_error(sb,
- "Invalid extent tree at extent block %llu\n",
+ ocfs2_error(sb, "Invalid extent tree at extent block %llu\n",
(unsigned long long)blkno);
ret = -EROFS;
goto out;
@@ -2967,7 +2941,7 @@ static int __ocfs2_rotate_tree_left(handle_t *handle,
right_path->p_node[subtree_root].bh->b_blocknr,
right_path->p_tree_depth);
- ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
orig_credits, left_path);
if (ret) {
mlog_errno(ret);
@@ -3040,21 +3014,9 @@ static int ocfs2_remove_rightmost_path(handle_t *handle,
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
-
ret = ocfs2_et_sanity_check(et);
if (ret)
goto out;
- /*
- * There's two ways we handle this depending on
- * whether path is the only existing one.
- */
- ret = ocfs2_extend_rotate_transaction(handle, 0,
- handle->h_buffer_credits,
- path);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
ret = ocfs2_journal_access_path(et->et_ci, handle, path);
if (ret) {
@@ -3131,6 +3093,30 @@ out:
return ret;
}
+static int ocfs2_remove_rightmost_empty_extent(struct ocfs2_super *osb,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_path *path,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ handle_t *handle;
+ int ret;
+ int credits = path->p_tree_depth * 2 + 1;
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ return ret;
+ }
+
+ ret = ocfs2_remove_rightmost_path(handle, et, path, dealloc);
+ if (ret)
+ mlog_errno(ret);
+
+ ocfs2_commit_trans(osb, handle);
+ return ret;
+}
+
/*
* Left rotation of btree records.
*
@@ -3200,7 +3186,7 @@ rightmost_no_delete:
if (le16_to_cpu(el->l_next_free_rec) == 0) {
ret = -EIO;
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has empty extent block at %llu",
+ "Owner %llu has empty extent block at %llu\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
(unsigned long long)le64_to_cpu(eb->h_blkno));
goto out;
@@ -3628,6 +3614,14 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
*/
if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
le16_to_cpu(el->l_next_free_rec) == 1) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ right_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
ret = ocfs2_remove_rightmost_path(handle, et,
right_path,
@@ -3666,6 +3660,14 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
/*
* The merge code will need to create an empty
* extent to take the place of the newly
@@ -3714,6 +3716,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
*/
BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
/* The merge left us with an empty extent, remove it. */
ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
if (ret) {
@@ -3735,6 +3746,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
goto out;
}
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
/*
* Error from this last rotate is not critical, so
@@ -3770,6 +3790,16 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
}
if (ctxt->c_split_covers_rec) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ ret = 0;
+ goto out;
+ }
+
/*
* The merge may have left an empty extent in
* our leaf. Try to rotate it away.
@@ -3930,7 +3960,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle,
next_free = le16_to_cpu(el->l_next_free_rec);
if (next_free == 0) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has a bad extent list",
+ "Owner %llu has a bad extent list\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
ret = -EIO;
return;
@@ -4355,10 +4385,7 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
bh = path_leaf_bh(left_path);
eb = (struct ocfs2_extent_block *)bh->b_data;
ocfs2_error(sb,
- "Extent block #%llu has an "
- "invalid l_next_free_rec of "
- "%d. It should have "
- "matched the l_count of %d",
+ "Extent block #%llu has an invalid l_next_free_rec of %d. It should have matched the l_count of %d\n",
(unsigned long long)le64_to_cpu(eb->h_blkno),
le16_to_cpu(new_el->l_next_free_rec),
le16_to_cpu(new_el->l_count));
@@ -4413,8 +4440,7 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
bh = path_leaf_bh(right_path);
eb = (struct ocfs2_extent_block *)bh->b_data;
ocfs2_error(sb,
- "Extent block #%llu has an "
- "invalid l_next_free_rec of %d",
+ "Extent block #%llu has an invalid l_next_free_rec of %d\n",
(unsigned long long)le64_to_cpu(eb->h_blkno),
le16_to_cpu(new_el->l_next_free_rec));
status = -EINVAL;
@@ -4970,10 +4996,9 @@ leftright:
split_index = ocfs2_search_extent_list(el, cpos);
if (split_index == -1) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has an extent at cpos %u "
- "which can no longer be found.\n",
- (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
- cpos);
+ "Owner %llu has an extent at cpos %u which can no longer be found\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ cpos);
ret = -EROFS;
goto out;
}
@@ -5158,10 +5183,9 @@ int ocfs2_change_extent_flag(handle_t *handle,
index = ocfs2_search_extent_list(el, cpos);
if (index == -1) {
ocfs2_error(sb,
- "Owner %llu has an extent at cpos %u which can no "
- "longer be found.\n",
- (unsigned long long)
- ocfs2_metadata_cache_owner(et->et_ci), cpos);
+ "Owner %llu has an extent at cpos %u which can no longer be found\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ cpos);
ret = -EROFS;
goto out;
}
@@ -5228,9 +5252,7 @@ int ocfs2_mark_extent_written(struct inode *inode,
cpos, len, phys);
if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
- ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
- "that are being written to, but the feature bit "
- "is not set in the super block.",
+ ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents that are being written to, but the feature bit is not set in the super block\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
ret = -EROFS;
goto out;
@@ -5337,6 +5359,15 @@ static int ocfs2_truncate_rec(handle_t *handle,
struct ocfs2_extent_block *eb;
if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
if (ret) {
mlog_errno(ret);
@@ -5514,8 +5545,7 @@ int ocfs2_remove_extent(handle_t *handle,
index = ocfs2_search_extent_list(el, cpos);
if (index == -1) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has an extent at cpos %u which can no "
- "longer be found.\n",
+ "Owner %llu has an extent at cpos %u which can no longer be found\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
cpos);
ret = -EROFS;
@@ -5580,7 +5610,7 @@ int ocfs2_remove_extent(handle_t *handle,
index = ocfs2_search_extent_list(el, cpos);
if (index == -1) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu: split at cpos %u lost record.",
+ "Owner %llu: split at cpos %u lost record\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
cpos);
ret = -EROFS;
@@ -5596,8 +5626,7 @@ int ocfs2_remove_extent(handle_t *handle,
ocfs2_rec_clusters(el, rec);
if (rec_range != trunc_range) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu: error after split at cpos %u"
- "trunc len %u, existing record is (%u,%u)",
+ "Owner %llu: error after split at cpos %u trunc len %u, existing record is (%u,%u)\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
cpos, len, le32_to_cpu(rec->e_cpos),
ocfs2_rec_clusters(el, rec));
@@ -5925,16 +5954,6 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
ocfs2_journal_dirty(handle, tl_bh);
- /* TODO: Perhaps we can calculate the bulk of the
- * credits up front rather than extending like
- * this. */
- status = ocfs2_extend_trans(handle,
- OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
rec = tl->tl_recs[i];
start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
le32_to_cpu(rec.t_start));
@@ -5955,6 +5974,13 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
goto bail;
}
}
+
+ status = ocfs2_extend_trans(handle,
+ OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
i--;
}
@@ -6013,7 +6039,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
goto out_mutex;
}
- handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
+ handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
@@ -7111,12 +7137,20 @@ start:
ocfs2_error(inode->i_sb, "Inode %lu has an empty "
"extent record, depth %u\n", inode->i_ino,
le16_to_cpu(root_el->l_tree_depth));
- status = -EROFS;
- goto bail;
+ status = ocfs2_remove_rightmost_empty_extent(osb,
+ &et, path, &dealloc);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ ocfs2_reinit_path(path, 1);
+ goto start;
+ } else {
+ trunc_cpos = le32_to_cpu(rec->e_cpos);
+ trunc_len = 0;
+ blkno = 0;
}
- trunc_cpos = le32_to_cpu(rec->e_cpos);
- trunc_len = 0;
- blkno = 0;
} else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) {
/*
* Truncate entire record.
@@ -7204,8 +7238,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
!ocfs2_supports_inline_data(osb)) {
ocfs2_error(inode->i_sb,
- "Inline data flags for inode %llu don't agree! "
- "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
+ "Inline data flags for inode %llu don't agree! Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
le16_to_cpu(di->i_dyn_features),
OCFS2_I(inode)->ip_dyn_features,
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 0f5fd9db8194..64b11d90eca6 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -227,7 +227,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) {
- ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag",
+ ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
return -EROFS;
}
@@ -237,7 +237,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
if (size > PAGE_CACHE_SIZE ||
size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) {
ocfs2_error(inode->i_sb,
- "Inode %llu has with inline data has bad size: %Lu",
+ "Inode %llu has with inline data has bad size: %Lu\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)size);
return -EROFS;
@@ -533,10 +533,14 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
/* This figures out the size of the next contiguous block, and
* our logical offset */
ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
&contig_blocks, &ext_flags);
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
if (ret) {
mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
(unsigned long long)iblock);
@@ -557,6 +561,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
alloc_locked = 1;
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
/* fill hole, allocate blocks can't be larger than the size
* of the hole */
clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
@@ -569,6 +575,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
ret = ocfs2_extend_allocation(inode, cpos,
clusters_to_alloc, 0);
if (ret < 0) {
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
mlog_errno(ret);
goto bail;
}
@@ -576,11 +583,13 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
&contig_blocks, &ext_flags);
if (ret < 0) {
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
(unsigned long long)iblock);
ret = -EIO;
goto bail;
}
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
}
/*
@@ -627,10 +636,13 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
}
- ocfs2_iocb_clear_rw_locked(iocb);
+ /* Let rw unlock to be done later to protect append direct io write */
+ if (offset + bytes <= i_size_read(inode)) {
+ ocfs2_iocb_clear_rw_locked(iocb);
- level = ocfs2_iocb_rw_locked_level(iocb);
- ocfs2_rw_unlock(inode, level);
+ level = ocfs2_iocb_rw_locked_level(iocb);
+ ocfs2_rw_unlock(inode, level);
+ }
}
static int ocfs2_releasepage(struct page *page, gfp_t wait)
@@ -832,12 +844,17 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
/* zeroing out the previously allocated cluster tail
* that but not zeroed */
- if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+ if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
ret = ocfs2_direct_IO_zero_extend(osb, inode, offset,
zero_len_tail, cluster_align_tail);
- else
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
+ } else {
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
ret = ocfs2_direct_IO_extend_no_holes(osb, inode,
offset);
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
+ }
if (ret < 0) {
mlog_errno(ret);
ocfs2_inode_unlock(inode, 1);
@@ -857,7 +874,8 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
offset, ocfs2_direct_IO_get_blocks,
ocfs2_dio_end_io, NULL, 0);
- if (unlikely(written < 0)) {
+ /* overwrite aio may return -EIOCBQUEUED, and it is not an error */
+ if ((written < 0) && (written != -EIOCBQUEUED)) {
loff_t i_size = i_size_read(inode);
if (offset + count > i_size) {
@@ -876,12 +894,14 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
ocfs2_inode_unlock(inode, 1);
brelse(di_bh);
+ di_bh = NULL;
goto clean_orphan;
}
}
ocfs2_inode_unlock(inode, 1);
brelse(di_bh);
+ di_bh = NULL;
ret = jbd2_journal_force_commit(journal);
if (ret < 0)
@@ -936,10 +956,12 @@ clean_orphan:
if (tmp_ret < 0) {
ret = tmp_ret;
mlog_errno(ret);
+ brelse(di_bh);
goto out;
}
ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
tmp_ret = jbd2_journal_force_commit(journal);
if (tmp_ret < 0) {
@@ -2185,10 +2207,7 @@ try_again:
if (ret)
goto out_commit;
}
- /*
- * We don't want this to fail in ocfs2_write_end(), so do it
- * here.
- */
+
ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
@@ -2345,7 +2364,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
- int i;
+ int i, ret;
unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
struct inode *inode = mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -2354,6 +2373,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
handle_t *handle = wc->w_handle;
struct page *tmppage;
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ copied = ret;
+ mlog_errno(ret);
+ goto out;
+ }
+
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
ocfs2_write_end_inline(inode, pos, len, &copied, di, wc);
goto out_write_size;
@@ -2409,6 +2436,7 @@ out_write_size:
ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_journal_dirty(handle, wc->w_di_bh);
+out:
/* unlock pages before dealloc since it needs acquiring j_trans_barrier
* lock, or it will cause a deadlock since journal commit threads holds
* this lock and will ask for the page lock when flushing the data.
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 1edcb141f639..fe50ded1b4ce 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -316,6 +316,12 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
bh = bhs[i];
if (!(flags & OCFS2_BH_READAHEAD)) {
+ if (status) {
+ /* Clear the rest of the buffers on error */
+ put_bh(bh);
+ bhs[i] = NULL;
+ continue;
+ }
/* We know this can't have changed as we hold the
* owner sem. Avoid doing any work on the bh if the
* journal has it. */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 140de3c93d2e..0c154710249b 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -36,7 +36,7 @@
#include <linux/debugfs.h>
#include <linux/slab.h>
#include <linux/bitmap.h>
-
+#include <linux/ktime.h>
#include "heartbeat.h"
#include "tcp.h"
#include "nodemanager.h"
@@ -1060,37 +1060,6 @@ bail:
return ret;
}
-/* Subtract b from a, storing the result in a. a *must* have a larger
- * value than b. */
-static void o2hb_tv_subtract(struct timeval *a,
- struct timeval *b)
-{
- /* just return 0 when a is after b */
- if (a->tv_sec < b->tv_sec ||
- (a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec)) {
- a->tv_sec = 0;
- a->tv_usec = 0;
- return;
- }
-
- a->tv_sec -= b->tv_sec;
- a->tv_usec -= b->tv_usec;
- while ( a->tv_usec < 0 ) {
- a->tv_sec--;
- a->tv_usec += 1000000;
- }
-}
-
-static unsigned int o2hb_elapsed_msecs(struct timeval *start,
- struct timeval *end)
-{
- struct timeval res = *end;
-
- o2hb_tv_subtract(&res, start);
-
- return res.tv_sec * 1000 + res.tv_usec / 1000;
-}
-
/*
* we ride the region ref that the region dir holds. before the region
* dir is removed and drops it ref it will wait to tear down this
@@ -1101,7 +1070,7 @@ static int o2hb_thread(void *data)
int i, ret;
struct o2hb_region *reg = data;
struct o2hb_bio_wait_ctxt write_wc;
- struct timeval before_hb, after_hb;
+ ktime_t before_hb, after_hb;
unsigned int elapsed_msec;
mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n");
@@ -1118,18 +1087,18 @@ static int o2hb_thread(void *data)
* hr_timeout_ms between disk writes. On busy systems
* this should result in a heartbeat which is less
* likely to time itself out. */
- do_gettimeofday(&before_hb);
+ before_hb = ktime_get_real();
ret = o2hb_do_disk_heartbeat(reg);
- do_gettimeofday(&after_hb);
- elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
+ after_hb = ktime_get_real();
+
+ elapsed_msec = (unsigned int)
+ ktime_ms_delta(after_hb, before_hb);
mlog(ML_HEARTBEAT,
- "start = %lu.%lu, end = %lu.%lu, msec = %u, ret = %d\n",
- before_hb.tv_sec, (unsigned long) before_hb.tv_usec,
- after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
- elapsed_msec, ret);
+ "start = %lld, end = %lld, msec = %u, ret = %d\n",
+ before_hb.tv64, after_hb.tv64, elapsed_msec, ret);
if (!kthread_should_stop() &&
elapsed_msec < reg->hr_timeout_ms) {
@@ -1483,13 +1452,12 @@ static int o2hb_read_block_input(struct o2hb_region *reg,
unsigned long *ret_bytes,
unsigned int *ret_bits)
{
- unsigned long bytes;
- char *p = (char *)page;
-
- bytes = simple_strtoul(p, &p, 0);
- if (!p || (*p && (*p != '\n')))
- return -EINVAL;
+ unsigned int bytes;
+ int rv;
+ rv = kstrtouint(page, 0, &bytes);
+ if (rv < 0)
+ return rv;
/* Heartbeat and fs min / max block sizes are the same. */
if (bytes > 4096 || bytes < 512)
return -ERANGE;
@@ -1542,18 +1510,14 @@ static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg,
const char *page,
size_t count)
{
- unsigned long long tmp;
- char *p = (char *)page;
+ int rv;
if (reg->hr_bdev)
return -EINVAL;
- tmp = simple_strtoull(p, &p, 0);
- if (!p || (*p && (*p != '\n')))
- return -EINVAL;
-
- reg->hr_start_block = tmp;
-
+ rv = kstrtoull(page, 0, &reg->hr_start_block);
+ if (rv < 0)
+ return rv;
return count;
}
@@ -1567,20 +1531,19 @@ static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg,
const char *page,
size_t count)
{
- unsigned long tmp;
- char *p = (char *)page;
+ unsigned int tmp;
+ int rv;
if (reg->hr_bdev)
return -EINVAL;
- tmp = simple_strtoul(p, &p, 0);
- if (!p || (*p && (*p != '\n')))
- return -EINVAL;
-
+ rv = kstrtouint(page, 0, &tmp);
+ if (rv < 0)
+ return rv;
if (tmp > O2NM_MAX_NODES || tmp == 0)
return -ERANGE;
- reg->hr_blocks = (unsigned int)tmp;
+ reg->hr_blocks = tmp;
return count;
}
@@ -1619,17 +1582,13 @@ static int o2hb_map_slot_data(struct o2hb_region *reg)
struct o2hb_disk_slot *slot;
reg->hr_tmp_block = kmalloc(reg->hr_block_bytes, GFP_KERNEL);
- if (reg->hr_tmp_block == NULL) {
- mlog_errno(-ENOMEM);
+ if (reg->hr_tmp_block == NULL)
return -ENOMEM;
- }
reg->hr_slots = kcalloc(reg->hr_blocks,
sizeof(struct o2hb_disk_slot), GFP_KERNEL);
- if (reg->hr_slots == NULL) {
- mlog_errno(-ENOMEM);
+ if (reg->hr_slots == NULL)
return -ENOMEM;
- }
for(i = 0; i < reg->hr_blocks; i++) {
slot = &reg->hr_slots[i];
@@ -1645,17 +1604,13 @@ static int o2hb_map_slot_data(struct o2hb_region *reg)
reg->hr_slot_data = kcalloc(reg->hr_num_pages, sizeof(struct page *),
GFP_KERNEL);
- if (!reg->hr_slot_data) {
- mlog_errno(-ENOMEM);
+ if (!reg->hr_slot_data)
return -ENOMEM;
- }
for(i = 0; i < reg->hr_num_pages; i++) {
page = alloc_page(GFP_KERNEL);
- if (!page) {
- mlog_errno(-ENOMEM);
+ if (!page)
return -ENOMEM;
- }
reg->hr_slot_data[i] = page;
@@ -1687,10 +1642,8 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg)
struct o2hb_disk_heartbeat_block *hb_block;
ret = o2hb_read_slots(reg, reg->hr_blocks);
- if (ret) {
- mlog_errno(ret);
+ if (ret)
goto out;
- }
/* We only want to get an idea of the values initially in each
* slot, so we do no verification - o2hb_check_slot will
@@ -1716,9 +1669,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
size_t count)
{
struct task_struct *hb_task;
- long fd;
+ int fd;
int sectsize;
- char *p = (char *)page;
struct fd f;
struct inode *inode;
ssize_t ret = -EINVAL;
@@ -1732,10 +1684,9 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
if (o2nm_this_node() == O2NM_MAX_NODES)
goto out;
- fd = simple_strtol(p, &p, 0);
- if (!p || (*p && (*p != '\n')))
+ ret = kstrtoint(page, 0, &fd);
+ if (ret < 0)
goto out;
-
if (fd < 0 || fd >= INT_MAX)
goto out;
@@ -2209,12 +2160,12 @@ static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group
const char *page,
size_t count)
{
- unsigned long tmp;
- char *p = (char *)page;
+ unsigned int tmp;
+ int rv;
- tmp = simple_strtoul(p, &p, 10);
- if (!p || (*p && (*p != '\n')))
- return -EINVAL;
+ rv = kstrtouint(page, 10, &tmp);
+ if (rv < 0)
+ return rv;
/* this will validate ranges for us. */
o2hb_dead_threshold_set((unsigned int) tmp);
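The heartbeat hunks above swap the struct timeval pair and do_gettimeofday() for a ktime_t pair and ktime_ms_delta(), so the elapsed time comes back as one 64-bit millisecond delta. A minimal userspace sketch of the same measurement pattern (not the kernel ktime API itself):

#include <stdio.h>
#include <time.h>

/* Take two timestamps and report the elapsed milliseconds as a single
 * 64-bit delta instead of juggling seconds/microseconds by hand. */
static long long elapsed_ms(struct timespec start, struct timespec end)
{
        return (end.tv_sec - start.tv_sec) * 1000LL +
               (end.tv_nsec - start.tv_nsec) / 1000000LL;
}

int main(void)
{
        struct timespec before_hb, after_hb;

        clock_gettime(CLOCK_REALTIME, &before_hb);
        /* ... the timed disk heartbeat would run here ... */
        clock_gettime(CLOCK_REALTIME, &after_hb);

        printf("msec = %lld\n", elapsed_ms(before_hb, after_hb));
        return 0;
}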
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 441c84e169e6..0381ada38534 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -195,13 +195,12 @@ static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page,
size_t count)
{
struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
- unsigned long tmp;
- char *p = (char *)page;
-
- tmp = simple_strtoul(p, &p, 0);
- if (!p || (*p && (*p != '\n')))
- return -EINVAL;
+ unsigned int tmp;
+ int rv;
+ rv = parse_integer(page, 0, &tmp);
+ if (rv < 0)
+ return rv;
if (tmp >= O2NM_MAX_NODES)
return -ERANGE;
@@ -215,16 +214,15 @@ static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page,
write_lock(&cluster->cl_nodes_lock);
if (cluster->cl_nodes[tmp])
- p = NULL;
+ rv = -EEXIST;
else {
cluster->cl_nodes[tmp] = node;
node->nd_num = tmp;
set_bit(tmp, cluster->cl_nodes_bitmap);
}
write_unlock(&cluster->cl_nodes_lock);
- if (p == NULL)
- return -EEXIST;
-
+ if (rv < 0)
+ return rv;
return count;
}
static ssize_t o2nm_node_ipv4_port_read(struct o2nm_node *node, char *page)
@@ -235,13 +233,12 @@ static ssize_t o2nm_node_ipv4_port_read(struct o2nm_node *node, char *page)
static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node,
const char *page, size_t count)
{
- unsigned long tmp;
- char *p = (char *)page;
-
- tmp = simple_strtoul(p, &p, 0);
- if (!p || (*p && (*p != '\n')))
- return -EINVAL;
+ u16 tmp;
+ int rv;
+ rv = kstrtou16(page, 0, &tmp);
+ if (rv < 0)
+ return rv;
if (tmp == 0)
return -EINVAL;
if (tmp >= (u16)-1)
@@ -305,13 +302,11 @@ static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page,
{
struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
unsigned long tmp;
- char *p = (char *)page;
ssize_t ret;
- tmp = simple_strtoul(p, &p, 0);
- if (!p || (*p && (*p != '\n')))
- return -EINVAL;
-
+ ret = kstrtoul(page, 0, &tmp);
+ if (ret < 0)
+ return ret;
tmp = !!tmp; /* boolean of whether this node wants to be local */
/* setting local turns on networking rx for now so we require having
@@ -484,16 +479,15 @@ struct o2nm_cluster_attribute {
static ssize_t o2nm_cluster_attr_write(const char *page, ssize_t count,
unsigned int *val)
{
- unsigned long tmp;
- char *p = (char *)page;
-
- tmp = simple_strtoul(p, &p, 0);
- if (!p || (*p && (*p != '\n')))
- return -EINVAL;
+ unsigned int tmp;
+ int rv;
+ rv = kstrtouint(page, 0, &tmp);
+ if (rv < 0)
+ return rv;
if (tmp == 0)
return -EINVAL;
- if (tmp >= (u32)-1)
+ if (tmp >= (unsigned int)-1)
return -ERANGE;
*val = tmp;
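Most of the configfs store hunks above follow one pattern: replace an open-coded simple_strtoul() plus end-pointer check with a kstrto*() call that rejects trailing garbage and overflow, then range-check the parsed value. A userspace sketch of that stricter conversion, assuming kstrtouint()-like semantics (a single trailing newline is tolerated, anything else fails):

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/* Strict string-to-uint conversion: fail on empty input, trailing junk
 * or overflow instead of silently accepting a partial parse. */
static int parse_uint(const char *s, int base, unsigned int *out)
{
        char *end;
        unsigned long val;

        errno = 0;
        val = strtoul(s, &end, base);
        if (end == s)
                return -EINVAL;         /* nothing was parsed */
        if (*end == '\n')
                end++;                  /* allow "123\n" from sysfs writes */
        if (*end != '\0')
                return -EINVAL;         /* trailing garbage */
        if (errno == ERANGE || val > UINT_MAX)
                return -ERANGE;

        *out = (unsigned int)val;
        return 0;
}

int main(void)
{
        unsigned int v;

        printf("%d\n", parse_uint("42\n", 0, &v));    /* 0, v == 42 */
        printf("%d\n", parse_uint("42abc", 0, &v));   /* -EINVAL */
        return 0;
}

The per-caller range checks (node count, block size, port number) still follow the conversion, exactly as in the hunks above.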
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 02878a83f0b4..ffecf89c8c1c 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -480,33 +480,26 @@ static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh)
trailer = ocfs2_trailer_from_bh(bh, dir->i_sb);
if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
- rc = -EINVAL;
- ocfs2_error(dir->i_sb,
- "Invalid dirblock #%llu: "
- "signature = %.*s\n",
- (unsigned long long)bh->b_blocknr, 7,
- trailer->db_signature);
+ rc = ocfs2_error(dir->i_sb,
+ "Invalid dirblock #%llu: signature = %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ trailer->db_signature);
goto out;
}
if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) {
- rc = -EINVAL;
- ocfs2_error(dir->i_sb,
- "Directory block #%llu has an invalid "
- "db_blkno of %llu",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)le64_to_cpu(trailer->db_blkno));
+ rc = ocfs2_error(dir->i_sb,
+ "Directory block #%llu has an invalid db_blkno of %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(trailer->db_blkno));
goto out;
}
if (le64_to_cpu(trailer->db_parent_dinode) !=
OCFS2_I(dir)->ip_blkno) {
- rc = -EINVAL;
- ocfs2_error(dir->i_sb,
- "Directory block #%llu on dinode "
- "#%llu has an invalid parent_dinode "
- "of %llu",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)OCFS2_I(dir)->ip_blkno,
- (unsigned long long)le64_to_cpu(trailer->db_blkno));
+ rc = ocfs2_error(dir->i_sb,
+ "Directory block #%llu on dinode #%llu has an invalid parent_dinode of %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)OCFS2_I(dir)->ip_blkno,
+ (unsigned long long)le64_to_cpu(trailer->db_blkno));
goto out;
}
out:
@@ -604,14 +597,13 @@ static int ocfs2_validate_dx_root(struct super_block *sb,
}
if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) {
- ocfs2_error(sb,
- "Dir Index Root # %llu has bad signature %.*s",
- (unsigned long long)le64_to_cpu(dx_root->dr_blkno),
- 7, dx_root->dr_signature);
- return -EINVAL;
+ ret = ocfs2_error(sb,
+ "Dir Index Root # %llu has bad signature %.*s\n",
+ (unsigned long long)le64_to_cpu(dx_root->dr_blkno),
+ 7, dx_root->dr_signature);
}
- return 0;
+ return ret;
}
static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di,
@@ -648,12 +640,11 @@ static int ocfs2_validate_dx_leaf(struct super_block *sb,
}
if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) {
- ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s",
- 7, dx_leaf->dl_signature);
- return -EROFS;
+ ret = ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s\n",
+ 7, dx_leaf->dl_signature);
}
- return 0;
+ return ret;
}
static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno,
@@ -812,11 +803,10 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
el = &eb->h_list;
if (el->l_tree_depth) {
- ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "btree tree block %llu\n", inode->i_ino,
- (unsigned long long)eb_bh->b_blocknr);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %lu has non zero tree depth in btree tree block %llu\n",
+ inode->i_ino,
+ (unsigned long long)eb_bh->b_blocknr);
goto out;
}
}
@@ -832,11 +822,11 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
}
if (!found) {
- ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
- "record (%u, %u, 0) in btree", inode->i_ino,
- le32_to_cpu(rec->e_cpos),
- ocfs2_rec_clusters(el, rec));
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %lu has bad extent record (%u, %u, 0) in btree\n",
+ inode->i_ino,
+ le32_to_cpu(rec->e_cpos),
+ ocfs2_rec_clusters(el, rec));
goto out;
}
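The dir.c hunks assign the return value of ocfs2_error() directly to rc/ret, which presumes ocfs2_error() was converted elsewhere in this series to log the message and hand back an errno in one step. A userspace sketch of that log-and-return shape; the helper name and the -EROFS value are illustrative assumptions, not the ocfs2 definition:

#include <errno.h>
#include <stdarg.h>
#include <stdio.h>

/* Log an error message and return the errno the caller should propagate,
 * so call sites collapse to "rc = report_error(...); goto out;". */
static int report_error(const char *fmt, ...)
{
        va_list ap;

        va_start(ap, fmt);
        vfprintf(stderr, fmt, ap);
        va_end(ap);

        return -EROFS;
}

int main(void)
{
        int rc = report_error("Invalid dirblock #%llu\n", 1234ULL);

        printf("rc = %d\n", rc);   /* negative errno, here -EROFS */
        return 0;
}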
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index fdf4b41d0609..46b8b2bbc95a 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -498,16 +498,6 @@ static void dlm_lockres_release(struct kref *kref)
mlog(0, "destroying lockres %.*s\n", res->lockname.len,
res->lockname.name);
- spin_lock(&dlm->track_lock);
- if (!list_empty(&res->tracking))
- list_del_init(&res->tracking);
- else {
- mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
- res->lockname.len, res->lockname.name);
- dlm_print_one_lock_resource(res);
- }
- spin_unlock(&dlm->track_lock);
-
atomic_dec(&dlm->res_cur_count);
if (!hlist_unhashed(&res->hash_node) ||
@@ -795,8 +785,18 @@ lookup:
dlm_lockres_grab_inflight_ref(dlm, tmpres);
spin_unlock(&tmpres->spinlock);
- if (res)
+ if (res) {
+ spin_lock(&dlm->track_lock);
+ if (!list_empty(&res->tracking))
+ list_del_init(&res->tracking);
+ else
+ mlog(ML_ERROR, "Resource %.*s not "
+ "on the Tracking list\n",
+ res->lockname.len,
+ res->lockname.name);
+ spin_unlock(&dlm->track_lock);
dlm_lockres_put(res);
+ }
res = tmpres;
goto leave;
}
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 69aac6f088ad..2e5e6d5fffe8 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -211,6 +211,16 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
__dlm_unhash_lockres(dlm, res);
+ spin_lock(&dlm->track_lock);
+ if (!list_empty(&res->tracking))
+ list_del_init(&res->tracking);
+ else {
+ mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
+ res->lockname.len, res->lockname.name);
+ __dlm_print_one_lock_resource(res);
+ }
+ spin_unlock(&dlm->track_lock);
+
/* lockres is not in the hash now. drop the flag and wake up
* any processes waiting in dlm_get_lock_resource. */
if (!master) {
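The two dlm hunks above move the tracking-list removal out of dlm_lockres_release() and into the purge/lookup paths, always under dlm->track_lock, and warn instead of assuming the resource is still queued. A small userspace sketch of that unlink-under-lock idea, with a pthread mutex standing in for the spinlock:

#include <pthread.h>
#include <stdio.h>

struct node {
        struct node *prev, *next;      /* doubly linked, list_head style */
};

static pthread_mutex_t track_lock = PTHREAD_MUTEX_INITIALIZER;

static int list_empty(const struct node *n)
{
        return n->next == n;           /* self-linked == not tracked */
}

static void list_del_init(struct node *n)
{
        n->prev->next = n->next;
        n->next->prev = n->prev;
        n->next = n->prev = n;
}

/* Unlink a resource from the tracking list only if it is still queued,
 * and complain loudly if it is not. */
static void untrack(struct node *res, const char *name)
{
        pthread_mutex_lock(&track_lock);
        if (!list_empty(res))
                list_del_init(res);
        else
                fprintf(stderr, "resource %s not on the tracking list\n", name);
        pthread_mutex_unlock(&track_lock);
}

int main(void)
{
        struct node res = { &res, &res };   /* never added to the list */

        untrack(&res, "lockres");           /* prints the warning */
        return 0;
}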
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 23157e40dd74..1c91103c1333 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3035,8 +3035,6 @@ local:
ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb);
osb->cconn = conn;
-
- status = 0;
bail:
if (status < 0) {
ocfs2_dlm_shutdown_debug(osb);
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 767370b656ca..e4719e0a3f99 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -305,8 +305,8 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
if (el->l_tree_depth) {
ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "leaf block %llu\n", inode->i_ino,
+ "Inode %lu has non zero tree depth in leaf block %llu\n",
+ inode->i_ino,
(unsigned long long)eb_bh->b_blocknr);
ret = -EROFS;
goto out;
@@ -441,8 +441,8 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
if (el->l_tree_depth) {
ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "leaf block %llu\n", inode->i_ino,
+ "Inode %lu has non zero tree depth in leaf block %llu\n",
+ inode->i_ino,
(unsigned long long)eb_bh->b_blocknr);
ret = -EROFS;
goto out;
@@ -475,8 +475,9 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
if (!rec->e_blkno) {
- ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
- "record (%u, %u, 0)", inode->i_ino,
+ ocfs2_error(inode->i_sb,
+ "Inode %lu has bad extent record (%u, %u, 0)\n",
+ inode->i_ino,
le32_to_cpu(rec->e_cpos),
ocfs2_rec_clusters(el, rec));
ret = -EROFS;
@@ -564,8 +565,8 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
if (el->l_tree_depth) {
ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "xattr leaf block %llu\n", inode->i_ino,
+ "Inode %lu has non zero tree depth in xattr leaf block %llu\n",
+ inode->i_ino,
(unsigned long long)eb_bh->b_blocknr);
ret = -EROFS;
goto out;
@@ -582,8 +583,9 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
if (!rec->e_blkno) {
- ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
- "record (%u, %u, 0) in xattr", inode->i_ino,
+ ocfs2_error(inode->i_sb,
+ "Inode %lu has bad extent record (%u, %u, 0) in xattr\n",
+ inode->i_ino,
le32_to_cpu(rec->e_cpos),
ocfs2_rec_clusters(el, rec));
ret = -EROFS;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 7210583b472f..373a34f97452 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1130,6 +1130,7 @@ out:
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
{
int status = 0, size_change;
+ int inode_locked = 0;
struct inode *inode = d_inode(dentry);
struct super_block *sb = inode->i_sb;
struct ocfs2_super *osb = OCFS2_SB(sb);
@@ -1178,6 +1179,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
mlog_errno(status);
goto bail_unlock_rw;
}
+ inode_locked = 1;
if (size_change) {
status = inode_newsize_ok(inode, attr->ia_size);
@@ -1258,7 +1260,10 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
bail_commit:
ocfs2_commit_trans(osb, handle);
bail_unlock:
- ocfs2_inode_unlock(inode, 1);
+ if (status) {
+ ocfs2_inode_unlock(inode, 1);
+ inode_locked = 0;
+ }
bail_unlock_rw:
if (size_change)
ocfs2_rw_unlock(inode, 1);
@@ -1274,6 +1279,8 @@ bail:
if (status < 0)
mlog_errno(status);
}
+ if (inode_locked)
+ ocfs2_inode_unlock(inode, 1);
return status;
}
@@ -2262,8 +2269,6 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
ssize_t written = 0;
ssize_t ret;
size_t count = iov_iter_count(from), orig_count;
- loff_t old_size;
- u32 old_clusters;
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -2271,6 +2276,8 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
OCFS2_MOUNT_COHERENCY_BUFFERED);
int unaligned_dio = 0;
int dropped_dio = 0;
+ int append_write = ((iocb->ki_pos + count) >=
+ i_size_read(inode) ? 1 : 0);
trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -2290,8 +2297,9 @@ relock:
/*
* Concurrent O_DIRECT writes are allowed with
* mount_option "coherency=buffered".
+ * For append write, we must take rw EX.
*/
- rw_level = (!direct_io || full_coherency);
+ rw_level = (!direct_io || full_coherency || append_write);
ret = ocfs2_rw_lock(inode, rw_level);
if (ret < 0) {
@@ -2364,13 +2372,6 @@ relock:
ocfs2_iocb_set_unaligned_aio(iocb);
}
- /*
- * To later detect whether a journal commit for sync writes is
- * necessary, we sample i_size, and cluster count here.
- */
- old_size = i_size_read(inode);
- old_clusters = OCFS2_I(inode)->ip_clusters;
-
/* communicate with ocfs2_dio_end_io */
ocfs2_iocb_set_rw_locked(iocb, rw_level);
@@ -2416,7 +2417,7 @@ no_sync:
unaligned_dio = 0;
}
- if (unaligned_dio) {
+ if (unaligned_dio && ocfs2_iocb_is_unaligned_aio(iocb)) {
ocfs2_iocb_clear_unaligned_aio(iocb);
mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
}
diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c
new file mode 100644
index 000000000000..a492e5586df9
--- /dev/null
+++ b/fs/ocfs2/filecheck.c
@@ -0,0 +1,571 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * filecheck.c
+ *
+ * Code which implements online file check.
+ *
+ * Copyright (C) 2007, 2009 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/kmod.h>
+#include <linux/fs.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/sysctl.h>
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "ocfs2_fs.h"
+#include "stackglue.h"
+#include "inode.h"
+
+#include "filecheck.h"
+
+
+/* File check error strings,
+ * must correspond with error number in header file.
+ */
+static const char * const ocfs2_filecheck_errs[] = {
+ "SUCCESS",
+ "FAILED",
+ "INPROGRESS",
+ "READONLY",
+ "INVALIDINO",
+ "BLOCKECC",
+ "BLOCKNO",
+ "VALIDFLAG",
+ "GENERATION",
+ "UNSUPPORTED"
+};
+
+static DEFINE_SPINLOCK(ocfs2_filecheck_sysfs_lock);
+static LIST_HEAD(ocfs2_filecheck_sysfs_list);
+
+struct ocfs2_filecheck {
+ struct list_head fc_head; /* File check entry list head */
+ spinlock_t fc_lock;
+ unsigned int fc_max; /* Maximum number of entry in list */
+ unsigned int fc_size; /* Current entry count in list */
+ unsigned int fc_done; /* File check entries are done in list */
+};
+
+struct ocfs2_filecheck_sysfs_entry {
+ struct list_head fs_list;
+ atomic_t fs_count;
+ struct super_block *fs_sb;
+ struct kset *fs_kset;
+ struct ocfs2_filecheck *fs_fcheck;
+};
+
+#define OCFS2_FILECHECK_MAXSIZE 100
+#define OCFS2_FILECHECK_MINSIZE 10
+
+/* File check operation type */
+enum {
+ OCFS2_FILECHECK_TYPE_CHK = 0, /* Check a file */
+ OCFS2_FILECHECK_TYPE_FIX, /* Fix a file */
+ OCFS2_FILECHECK_TYPE_SET = 100 /* Set file check options */
+};
+
+struct ocfs2_filecheck_entry {
+ struct list_head fe_list;
+ unsigned long fe_ino;
+ unsigned int fe_type;
+ unsigned short fe_done:1;
+ unsigned short fe_status:15;
+};
+
+struct ocfs2_filecheck_args {
+ unsigned int fa_type;
+ union {
+ unsigned long fa_ino;
+ unsigned int fa_len;
+ };
+};
+
+static const char *
+ocfs2_filecheck_error(int errno)
+{
+ if (!errno)
+ return ocfs2_filecheck_errs[errno];
+
+ BUG_ON(errno < OCFS2_FILECHECK_ERR_START ||
+ errno > OCFS2_FILECHECK_ERR_END);
+ return ocfs2_filecheck_errs[errno - OCFS2_FILECHECK_ERR_START + 1];
+}
+
+static ssize_t ocfs2_filecheck_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf);
+static ssize_t ocfs2_filecheck_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count);
+static struct kobj_attribute ocfs2_attr_filecheck =
+ __ATTR(filecheck, S_IRUSR | S_IWUSR,
+ ocfs2_filecheck_show,
+ ocfs2_filecheck_store);
+
+static int ocfs2_filecheck_sysfs_wait(atomic_t *p)
+{
+ schedule();
+ return 0;
+}
+
+static void
+ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry)
+{
+ struct ocfs2_filecheck_entry *p;
+
+ if (!atomic_dec_and_test(&entry->fs_count))
+ wait_on_atomic_t(&entry->fs_count, ocfs2_filecheck_sysfs_wait,
+ TASK_UNINTERRUPTIBLE);
+
+ spin_lock(&entry->fs_fcheck->fc_lock);
+ while (!list_empty(&entry->fs_fcheck->fc_head)) {
+ p = list_first_entry(&entry->fs_fcheck->fc_head,
+ struct ocfs2_filecheck_entry, fe_list);
+ list_del(&p->fe_list);
+ BUG_ON(!p->fe_done); /* Never free an undone file check entry */
+ kfree(p);
+ }
+ spin_unlock(&entry->fs_fcheck->fc_lock);
+
+ kset_unregister(entry->fs_kset);
+ kfree(entry->fs_fcheck);
+ kfree(entry);
+}
+
+static void
+ocfs2_filecheck_sysfs_add(struct ocfs2_filecheck_sysfs_entry *entry)
+{
+ spin_lock(&ocfs2_filecheck_sysfs_lock);
+ list_add_tail(&entry->fs_list, &ocfs2_filecheck_sysfs_list);
+ spin_unlock(&ocfs2_filecheck_sysfs_lock);
+}
+
+static int ocfs2_filecheck_sysfs_del(const char *devname)
+{
+ struct ocfs2_filecheck_sysfs_entry *p;
+
+ spin_lock(&ocfs2_filecheck_sysfs_lock);
+ list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) {
+ if (!strcmp(p->fs_sb->s_id, devname)) {
+ list_del(&p->fs_list);
+ spin_unlock(&ocfs2_filecheck_sysfs_lock);
+ ocfs2_filecheck_sysfs_free(p);
+ return 0;
+ }
+ }
+ spin_unlock(&ocfs2_filecheck_sysfs_lock);
+ return 1;
+}
+
+static void
+ocfs2_filecheck_sysfs_put(struct ocfs2_filecheck_sysfs_entry *entry)
+{
+ if (atomic_dec_and_test(&entry->fs_count))
+ wake_up_atomic_t(&entry->fs_count);
+}
+
+static struct ocfs2_filecheck_sysfs_entry *
+ocfs2_filecheck_sysfs_get(const char *devname)
+{
+ struct ocfs2_filecheck_sysfs_entry *p = NULL;
+
+ spin_lock(&ocfs2_filecheck_sysfs_lock);
+ list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) {
+ if (!strcmp(p->fs_sb->s_id, devname)) {
+ atomic_inc(&p->fs_count);
+ spin_unlock(&ocfs2_filecheck_sysfs_lock);
+ return p;
+ }
+ }
+ spin_unlock(&ocfs2_filecheck_sysfs_lock);
+ return NULL;
+}
+
+int ocfs2_filecheck_create_sysfs(struct super_block *sb)
+{
+ int ret = 0;
+ struct kset *ocfs2_filecheck_kset = NULL;
+ struct ocfs2_filecheck *fcheck = NULL;
+ struct ocfs2_filecheck_sysfs_entry *entry = NULL;
+ struct attribute **attrs = NULL;
+ struct attribute_group attrgp;
+
+ if (!ocfs2_kset)
+ return -ENOMEM;
+
+ attrs = kmalloc(sizeof(struct attribute *) * 2, GFP_NOFS);
+ if (!attrs) {
+ ret = -ENOMEM;
+ goto error;
+ } else {
+ attrs[0] = &ocfs2_attr_filecheck.attr;
+ attrs[1] = NULL;
+ memset(&attrgp, 0, sizeof(attrgp));
+ attrgp.attrs = attrs;
+ }
+
+ fcheck = kmalloc(sizeof(struct ocfs2_filecheck), GFP_NOFS);
+ if (!fcheck) {
+ ret = -ENOMEM;
+ goto error;
+ } else {
+ INIT_LIST_HEAD(&fcheck->fc_head);
+ spin_lock_init(&fcheck->fc_lock);
+ fcheck->fc_max = OCFS2_FILECHECK_MINSIZE;
+ fcheck->fc_size = 0;
+ fcheck->fc_done = 0;
+ }
+
+ if (strlen(sb->s_id) <= 0) {
+ mlog(ML_ERROR,
+ "Cannot get device basename when create filecheck sysfs\n");
+ ret = -ENODEV;
+ goto error;
+ }
+
+ ocfs2_filecheck_kset = kset_create_and_add(sb->s_id, NULL,
+ &ocfs2_kset->kobj);
+ if (!ocfs2_filecheck_kset) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ ret = sysfs_create_group(&ocfs2_filecheck_kset->kobj, &attrgp);
+ if (ret)
+ goto error;
+
+ entry = kmalloc(sizeof(struct ocfs2_filecheck_sysfs_entry), GFP_NOFS);
+ if (!entry) {
+ ret = -ENOMEM;
+ goto error;
+ } else {
+ atomic_set(&entry->fs_count, 1);
+ entry->fs_sb = sb;
+ entry->fs_kset = ocfs2_filecheck_kset;
+ entry->fs_fcheck = fcheck;
+ ocfs2_filecheck_sysfs_add(entry);
+ }
+
+ kfree(attrs);
+ return 0;
+
+error:
+ kfree(attrs);
+ kfree(entry);
+ kfree(fcheck);
+ kset_unregister(ocfs2_filecheck_kset);
+ return ret;
+}
+
+int ocfs2_filecheck_remove_sysfs(struct super_block *sb)
+{
+ return ocfs2_filecheck_sysfs_del(sb->s_id);
+}
+
+static int
+ocfs2_filecheck_erase_entries(struct ocfs2_filecheck_sysfs_entry *ent,
+ unsigned int count);
+static int
+ocfs2_filecheck_adjust_max(struct ocfs2_filecheck_sysfs_entry *ent,
+ unsigned int len)
+{
+ int ret;
+
+ if ((len < OCFS2_FILECHECK_MINSIZE) || (len > OCFS2_FILECHECK_MAXSIZE))
+ return -EINVAL;
+
+ spin_lock(&ent->fs_fcheck->fc_lock);
+ if (len < (ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done)) {
+ mlog(ML_ERROR,
+ "Cannot set online file check maximum entry number "
+ "to %u due to too many pending entries(%u)\n",
+ len, ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done);
+ ret = -EBUSY;
+ } else {
+ if (len < ent->fs_fcheck->fc_size)
+ BUG_ON(!ocfs2_filecheck_erase_entries(ent,
+ ent->fs_fcheck->fc_size - len));
+
+ ent->fs_fcheck->fc_max = len;
+ ret = 0;
+ }
+ spin_unlock(&ent->fs_fcheck->fc_lock);
+
+ return ret;
+}
+
+#define OCFS2_FILECHECK_ARGS_LEN 32
+static int
+ocfs2_filecheck_args_get_long(const char *buf, size_t count,
+ unsigned long *val)
+{
+ char buffer[OCFS2_FILECHECK_ARGS_LEN];
+
+ if (count < 1)
+ return 1;
+
+ memcpy(buffer, buf, count);
+ buffer[count] = '\0';
+
+ if (kstrtoul(buffer, 0, val))
+ return 1;
+
+ return 0;
+}
+
+static int
+ocfs2_filecheck_args_parse(const char *buf, size_t count,
+ struct ocfs2_filecheck_args *args)
+{
+ unsigned long val = 0;
+
+ /* too short/long args length */
+ if ((count < 5) || (count > OCFS2_FILECHECK_ARGS_LEN))
+ return 1;
+
+ if ((strncmp(buf, "FIX ", 4) == 0) ||
+ (strncmp(buf, "fix ", 4) == 0)) {
+ if (ocfs2_filecheck_args_get_long(buf + 4, count - 4, &val))
+ return 1;
+
+ args->fa_type = OCFS2_FILECHECK_TYPE_FIX;
+ args->fa_ino = val;
+ return 0;
+ } else if ((strncmp(buf, "CHECK ", 6) == 0) ||
+ (strncmp(buf, "check ", 6) == 0)) {
+ if (ocfs2_filecheck_args_get_long(buf + 6, count - 6, &val))
+ return 1;
+
+ args->fa_type = OCFS2_FILECHECK_TYPE_CHK;
+ args->fa_ino = val;
+ return 0;
+ } else if ((strncmp(buf, "SET ", 4) == 0) ||
+ (strncmp(buf, "set ", 4) == 0)) {
+ if (ocfs2_filecheck_args_get_long(buf + 4, count - 4, &val))
+ return 1;
+
+ args->fa_type = OCFS2_FILECHECK_TYPE_SET;
+ args->fa_len = (unsigned int)val;
+ return 0;
+ } else { /* invalid args */
+ return 1;
+ }
+}
+
+static ssize_t ocfs2_filecheck_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+
+ ssize_t ret = 0, total = 0, remain = PAGE_SIZE;
+ struct ocfs2_filecheck_entry *p;
+ struct ocfs2_filecheck_sysfs_entry *ent;
+
+ ent = ocfs2_filecheck_sysfs_get(kobj->name);
+ if (!ent) {
+ mlog(ML_ERROR,
+ "Cannot get the corresponding entry via device basename %s\n",
+ kobj->name);
+ return -ENODEV;
+ }
+
+ spin_lock(&ent->fs_fcheck->fc_lock);
+ ret = snprintf(buf, remain, "INO\t\tTYPE\tDONE\tERROR\n");
+ total += ret;
+ remain -= ret;
+
+ list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) {
+ ret = snprintf(buf + total, remain, "%lu\t\t%u\t%u\t%s\n",
+ p->fe_ino, p->fe_type, p->fe_done,
+ ocfs2_filecheck_error(p->fe_status));
+ if (ret < 0) {
+ total = ret;
+ break;
+ }
+ if (ret == remain) {
+ /* snprintf() didn't fit */
+ total = -E2BIG;
+ break;
+ }
+ total += ret;
+ remain -= ret;
+ }
+ spin_unlock(&ent->fs_fcheck->fc_lock);
+
+ ocfs2_filecheck_sysfs_put(ent);
+ return total;
+}
+
+static int
+ocfs2_filecheck_erase_entry(struct ocfs2_filecheck_sysfs_entry *ent)
+{
+ struct ocfs2_filecheck_entry *p;
+
+ list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) {
+ if (p->fe_done) {
+ list_del(&p->fe_list);
+ kfree(p);
+ ent->fs_fcheck->fc_size--;
+ ent->fs_fcheck->fc_done--;
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static int
+ocfs2_filecheck_erase_entries(struct ocfs2_filecheck_sysfs_entry *ent,
+ unsigned int count)
+{
+ unsigned int i = 0;
+ unsigned int ret = 0;
+
+ while (i++ < count) {
+ if (ocfs2_filecheck_erase_entry(ent))
+ ret++;
+ else
+ break;
+ }
+
+ return (ret == count ? 1 : 0);
+}
+
+static void
+ocfs2_filecheck_done_entry(struct ocfs2_filecheck_sysfs_entry *ent,
+ struct ocfs2_filecheck_entry *entry)
+{
+ entry->fe_done = 1;
+ spin_lock(&ent->fs_fcheck->fc_lock);
+ ent->fs_fcheck->fc_done++;
+ spin_unlock(&ent->fs_fcheck->fc_lock);
+}
+
+static unsigned short
+ocfs2_filecheck_handle(struct super_block *sb,
+ unsigned long ino, unsigned int flags)
+{
+ unsigned short ret = OCFS2_FILECHECK_ERR_SUCCESS;
+ struct inode *inode = NULL;
+ int rc;
+
+ inode = ocfs2_iget(OCFS2_SB(sb), ino, flags, 0);
+ if (IS_ERR(inode)) {
+ rc = (int)(-(long)inode);
+ if (rc >= OCFS2_FILECHECK_ERR_START &&
+ rc < OCFS2_FILECHECK_ERR_END)
+ ret = rc;
+ else
+ ret = OCFS2_FILECHECK_ERR_FAILED;
+ } else
+ iput(inode);
+
+ return ret;
+}
+
+static void
+ocfs2_filecheck_handle_entry(struct ocfs2_filecheck_sysfs_entry *ent,
+ struct ocfs2_filecheck_entry *entry)
+{
+ if (entry->fe_type == OCFS2_FILECHECK_TYPE_CHK)
+ entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb,
+ entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_CHK);
+ else if (entry->fe_type == OCFS2_FILECHECK_TYPE_FIX)
+ entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb,
+ entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_FIX);
+ else
+ entry->fe_status = OCFS2_FILECHECK_ERR_UNSUPPORTED;
+
+ ocfs2_filecheck_done_entry(ent, entry);
+}
+
+static ssize_t ocfs2_filecheck_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct ocfs2_filecheck_args args;
+ struct ocfs2_filecheck_entry *entry;
+ struct ocfs2_filecheck_sysfs_entry *ent;
+ ssize_t ret = 0;
+
+ if (count == 0)
+ return count;
+
+ if (ocfs2_filecheck_args_parse(buf, count, &args)) {
+ mlog(ML_ERROR, "Invalid arguments for online file check\n");
+ return -EINVAL;
+ }
+
+ ent = ocfs2_filecheck_sysfs_get(kobj->name);
+ if (!ent) {
+ mlog(ML_ERROR,
+ "Cannot get the corresponding entry via device basename %s\n",
+ kobj->name);
+ return -ENODEV;
+ }
+
+ if (args.fa_type == OCFS2_FILECHECK_TYPE_SET) {
+ ret = ocfs2_filecheck_adjust_max(ent, args.fa_len);
+ ocfs2_filecheck_sysfs_put(ent);
+ return (!ret ? count : ret);
+ }
+
+ entry = kmalloc(sizeof(*entry), GFP_NOFS);
+ if (!entry) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ spin_lock(&ent->fs_fcheck->fc_lock);
+ if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) &&
+ (ent->fs_fcheck->fc_done == 0)) {
+ mlog(ML_ERROR, "Online file check queue(%u) is full\n",
+ ent->fs_fcheck->fc_max);
+ kfree(entry);
+ entry = NULL;
+ ret = -EBUSY;
+ } else {
+ if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) &&
+ (ent->fs_fcheck->fc_done > 0)) {
+ /* Delete the oldest entry which was done,
+ * make sure the entry size in list does
+ * not exceed maximum value
+ */
+ BUG_ON(!ocfs2_filecheck_erase_entry(ent));
+ }
+
+ entry->fe_ino = args.fa_ino;
+ entry->fe_type = args.fa_type;
+ entry->fe_done = 0;
+ entry->fe_status = OCFS2_FILECHECK_ERR_INPROGRESS;
+ list_add_tail(&entry->fe_list, &ent->fs_fcheck->fc_head);
+
+ ent->fs_fcheck->fc_size++;
+ ret = count;
+ }
+ spin_unlock(&ent->fs_fcheck->fc_lock);
+
+ if (entry)
+ ocfs2_filecheck_handle_entry(ent, entry);
+
+out:
+ ocfs2_filecheck_sysfs_put(ent);
+ return ret;
+}
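filecheck.c above exposes one read/write attribute per device: writes take "CHECK <ino>", "FIX <ino>" or "SET <len>", and reads dump the result table. A minimal userspace sketch of driving it; the sysfs path is an assumption based on the kset being registered under the ocfs2 kset, so substitute the real device basename:

#include <stdio.h>

int main(void)
{
        const char *path = "/sys/fs/ocfs2/sda1/filecheck";   /* assumed path */
        char line[256];
        FILE *f;

        f = fopen(path, "w");
        if (!f)
                return 1;
        fputs("CHECK 12345\n", f);    /* inode number to check */
        fclose(f);

        f = fopen(path, "r");
        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);  /* INO  TYPE  DONE  ERROR */
        fclose(f);
        return 0;
}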
diff --git a/fs/ocfs2/filecheck.h b/fs/ocfs2/filecheck.h
new file mode 100644
index 000000000000..c65fee927387
--- /dev/null
+++ b/fs/ocfs2/filecheck.h
@@ -0,0 +1,48 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * filecheck.h
+ *
+ * Online file check.
+ *
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+
+#ifndef FILECHECK_H
+#define FILECHECK_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+
+
+/* File check errno */
+enum {
+ OCFS2_FILECHECK_ERR_SUCCESS = 0, /* Success */
+ OCFS2_FILECHECK_ERR_FAILED = 1000, /* Other failure */
+ OCFS2_FILECHECK_ERR_INPROGRESS, /* In progress */
+ OCFS2_FILECHECK_ERR_READONLY, /* Read only */
+ OCFS2_FILECHECK_ERR_INVALIDINO, /* Invalid ino */
+ OCFS2_FILECHECK_ERR_BLOCKECC, /* Block ecc */
+ OCFS2_FILECHECK_ERR_BLOCKNO, /* Block number */
+ OCFS2_FILECHECK_ERR_VALIDFLAG, /* Inode valid flag */
+ OCFS2_FILECHECK_ERR_GENERATION, /* Inode generation */
+ OCFS2_FILECHECK_ERR_UNSUPPORTED /* Unsupported */
+};
+
+#define OCFS2_FILECHECK_ERR_START OCFS2_FILECHECK_ERR_FAILED
+#define OCFS2_FILECHECK_ERR_END OCFS2_FILECHECK_ERR_UNSUPPORTED
+
+int ocfs2_filecheck_create_sysfs(struct super_block *sb);
+int ocfs2_filecheck_remove_sysfs(struct super_block *sb);
+
+#endif /* FILECHECK_H */
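The error codes above start at 1000, well away from ordinary errno values, and ocfs2_filecheck_error() folds them back into the string table by subtracting the start of the range and skipping the SUCCESS slot. A standalone sketch of that lookup, with a bounds check added for illustration:

#include <stdio.h>

enum {
        ERR_SUCCESS = 0,
        ERR_START   = 1000,   /* FAILED */
        ERR_END     = 1008,   /* UNSUPPORTED */
};

static const char * const errs[] = {
        "SUCCESS", "FAILED", "INPROGRESS", "READONLY", "INVALIDINO",
        "BLOCKECC", "BLOCKNO", "VALIDFLAG", "GENERATION", "UNSUPPORTED",
};

static const char *err_str(int e)
{
        if (!e)
                return errs[0];
        if (e < ERR_START || e > ERR_END)
                return "UNKNOWN";
        return errs[e - ERR_START + 1];
}

int main(void)
{
        printf("%s %s\n", err_str(0), err_str(1003));   /* SUCCESS INVALIDINO */
        return 0;
}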
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index b254416dc8d9..62a47ad3a8fb 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -53,6 +53,7 @@
#include "xattr.h"
#include "refcounttree.h"
#include "ocfs2_trace.h"
+#include "filecheck.h"
#include "buffer_head_io.h"
@@ -74,6 +75,13 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
struct inode *inode,
struct buffer_head *fe_bh);
+static int ocfs2_filecheck_read_inode_block_full(struct inode *inode,
+ struct buffer_head **bh, int flags, int type);
+static int ocfs2_filecheck_validate_inode_block(struct super_block *sb,
+ struct buffer_head *bh);
+static int ocfs2_filecheck_repair_inode_block(struct super_block *sb,
+ struct buffer_head *bh);
+
void ocfs2_set_inode_flags(struct inode *inode)
{
unsigned int flags = OCFS2_I(inode)->ip_attr;
@@ -127,6 +135,7 @@ struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno)
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
int sysfile_type)
{
+ int rc = 0;
struct inode *inode = NULL;
struct super_block *sb = osb->sb;
struct ocfs2_find_inode_args args;
@@ -161,12 +170,17 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
}
trace_ocfs2_iget5_locked(inode->i_state);
if (inode->i_state & I_NEW) {
- ocfs2_read_locked_inode(inode, &args);
+ rc = ocfs2_read_locked_inode(inode, &args);
unlock_new_inode(inode);
}
if (is_bad_inode(inode)) {
iput(inode);
- inode = ERR_PTR(-ESTALE);
+ if ((flags & OCFS2_FI_FLAG_FILECHECK_CHK) ||
+ (flags & OCFS2_FI_FLAG_FILECHECK_FIX))
+ /* Return OCFS2_FILECHECK_ERR_XXX related errno */
+ inode = ERR_PTR(rc);
+ else
+ inode = ERR_PTR(-ESTALE);
goto bail;
}
@@ -494,16 +508,32 @@ static int ocfs2_read_locked_inode(struct inode *inode,
}
if (can_lock) {
- status = ocfs2_read_inode_block_full(inode, &bh,
- OCFS2_BH_IGNORE_CACHE);
+ if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK)
+ status = ocfs2_filecheck_read_inode_block_full(inode,
+ &bh, OCFS2_BH_IGNORE_CACHE, 0);
+ else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX)
+ status = ocfs2_filecheck_read_inode_block_full(inode,
+ &bh, OCFS2_BH_IGNORE_CACHE, 1);
+ else
+ status = ocfs2_read_inode_block_full(inode,
+ &bh, OCFS2_BH_IGNORE_CACHE);
} else {
status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
/*
* If buffer is in jbd, then its checksum may not have been
* computed as yet.
*/
- if (!status && !buffer_jbd(bh))
- status = ocfs2_validate_inode_block(osb->sb, bh);
+ if (!status && !buffer_jbd(bh)) {
+ if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK)
+ status = ocfs2_filecheck_validate_inode_block(
+ osb->sb, bh);
+ else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX)
+ status = ocfs2_filecheck_repair_inode_block(
+ osb->sb, bh);
+ else
+ status = ocfs2_validate_inode_block(
+ osb->sb, bh);
+ }
}
if (status < 0) {
mlog_errno(status);
@@ -531,6 +561,14 @@ static int ocfs2_read_locked_inode(struct inode *inode,
BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
+ if (buffer_dirty(bh)) {
+ status = ocfs2_write_block(osb, bh, INODE_CACHE(inode));
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
status = 0;
bail:
@@ -971,6 +1009,7 @@ static void ocfs2_delete_inode(struct inode *inode)
int wipe, status;
sigset_t oldset;
struct buffer_head *di_bh = NULL;
+ struct ocfs2_dinode *di = NULL;
trace_ocfs2_delete_inode(inode->i_ino,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -1025,6 +1064,14 @@ static void ocfs2_delete_inode(struct inode *inode)
goto bail_unlock_nfs_sync;
}
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+ /* Skip inode deletion and wait for dio orphan entry recovered
+ * first */
+ if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
+ ocfs2_cleanup_delete_inode(inode, 0);
+ goto bail_unlock_inode;
+ }
+
/* Query the cluster. This will be the final decision made
* before we go ahead and wipe the inode. */
status = ocfs2_query_inode_wipe(inode, di_bh, &wipe);
@@ -1191,17 +1238,19 @@ void ocfs2_evict_inode(struct inode *inode)
int ocfs2_drop_inode(struct inode *inode)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- int res;
trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno,
inode->i_nlink, oi->ip_flags);
- if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
- res = 1;
- else
- res = generic_drop_inode(inode);
+ assert_spin_locked(&inode->i_lock);
+ inode->i_state |= I_WILL_FREE;
+ spin_unlock(&inode->i_lock);
+ write_inode_now(inode, 1);
+ spin_lock(&inode->i_lock);
+ WARN_ON(inode->i_state & I_NEW);
+ inode->i_state &= ~I_WILL_FREE;
- return res;
+ return 1;
}
/*
@@ -1350,32 +1399,32 @@ int ocfs2_validate_inode_block(struct super_block *sb,
rc = -EINVAL;
if (!OCFS2_IS_VALID_DINODE(di)) {
- ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n",
- (unsigned long long)bh->b_blocknr, 7,
- di->i_signature);
+ rc = ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ di->i_signature);
goto bail;
}
if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
- ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)le64_to_cpu(di->i_blkno));
+ rc = ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(di->i_blkno));
goto bail;
}
if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
- ocfs2_error(sb,
- "Invalid dinode #%llu: OCFS2_VALID_FL not set\n",
- (unsigned long long)bh->b_blocknr);
+ rc = ocfs2_error(sb,
+ "Invalid dinode #%llu: OCFS2_VALID_FL not set\n",
+ (unsigned long long)bh->b_blocknr);
goto bail;
}
if (le32_to_cpu(di->i_fs_generation) !=
OCFS2_SB(sb)->fs_generation) {
- ocfs2_error(sb,
- "Invalid dinode #%llu: fs_generation is %u\n",
- (unsigned long long)bh->b_blocknr,
- le32_to_cpu(di->i_fs_generation));
+ rc = ocfs2_error(sb,
+ "Invalid dinode #%llu: fs_generation is %u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(di->i_fs_generation));
goto bail;
}
@@ -1385,6 +1434,152 @@ bail:
return rc;
}
+static int ocfs2_filecheck_validate_inode_block(struct super_block *sb,
+ struct buffer_head *bh)
+{
+ int rc = 0;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
+
+ trace_ocfs2_filecheck_validate_inode_block(
+ (unsigned long long)bh->b_blocknr);
+
+ BUG_ON(!buffer_uptodate(bh));
+
+ if (!OCFS2_IS_VALID_DINODE(di)) {
+ mlog(ML_ERROR,
+ "Filecheck: invalid dinode #%llu: signature = %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7, di->i_signature);
+ rc = -OCFS2_FILECHECK_ERR_INVALIDINO;
+ goto bail;
+ }
+
+ rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check);
+ if (rc) {
+ mlog(ML_ERROR,
+ "Filecheck: checksum failed for dinode %llu\n",
+ (unsigned long long)bh->b_blocknr);
+ rc = -OCFS2_FILECHECK_ERR_BLOCKECC;
+ goto bail;
+ }
+
+ if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
+ mlog(ML_ERROR,
+ "Filecheck: invalid dinode #%llu: i_blkno is %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(di->i_blkno));
+ rc = -OCFS2_FILECHECK_ERR_BLOCKNO;
+ goto bail;
+ }
+
+ if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
+ mlog(ML_ERROR,
+ "Filecheck: invalid dinode #%llu: OCFS2_VALID_FL not set\n",
+ (unsigned long long)bh->b_blocknr);
+ rc = -OCFS2_FILECHECK_ERR_VALIDFLAG;
+ goto bail;
+ }
+
+ if (le32_to_cpu(di->i_fs_generation) !=
+ OCFS2_SB(sb)->fs_generation) {
+ mlog(ML_ERROR,
+ "Filecheck: invalid dinode #%llu: fs_generation is %u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(di->i_fs_generation));
+ rc = -OCFS2_FILECHECK_ERR_GENERATION;
+ goto bail;
+ }
+
+bail:
+ return rc;
+}
+
+static int ocfs2_filecheck_repair_inode_block(struct super_block *sb,
+ struct buffer_head *bh)
+{
+ int rc;
+ int changed = 0;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
+
+ rc = ocfs2_filecheck_validate_inode_block(sb, bh);
+ /* Can't fix invalid inode block */
+ if (!rc || rc == -OCFS2_FILECHECK_ERR_INVALIDINO)
+ return rc;
+
+ trace_ocfs2_filecheck_repair_inode_block(
+ (unsigned long long)bh->b_blocknr);
+
+ if (ocfs2_is_hard_readonly(OCFS2_SB(sb)) ||
+ ocfs2_is_soft_readonly(OCFS2_SB(sb))) {
+ mlog(ML_ERROR,
+ "Filecheck: try to repair dinode #%llu on readonly filesystem\n",
+ (unsigned long long)bh->b_blocknr);
+ return -OCFS2_FILECHECK_ERR_READONLY;
+ }
+
+ if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
+ di->i_blkno = cpu_to_le64(bh->b_blocknr);
+ changed = 1;
+ mlog(ML_ERROR,
+ "Filecheck: reset dinode #%llu: i_blkno to %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(di->i_blkno));
+ }
+
+ if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
+ di->i_flags |= cpu_to_le32(OCFS2_VALID_FL);
+ changed = 1;
+ mlog(ML_ERROR,
+ "Filecheck: reset dinode #%llu: OCFS2_VALID_FL is set\n",
+ (unsigned long long)bh->b_blocknr);
+ }
+
+ if (le32_to_cpu(di->i_fs_generation) !=
+ OCFS2_SB(sb)->fs_generation) {
+ di->i_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
+ changed = 1;
+ mlog(ML_ERROR,
+ "Filecheck: reset dinode #%llu: fs_generation to %u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(di->i_fs_generation));
+ }
+
+ if (changed ||
+ ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check)) {
+ ocfs2_compute_meta_ecc(sb, bh->b_data, &di->i_check);
+ mark_buffer_dirty(bh);
+ mlog(ML_ERROR,
+ "Filecheck: reset dinode #%llu: compute meta ecc\n",
+ (unsigned long long)bh->b_blocknr);
+ }
+
+ return 0;
+}
+
+static int
+ocfs2_filecheck_read_inode_block_full(struct inode *inode,
+ struct buffer_head **bh, int flags, int type)
+{
+ int rc;
+ struct buffer_head *tmp = *bh;
+
+ if (!type) /* Check inode block */
+ rc = ocfs2_read_blocks(INODE_CACHE(inode),
+ OCFS2_I(inode)->ip_blkno,
+ 1, &tmp, flags,
+ ocfs2_filecheck_validate_inode_block);
+ else /* Repair inode block */
+ rc = ocfs2_read_blocks(INODE_CACHE(inode),
+ OCFS2_I(inode)->ip_blkno,
+ 1, &tmp, flags,
+ ocfs2_filecheck_repair_inode_block);
+
+ /* If ocfs2_read_blocks() got us a new bh, pass it up. */
+ if (!rc && !*bh)
+ *bh = tmp;
+
+ return rc;
+}
+
int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
int flags)
{
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 5e86b247c821..2152a72123bc 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -81,8 +81,6 @@ struct ocfs2_inode_info
tid_t i_sync_tid;
tid_t i_datasync_tid;
- wait_queue_head_t append_dio_wq;
-
struct dquot *i_dquot[MAXQUOTAS];
};
@@ -139,6 +137,9 @@ int ocfs2_drop_inode(struct inode *inode);
/* Flags for ocfs2_iget() */
#define OCFS2_FI_FLAG_SYSFILE 0x1
#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2
+#define OCFS2_FI_FLAG_FILECHECK_CHK 0x4
+#define OCFS2_FI_FLAG_FILECHECK_FIX 0x8
+
struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff);
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
int sysfile_type);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 7c099f7032fd..ff82b28462a6 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -374,7 +374,7 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
mlog_errno(PTR_ERR(handle));
if (is_journal_aborted(journal)) {
- ocfs2_abort(osb->sb, "Detected aborted journal");
+ ocfs2_abort(osb->sb, "Detected aborted journal\n");
handle = ERR_PTR(-EROFS);
}
} else {
@@ -668,7 +668,23 @@ static int __ocfs2_journal_access(handle_t *handle,
mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n");
mlog(ML_ERROR, "b_blocknr=%llu\n",
(unsigned long long)bh->b_blocknr);
- BUG();
+
+ lock_buffer(bh);
+ /*
+ * A previous attempt to write this buffer head failed.
+ * Nothing we can do but to retry the write and hope for
+ * the best.
+ */
+ if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) {
+ clear_buffer_write_io_error(bh);
+ set_buffer_uptodate(bh);
+ }
+
+ if (!buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ return -EIO;
+ }
+ unlock_buffer(bh);
}
/* Set the current transaction information on the ci so
@@ -2170,6 +2186,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
iter = oi->ip_next_orphan;
oi->ip_next_orphan = NULL;
+ mutex_lock(&inode->i_mutex);
ret = ocfs2_rw_lock(inode, 1);
if (ret < 0) {
mlog_errno(ret);
@@ -2193,7 +2210,9 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
* ocfs2_delete_inode. */
oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
spin_unlock(&oi->ip_lock);
- } else if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) &&
+ }
+
+ if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) &&
(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
ret = ocfs2_truncate_file(inode, di_bh,
i_size_read(inode));
@@ -2206,17 +2225,16 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0);
if (ret)
mlog_errno(ret);
-
- wake_up(&OCFS2_I(inode)->append_dio_wq);
} /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */
unlock_inode:
ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+ di_bh = NULL;
unlock_rw:
ocfs2_rw_unlock(inode, 1);
next:
+ mutex_unlock(&inode->i_mutex);
iput(inode);
- brelse(di_bh);
- di_bh = NULL;
inode = iter;
}
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 857bbbcd39f3..0a4457fb0711 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -665,8 +665,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
#ifdef CONFIG_OCFS2_DEBUG_FS
if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
ocfs2_local_alloc_count_bits(alloc)) {
- ocfs2_error(osb->sb, "local alloc inode %llu says it has "
- "%u used bits, but a count shows %u",
+ ocfs2_error(osb->sb, "local alloc inode %llu says it has %u used bits, but a count shows %u\n",
(unsigned long long)le64_to_cpu(alloc->i_blkno),
le32_to_cpu(alloc->id1.bitmap1.i_used),
ocfs2_local_alloc_count_bits(alloc));
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 56a768d06aa6..124471d26a73 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -99,11 +99,9 @@ static int __ocfs2_move_extent(handle_t *handle,
index = ocfs2_search_extent_list(el, cpos);
if (index == -1) {
- ocfs2_error(inode->i_sb,
- "Inode %llu has an extent at cpos %u which can no "
- "longer be found.\n",
- (unsigned long long)ino, cpos);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %llu has an extent at cpos %u which can no longer be found\n",
+ (unsigned long long)ino, cpos);
goto out;
}
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 948681e37cfd..eb2667ed4b9b 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1035,11 +1035,6 @@ leave:
if (handle)
ocfs2_commit_trans(osb, handle);
- if (child_locked)
- ocfs2_inode_unlock(inode, 1);
-
- ocfs2_inode_unlock(dir, 1);
-
if (orphan_dir) {
/* This was locked for us in ocfs2_prepare_orphan_dir() */
ocfs2_inode_unlock(orphan_dir, 1);
@@ -1047,6 +1042,11 @@ leave:
iput(orphan_dir);
}
+ if (child_locked)
+ ocfs2_inode_unlock(inode, 1);
+
+ ocfs2_inode_unlock(dir, 1);
+
brelse(fe_bh);
brelse(parent_node_bh);
@@ -1309,6 +1309,15 @@ static int ocfs2_rename(struct inode *old_dir,
}
parents_locked = 1;
+ if (!new_dir->i_nlink) {
+ mlog(ML_ERROR, "new dir %llu has been removed, inode %llu "
+ "can not be moved into it.",
+ (unsigned long long)new_dir->i_ino,
+ (unsigned long long)old_inode->i_ino);
+ status = -EACCES;
+ goto bail;
+ }
+
/* make sure both dirs have bhs
* get an extra ref on old_dir_bh if old==new */
if (!new_dir_bh) {
@@ -1569,12 +1578,25 @@ static int ocfs2_rename(struct inode *old_dir,
status = ocfs2_find_entry(old_dentry->d_name.name,
old_dentry->d_name.len, old_dir,
&old_entry_lookup);
- if (status)
+ if (status) {
+ if (!is_journal_aborted(osb->journal->j_journal)) {
+ ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s "
+ "is not deleted.",
+ new_dentry->d_name.len, new_dentry->d_name.name,
+ old_dentry->d_name.len, old_dentry->d_name.name);
+ }
goto bail;
+ }
status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup);
if (status < 0) {
mlog_errno(status);
+ if (!is_journal_aborted(osb->journal->j_journal)) {
+ ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s "
+ "is not deleted.",
+ new_dentry->d_name.len, new_dentry->d_name.name,
+ old_dentry->d_name.len, old_dentry->d_name.name);
+ }
goto bail;
}
@@ -1633,21 +1655,9 @@ static int ocfs2_rename(struct inode *old_dir,
ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
status = 0;
bail:
- if (rename_lock)
- ocfs2_rename_unlock(osb);
-
if (handle)
ocfs2_commit_trans(osb, handle);
- if (parents_locked)
- ocfs2_double_unlock(old_dir, new_dir);
-
- if (old_child_locked)
- ocfs2_inode_unlock(old_inode, 1);
-
- if (new_child_locked)
- ocfs2_inode_unlock(new_inode, 1);
-
if (orphan_dir) {
/* This was locked for us in ocfs2_prepare_orphan_dir() */
ocfs2_inode_unlock(orphan_dir, 1);
@@ -1655,6 +1665,18 @@ bail:
iput(orphan_dir);
}
+ if (new_child_locked)
+ ocfs2_inode_unlock(new_inode, 1);
+
+ if (old_child_locked)
+ ocfs2_inode_unlock(old_inode, 1);
+
+ if (parents_locked)
+ ocfs2_double_unlock(old_dir, new_dir);
+
+ if (rename_lock)
+ ocfs2_rename_unlock(osb);
+
if (new_inode)
sync_mapping_buffers(old_inode->i_mapping);
@@ -2601,27 +2623,6 @@ leave:
return status;
}
-static int ocfs2_dio_orphan_recovered(struct inode *inode)
-{
- int ret;
- struct buffer_head *di_bh = NULL;
- struct ocfs2_dinode *di = NULL;
-
- ret = ocfs2_inode_lock(inode, &di_bh, 1);
- if (ret < 0) {
- mlog_errno(ret);
- return 0;
- }
-
- di = (struct ocfs2_dinode *) di_bh->b_data;
- ret = !(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL));
- ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
-
- return ret;
-}
-
-#define OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL 10000
int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
struct inode *inode)
{
@@ -2633,7 +2634,6 @@ int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
handle_t *handle = NULL;
struct ocfs2_dinode *di = NULL;
-restart:
status = ocfs2_inode_lock(inode, &di_bh, 1);
if (status < 0) {
mlog_errno(status);
@@ -2643,15 +2643,21 @@ restart:
di = (struct ocfs2_dinode *) di_bh->b_data;
/*
* Another append dio crashed?
- * If so, wait for recovery first.
+ * If so, manually recover it first.
*/
if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
- ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
- wait_event_interruptible_timeout(OCFS2_I(inode)->append_dio_wq,
- ocfs2_dio_orphan_recovered(inode),
- msecs_to_jiffies(OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL));
- goto restart;
+ status = ocfs2_truncate_file(inode, di_bh, i_size_read(inode));
+ if (status < 0) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto bail_unlock_inode;
+ }
+
+ status = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail_unlock_inode;
+ }
}
status = ocfs2_prepare_orphan_dir(osb, &orphan_dir_inode,
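The namei.c cleanup hunks reorder the unwind paths in unlink and rename so locks are dropped strictly in the reverse of the order they were taken (child before parents, parents before the rename lock). A minimal sketch of that ordering idea, using pthread mutexes rather than the ocfs2 cluster locks:

#include <pthread.h>

static pthread_mutex_t rename_lock  = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t parents_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t child_lock   = PTHREAD_MUTEX_INITIALIZER;

static int do_rename(void)
{
        pthread_mutex_lock(&rename_lock);      /* outermost first */
        pthread_mutex_lock(&parents_lock);
        pthread_mutex_lock(&child_lock);

        /* ... the actual rename work would run here ... */

        pthread_mutex_unlock(&child_lock);     /* innermost first */
        pthread_mutex_unlock(&parents_lock);
        pthread_mutex_unlock(&rename_lock);    /* outermost last */
        return 0;
}

int main(void)
{
        return do_rename();
}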
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 690ddc60189b..7a0126267847 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -286,6 +286,8 @@ enum ocfs2_mount_options
OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */
+ OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */
+ OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */
};
#define OCFS2_OSB_SOFT_RO 0x0001
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 6cb019b7c6a8..d9205e07aaef 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -1540,6 +1540,8 @@ DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_read_locked_inode);
DEFINE_OCFS2_INT_INT_EVENT(ocfs2_check_orphan_recovery_state);
DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_inode_block);
+DEFINE_OCFS2_ULL_EVENT(ocfs2_filecheck_validate_inode_block);
+DEFINE_OCFS2_ULL_EVENT(ocfs2_filecheck_repair_inode_block);
TRACE_EVENT(ocfs2_inode_is_valid_to_delete,
TP_PROTO(void *task, void *dc_task, unsigned long long ino,
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index bb07004df72a..8a54fd8a4fa5 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -138,8 +138,7 @@ static int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) {
ocfs2_error(inode->i_sb,
- "Quota file %llu is probably corrupted! Requested "
- "to read block %Lu but file has size only %Lu\n",
+ "Quota file %llu is probably corrupted! Requested to read block %Lu but file has size only %Lu\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)v_block,
(unsigned long long)i_size_read(inode));
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 7dc818b87cd8..e5d57cd32505 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -102,32 +102,30 @@ static int ocfs2_validate_refcount_block(struct super_block *sb,
if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) {
- ocfs2_error(sb,
- "Refcount block #%llu has bad signature %.*s",
- (unsigned long long)bh->b_blocknr, 7,
- rb->rf_signature);
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Refcount block #%llu has bad signature %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ rb->rf_signature);
+ goto out;
}
if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) {
- ocfs2_error(sb,
- "Refcount block #%llu has an invalid rf_blkno "
- "of %llu",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)le64_to_cpu(rb->rf_blkno));
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Refcount block #%llu has an invalid rf_blkno of %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(rb->rf_blkno));
+ goto out;
}
if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) {
- ocfs2_error(sb,
- "Refcount block #%llu has an invalid "
- "rf_fs_generation of #%u",
- (unsigned long long)bh->b_blocknr,
- le32_to_cpu(rb->rf_fs_generation));
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Refcount block #%llu has an invalid rf_fs_generation of #%u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(rb->rf_fs_generation));
+ goto out;
}
-
- return 0;
+out:
+ return rc;
}
static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci,
@@ -1102,12 +1100,10 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
el = &eb->h_list;
if (el->l_tree_depth) {
- ocfs2_error(sb,
- "refcount tree %llu has non zero tree "
- "depth in leaf btree tree block %llu\n",
- (unsigned long long)ocfs2_metadata_cache_owner(ci),
- (unsigned long long)eb_bh->b_blocknr);
- ret = -EROFS;
+ ret = ocfs2_error(sb,
+ "refcount tree %llu has non zero tree depth in leaf btree tree block %llu\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)eb_bh->b_blocknr);
goto out;
}
}
@@ -2359,10 +2355,8 @@ static int ocfs2_mark_extent_refcounted(struct inode *inode,
cpos, len, phys);
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
- ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
- "tree, but the feature bit is not set in the "
- "super block.", inode->i_ino);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
+ inode->i_ino);
goto out;
}
@@ -2545,10 +2539,8 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno);
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
- ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
- "tree, but the feature bit is not set in the "
- "super block.", inode->i_ino);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
+ inode->i_ino);
goto out;
}
@@ -2672,11 +2664,10 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
el = &eb->h_list;
if (el->l_tree_depth) {
- ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "leaf block %llu\n", inode->i_ino,
- (unsigned long long)eb_bh->b_blocknr);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %lu has non zero tree depth in leaf block %llu\n",
+ inode->i_ino,
+ (unsigned long long)eb_bh->b_blocknr);
goto out;
}
}
@@ -3106,11 +3097,9 @@ static int ocfs2_clear_ext_refcount(handle_t *handle,
index = ocfs2_search_extent_list(el, cpos);
if (index == -1) {
- ocfs2_error(sb,
- "Inode %llu has an extent at cpos %u which can no "
- "longer be found.\n",
- (unsigned long long)ino, cpos);
- ret = -EROFS;
+ ret = ocfs2_error(sb,
+ "Inode %llu has an extent at cpos %u which can no longer be found\n",
+ (unsigned long long)ino, cpos);
goto out;
}
@@ -3376,10 +3365,8 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
- ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
- "tree, but the feature bit is not set in the "
- "super block.", inode->i_ino);
- return -EROFS;
+ return ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
+ inode->i_ino);
}
ocfs2_init_dealloc_ctxt(&context->dealloc);
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index ced70c8139f7..74d27ddb4d4f 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -368,9 +368,9 @@ static int ocfs2_control_get_this_node(void)
static int ocfs2_control_do_setnode_msg(struct file *file,
struct ocfs2_control_message_setn *msg)
{
- long nodenum;
- char *ptr = NULL;
struct ocfs2_control_private *p = file->private_data;
+ int nodenum;
+ int rv;
if (ocfs2_control_get_handshake_state(file) !=
OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
@@ -384,12 +384,12 @@ static int ocfs2_control_do_setnode_msg(struct file *file,
return -EINVAL;
msg->space = msg->newline = '\0';
- nodenum = simple_strtol(msg->nodestr, &ptr, 16);
- if (!ptr || *ptr)
+ rv = parse_integer(msg->nodestr, 16, &nodenum);
+ if (rv < 0)
+ return rv;
+ if (msg->nodestr[rv])
return -EINVAL;
-
- if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) ||
- (nodenum > INT_MAX) || (nodenum < 0))
+ if (nodenum < 0)
return -ERANGE;
p->op_this_node = nodenum;
@@ -399,11 +399,11 @@ static int ocfs2_control_do_setnode_msg(struct file *file,
static int ocfs2_control_do_setversion_msg(struct file *file,
struct ocfs2_control_message_setv *msg)
{
- long major, minor;
- char *ptr = NULL;
+ u8 major, minor;
struct ocfs2_control_private *p = file->private_data;
struct ocfs2_protocol_version *max =
&ocfs2_user_plugin.sp_max_proto;
+ int rv;
if (ocfs2_control_get_handshake_state(file) !=
OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
@@ -418,11 +418,15 @@ static int ocfs2_control_do_setversion_msg(struct file *file,
return -EINVAL;
msg->space1 = msg->space2 = msg->newline = '\0';
- major = simple_strtol(msg->major, &ptr, 16);
- if (!ptr || *ptr)
+ rv = parse_integer(msg->major, 16, &major);
+ if (rv < 0)
+ return rv;
+ if (msg->major[rv])
return -EINVAL;
- minor = simple_strtol(msg->minor, &ptr, 16);
- if (!ptr || *ptr)
+ rv = parse_integer(msg->minor, 16, &minor);
+ if (rv < 0)
+ return rv;
+ if (msg->minor[rv])
return -EINVAL;
/*
@@ -430,11 +434,7 @@ static int ocfs2_control_do_setversion_msg(struct file *file,
* must be between 0 and 255, inclusive. The version passed in
* must be within the maximum version supported by the filesystem.
*/
- if ((major == LONG_MIN) || (major == LONG_MAX) ||
- (major > (u8)-1) || (major < 1))
- return -ERANGE;
- if ((minor == LONG_MIN) || (minor == LONG_MAX) ||
- (minor > (u8)-1) || (minor < 0))
+ if (major < 1)
return -ERANGE;
if ((major != max->pv_major) ||
(minor > max->pv_minor))
@@ -449,8 +449,8 @@ static int ocfs2_control_do_setversion_msg(struct file *file,
static int ocfs2_control_do_down_msg(struct file *file,
struct ocfs2_control_message_down *msg)
{
- long nodenum;
- char *p = NULL;
+ int nodenum;
+ int rv;
if (ocfs2_control_get_handshake_state(file) !=
OCFS2_CONTROL_HANDSHAKE_VALID)
@@ -465,12 +465,12 @@ static int ocfs2_control_do_down_msg(struct file *file,
return -EINVAL;
msg->space1 = msg->space2 = msg->newline = '\0';
- nodenum = simple_strtol(msg->nodestr, &p, 16);
- if (!p || *p)
+ rv = parse_integer(msg->nodestr, 16, &nodenum);
+ if (rv < 0)
+ return rv;
+ if (msg->nodestr[rv])
return -EINVAL;
-
- if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) ||
- (nodenum > INT_MAX) || (nodenum < 0))
+ if (nodenum < 0)
return -ERANGE;
ocfs2_control_send_down(msg->uuid, nodenum);
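The setnode/setversion/down handlers above switch from simple_strtol() with manual endptr checks to parse_integer(), a helper introduced elsewhere in this series (not assumed to exist in mainline). From these call sites, it writes the parsed value through a typed pointer and returns the number of characters consumed, or a negative errno. A minimal sketch of that calling convention, with a hypothetical wrapper name:

#include <linux/errno.h>
#include <linux/kernel.h>	/* parse_integer() is assumed to come from this series */

/* Hypothetical helper: parse a hex node number the way the hunks above do. */
static int example_parse_nodenum(const char *nodestr)
{
	int nodenum;
	int rv;

	rv = parse_integer(nodestr, 16, &nodenum);
	if (rv < 0)
		return rv;		/* not a number */
	if (nodestr[rv])
		return -EINVAL;		/* trailing characters */
	if (nodenum < 0)
		return -ERANGE;
	return nodenum;
}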
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 5d965e83bd43..13219ed73e1d 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -629,7 +629,8 @@ static struct attribute_group ocfs2_attr_group = {
.attrs = ocfs2_attrs,
};
-static struct kset *ocfs2_kset;
+struct kset *ocfs2_kset;
+EXPORT_SYMBOL_GPL(ocfs2_kset);
static void ocfs2_sysfs_exit(void)
{
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 66334a30cea8..f2dce10fae54 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -298,4 +298,6 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p
int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
+extern struct kset *ocfs2_kset;
+
#endif /* STACKGLUE_H */
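Exporting ocfs2_kset lets other parts of ocfs2 parent sysfs objects on /sys/fs/ocfs2, presumably so the filecheck entries added later in this diff (/sys/fs/ocfs2/<devname>/filecheck) can hang off it. A hedged sketch of what such a user could look like, with hypothetical names:

#include <linux/errno.h>
#include <linux/kobject.h>
#include "stackglue.h"		/* declares the exported ocfs2_kset */

static struct kobject *example_dev_kobj;

/* Hypothetical: create /sys/fs/ocfs2/<devname>/ under the exported kset. */
static int example_dev_sysfs_add(const char *devname)
{
	example_dev_kobj = kobject_create_and_add(devname, &ocfs2_kset->kobj);
	if (!example_dev_kobj)
		return -ENOMEM;
	return 0;
}

static void example_dev_sysfs_remove(void)
{
	kobject_put(example_dev_kobj);
}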
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 4479029630bb..0456ae399bf7 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -167,12 +167,12 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
}
#define do_error(fmt, ...) \
- do{ \
- if (resize) \
- mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \
- else \
- ocfs2_error(sb, fmt, ##__VA_ARGS__); \
- } while (0)
+do { \
+ if (resize) \
+ mlog(ML_ERROR, fmt, ##__VA_ARGS__); \
+ else \
+ return ocfs2_error(sb, fmt, ##__VA_ARGS__); \
+} while (0)
static int ocfs2_validate_gd_self(struct super_block *sb,
struct buffer_head *bh,
@@ -181,44 +181,35 @@ static int ocfs2_validate_gd_self(struct super_block *sb,
struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
- do_error("Group descriptor #%llu has bad signature %.*s",
+ do_error("Group descriptor #%llu has bad signature %.*s\n",
(unsigned long long)bh->b_blocknr, 7,
gd->bg_signature);
- return -EINVAL;
}
if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
- do_error("Group descriptor #%llu has an invalid bg_blkno "
- "of %llu",
+ do_error("Group descriptor #%llu has an invalid bg_blkno of %llu\n",
(unsigned long long)bh->b_blocknr,
(unsigned long long)le64_to_cpu(gd->bg_blkno));
- return -EINVAL;
}
if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
- do_error("Group descriptor #%llu has an invalid "
- "fs_generation of #%u",
+ do_error("Group descriptor #%llu has an invalid fs_generation of #%u\n",
(unsigned long long)bh->b_blocknr,
le32_to_cpu(gd->bg_generation));
- return -EINVAL;
}
if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
- do_error("Group descriptor #%llu has bit count %u but "
- "claims that %u are free",
+ do_error("Group descriptor #%llu has bit count %u but claims that %u are free\n",
(unsigned long long)bh->b_blocknr,
le16_to_cpu(gd->bg_bits),
le16_to_cpu(gd->bg_free_bits_count));
- return -EINVAL;
}
if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
- do_error("Group descriptor #%llu has bit count %u but "
- "max bitmap bits of %u",
+ do_error("Group descriptor #%llu has bit count %u but max bitmap bits of %u\n",
(unsigned long long)bh->b_blocknr,
le16_to_cpu(gd->bg_bits),
8 * le16_to_cpu(gd->bg_size));
- return -EINVAL;
}
return 0;
@@ -233,20 +224,17 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
if (di->i_blkno != gd->bg_parent_dinode) {
- do_error("Group descriptor #%llu has bad parent "
- "pointer (%llu, expected %llu)",
+ do_error("Group descriptor #%llu has bad parent pointer (%llu, expected %llu)\n",
(unsigned long long)bh->b_blocknr,
(unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
(unsigned long long)le64_to_cpu(di->i_blkno));
- return -EINVAL;
}
max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
if (le16_to_cpu(gd->bg_bits) > max_bits) {
- do_error("Group descriptor #%llu has bit count of %u",
+ do_error("Group descriptor #%llu has bit count of %u\n",
(unsigned long long)bh->b_blocknr,
le16_to_cpu(gd->bg_bits));
- return -EINVAL;
}
/* In resize, we may meet the case bg_chain == cl_next_free_rec. */
@@ -254,10 +242,9 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
((le16_to_cpu(gd->bg_chain) ==
le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
- do_error("Group descriptor #%llu has bad chain %u",
+ do_error("Group descriptor #%llu has bad chain %u\n",
(unsigned long long)bh->b_blocknr,
le16_to_cpu(gd->bg_chain));
- return -EINVAL;
}
return 0;
@@ -384,11 +371,10 @@ static int ocfs2_block_group_fill(handle_t *handle,
struct super_block * sb = alloc_inode->i_sb;
if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
- ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
- "b_blocknr (%llu)",
- (unsigned long long)group_blkno,
- (unsigned long long) bg_bh->b_blocknr);
- status = -EIO;
+ status = ocfs2_error(alloc_inode->i_sb,
+ "group block (%llu) != b_blocknr (%llu)\n",
+ (unsigned long long)group_blkno,
+ (unsigned long long) bg_bh->b_blocknr);
goto bail;
}
@@ -834,9 +820,9 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
- ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
- (unsigned long long)le64_to_cpu(fe->i_blkno));
- status = -EIO;
+ status = ocfs2_error(alloc_inode->i_sb,
+ "Invalid chain allocator %llu\n",
+ (unsigned long long)le64_to_cpu(fe->i_blkno));
goto bail;
}
@@ -1370,12 +1356,11 @@ int ocfs2_block_group_set_bits(handle_t *handle,
le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
- ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
- " count %u but claims %u are freed. num_bits %d",
- (unsigned long long)le64_to_cpu(bg->bg_blkno),
- le16_to_cpu(bg->bg_bits),
- le16_to_cpu(bg->bg_free_bits_count), num_bits);
- return -EROFS;
+ return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
+ (unsigned long long)le64_to_cpu(bg->bg_blkno),
+ le16_to_cpu(bg->bg_bits),
+ le16_to_cpu(bg->bg_free_bits_count),
+ num_bits);
}
while(num_bits--)
ocfs2_set_bit(bit_off++, bitmap);
@@ -1905,13 +1890,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
le32_to_cpu(fe->id1.bitmap1.i_total)) {
- ocfs2_error(ac->ac_inode->i_sb,
- "Chain allocator dinode %llu has %u used "
- "bits but only %u total.",
- (unsigned long long)le64_to_cpu(fe->i_blkno),
- le32_to_cpu(fe->id1.bitmap1.i_used),
- le32_to_cpu(fe->id1.bitmap1.i_total));
- status = -EIO;
+ status = ocfs2_error(ac->ac_inode->i_sb,
+ "Chain allocator dinode %llu has %u used bits but only %u total\n",
+ (unsigned long long)le64_to_cpu(fe->i_blkno),
+ le32_to_cpu(fe->id1.bitmap1.i_used),
+ le32_to_cpu(fe->id1.bitmap1.i_total));
goto bail;
}
@@ -2429,12 +2412,11 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
}
le16_add_cpu(&bg->bg_free_bits_count, num_bits);
if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
- ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
- " count %u but claims %u are freed. num_bits %d",
- (unsigned long long)le64_to_cpu(bg->bg_blkno),
- le16_to_cpu(bg->bg_bits),
- le16_to_cpu(bg->bg_free_bits_count), num_bits);
- return -EROFS;
+ return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
+ (unsigned long long)le64_to_cpu(bg->bg_blkno),
+ le16_to_cpu(bg->bg_bits),
+ le16_to_cpu(bg->bg_free_bits_count),
+ num_bits);
}
if (undo_fn)
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 403c5660b306..5ef88b8d1bf1 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -74,6 +74,7 @@
#include "suballoc.h"
#include "buffer_head_io.h"
+#include "filecheck.h"
static struct kmem_cache *ocfs2_inode_cachep;
struct kmem_cache *ocfs2_dquot_cachep;
@@ -192,6 +193,7 @@ enum {
Opt_resv_level,
Opt_dir_resv_level,
Opt_journal_async_commit,
+ Opt_err_cont,
Opt_err,
};
@@ -224,6 +226,7 @@ static const match_table_t tokens = {
{Opt_resv_level, "resv_level=%u"},
{Opt_dir_resv_level, "dir_resv_level=%u"},
{Opt_journal_async_commit, "journal_async_commit"},
+ {Opt_err_cont, "errors=continue"},
{Opt_err, NULL}
};
@@ -1202,6 +1205,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
/* Start this when the mount is almost sure of being successful */
ocfs2_orphan_scan_start(osb);
+ /* Create filecheck sysfile /sys/fs/ocfs2/<devname>/filecheck */
+ ocfs2_filecheck_create_sysfs(sb);
+
return status;
read_super_error:
@@ -1330,10 +1336,19 @@ static int ocfs2_parse_options(struct super_block *sb,
mopt->mount_opt |= OCFS2_MOUNT_NOINTR;
break;
case Opt_err_panic:
+ mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT;
+ mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS;
mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
break;
case Opt_err_ro:
+ mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT;
+ mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
+ mopt->mount_opt |= OCFS2_MOUNT_ERRORS_ROFS;
+ break;
+ case Opt_err_cont:
+ mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS;
mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
+ mopt->mount_opt |= OCFS2_MOUNT_ERRORS_CONT;
break;
case Opt_data_ordered:
mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK;
@@ -1530,6 +1545,8 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
if (opts & OCFS2_MOUNT_ERRORS_PANIC)
seq_printf(s, ",errors=panic");
+ else if (opts & OCFS2_MOUNT_ERRORS_CONT)
+ seq_printf(s, ",errors=continue");
else
seq_printf(s, ",errors=remount-ro");
@@ -1550,8 +1567,8 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",localflocks,");
if (osb->osb_cluster_stack[0])
- seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
- osb->osb_cluster_stack);
+ seq_show_option_n(s, "cluster_stack", osb->osb_cluster_stack,
+ OCFS2_STACK_LABEL_LEN);
if (opts & OCFS2_MOUNT_USRQUOTA)
seq_printf(s, ",usrquota");
if (opts & OCFS2_MOUNT_GRPQUOTA)
@@ -1658,6 +1675,7 @@ static void ocfs2_put_super(struct super_block *sb)
ocfs2_sync_blockdev(sb);
ocfs2_dismount_volume(sb, 0);
+ ocfs2_filecheck_remove_sysfs(sb);
}
static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -1746,8 +1764,6 @@ static void ocfs2_inode_init_once(void *data)
ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
ocfs2_lock_res_init_once(&oi->ip_open_lockres);
- init_waitqueue_head(&oi->append_dio_wq);
-
ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode),
&ocfs2_inode_caching_ops);
@@ -2541,31 +2557,43 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
memset(osb, 0, sizeof(struct ocfs2_super));
}
-/* Put OCFS2 into a readonly state, or (if the user specifies it),
- * panic(). We do not support continue-on-error operation. */
-static void ocfs2_handle_error(struct super_block *sb)
+/* Depending on the mount option passed, perform one of the following:
+ * - put OCFS2 into a read-only state (the default)
+ * - return -EIO so that only the calling process errs
+ * - fix the error as fsck.ocfs2 -y would
+ * - panic
+ */
+static int ocfs2_handle_error(struct super_block *sb)
{
struct ocfs2_super *osb = OCFS2_SB(sb);
-
- if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC)
- panic("OCFS2: (device %s): panic forced after error\n",
- sb->s_id);
+ int rv = 0;
ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS);
+ pr_crit("On-disk corruption discovered. "
+ "Please run fsck.ocfs2 once the filesystem is unmounted.\n");
- if (sb->s_flags & MS_RDONLY &&
- (ocfs2_is_soft_readonly(osb) ||
- ocfs2_is_hard_readonly(osb)))
- return;
-
- printk(KERN_CRIT "File system is now read-only due to the potential "
- "of on-disk corruption. Please run fsck.ocfs2 once the file "
- "system is unmounted.\n");
- sb->s_flags |= MS_RDONLY;
- ocfs2_set_ro_flag(osb, 0);
+ if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) {
+ panic("OCFS2: (device %s): panic forced after error\n",
+ sb->s_id);
+ } else if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_CONT) {
+ pr_crit("OCFS2: Returning error to the calling process.\n");
+ rv = -EIO;
+ } else { /* default option */
+ rv = -EROFS;
+ if (sb->s_flags & MS_RDONLY &&
+ (ocfs2_is_soft_readonly(osb) ||
+ ocfs2_is_hard_readonly(osb)))
+ return rv;
+
+ pr_crit("OCFS2: File system is now read-only.\n");
+ sb->s_flags |= MS_RDONLY;
+ ocfs2_set_ro_flag(osb, 0);
+ }
+
+ return rv;
}
-void __ocfs2_error(struct super_block *sb, const char *function,
+int __ocfs2_error(struct super_block *sb, const char *function,
const char *fmt, ...)
{
struct va_format vaf;
@@ -2577,12 +2605,12 @@ void __ocfs2_error(struct super_block *sb, const char *function,
/* Not using mlog here because we want to show the actual
* function the error came from. */
- printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV\n",
+ printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV",
sb->s_id, function, &vaf);
va_end(args);
- ocfs2_handle_error(sb);
+ return ocfs2_handle_error(sb);
}
/* Handle critical errors. This is intentionally more drastic than
@@ -2599,7 +2627,7 @@ void __ocfs2_abort(struct super_block *sb, const char *function,
vaf.fmt = fmt;
vaf.va = &args;
- printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV\n",
+ printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV",
sb->s_id, function, &vaf);
va_end(args);
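Together with the super.h hunk below, this makes ocfs2_error() return the errno chosen by ocfs2_handle_error(), -EROFS by default or -EIO under errors=continue, which is what allows the callers earlier in this diff to fold the report-then-set-status pattern into a single return statement. A minimal sketch of the resulting caller style, using a hypothetical on-disk structure:

#include <linux/buffer_head.h>
#include "super.h"		/* ocfs2_error() */

struct example_disk_block {	/* hypothetical, for illustration only */
	__le64 blkno;
};

static int example_validate_block(struct super_block *sb,
				  struct buffer_head *bh)
{
	struct example_disk_block *blk = (struct example_disk_block *)bh->b_data;

	if (le64_to_cpu(blk->blkno) != bh->b_blocknr)
		return ocfs2_error(sb, "Block %llu has bad self-pointer %llu\n",
				   (unsigned long long)bh->b_blocknr,
				   (unsigned long long)le64_to_cpu(blk->blkno));
	return 0;
}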
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 74ff74cf78fe..b477d0b1c7b6 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -32,16 +32,18 @@ int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
int node_num);
__printf(3, 4)
-void __ocfs2_error(struct super_block *sb, const char *function,
+int __ocfs2_error(struct super_block *sb, const char *function,
const char *fmt, ...);
-#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args)
+#define ocfs2_error(sb, fmt, ...) \
+ __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__)
__printf(3, 4)
void __ocfs2_abort(struct super_block *sb, const char *function,
const char *fmt, ...);
-#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
+#define ocfs2_abort(sb, fmt, ...) \
+ __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__)
/*
* Void signal blockers, because in-kernel sigprocmask() only fails
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 889f3796a0d7..ebfdea78659b 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -499,30 +499,24 @@ static int ocfs2_validate_xattr_block(struct super_block *sb,
*/
if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
- ocfs2_error(sb,
- "Extended attribute block #%llu has bad "
- "signature %.*s",
- (unsigned long long)bh->b_blocknr, 7,
- xb->xb_signature);
- return -EINVAL;
+ return ocfs2_error(sb,
+ "Extended attribute block #%llu has bad signature %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ xb->xb_signature);
}
if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) {
- ocfs2_error(sb,
- "Extended attribute block #%llu has an "
- "invalid xb_blkno of %llu",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)le64_to_cpu(xb->xb_blkno));
- return -EINVAL;
+ return ocfs2_error(sb,
+ "Extended attribute block #%llu has an invalid xb_blkno of %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(xb->xb_blkno));
}
if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) {
- ocfs2_error(sb,
- "Extended attribute block #%llu has an invalid "
- "xb_fs_generation of #%u",
- (unsigned long long)bh->b_blocknr,
- le32_to_cpu(xb->xb_fs_generation));
- return -EINVAL;
+ return ocfs2_error(sb,
+ "Extended attribute block #%llu has an invalid xb_fs_generation of #%u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(xb->xb_fs_generation));
}
return 0;
@@ -3694,11 +3688,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode,
el = &eb->h_list;
if (el->l_tree_depth) {
- ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "xattr tree block %llu\n", inode->i_ino,
- (unsigned long long)eb_bh->b_blocknr);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %lu has non zero tree depth in xattr tree block %llu\n",
+ inode->i_ino,
+ (unsigned long long)eb_bh->b_blocknr);
goto out;
}
}
@@ -3713,11 +3706,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode,
}
if (!e_blkno) {
- ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
- "record (%u, %u, 0) in xattr", inode->i_ino,
- le32_to_cpu(rec->e_cpos),
- ocfs2_rec_clusters(el, rec));
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb, "Inode %lu has bad extent record (%u, %u, 0) in xattr\n",
+ inode->i_ino,
+ le32_to_cpu(rec->e_cpos),
+ ocfs2_rec_clusters(el, rec));
goto out;
}
@@ -7334,6 +7326,9 @@ static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list,
const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
+ if (!capable(CAP_SYS_ADMIN))
+ return 0;
+
if (list && total_len <= list_size) {
memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
memcpy(list + prefix_len, name, name_len);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 7466ff339c66..79073d68b475 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -588,10 +588,10 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
struct super_block *sb = dentry->d_sb;
struct ovl_fs *ufs = sb->s_fs_info;
- seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir);
+ seq_show_option(m, "lowerdir", ufs->config.lowerdir);
if (ufs->config.upperdir) {
- seq_printf(m, ",upperdir=%s", ufs->config.upperdir);
- seq_printf(m, ",workdir=%s", ufs->config.workdir);
+ seq_show_option(m, "upperdir", ufs->config.upperdir);
+ seq_show_option(m, "workdir", ufs->config.workdir);
}
return 0;
}
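seq_show_option() and seq_show_option_n() are the helpers this series converts show_options handlers to; judging from these call sites they take the seq_file, the option name, and the string value (plus a length for the _n variant) and emit the value escaped rather than raw, which matters when the value is a user-controlled path. A sketch of a show_options method in that style, with a hypothetical filesystem:

#include <linux/fs.h>
#include <linux/seq_file.h>

struct examplefs_sb_info {		/* hypothetical private sb info */
	const char *backing_dev;
	unsigned int cache_size;
};

static int examplefs_show_options(struct seq_file *m, struct dentry *root)
{
	struct examplefs_sb_info *sbi = root->d_sb->s_fs_info;

	if (sbi->backing_dev)
		seq_show_option(m, "backingdir", sbi->backing_dev);
	seq_printf(m, ",cache_size=%u", sbi->cache_size);
	return 0;
}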
diff --git a/fs/proc/array.c b/fs/proc/array.c
index ce065cf3104f..f60f0121e331 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -308,7 +308,8 @@ static void render_cap_t(struct seq_file *m, const char *header,
static inline void task_cap(struct seq_file *m, struct task_struct *p)
{
const struct cred *cred;
- kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset;
+ kernel_cap_t cap_inheritable, cap_permitted, cap_effective,
+ cap_bset, cap_ambient;
rcu_read_lock();
cred = __task_cred(p);
@@ -316,12 +317,14 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p)
cap_permitted = cred->cap_permitted;
cap_effective = cred->cap_effective;
cap_bset = cred->cap_bset;
+ cap_ambient = cred->cap_ambient;
rcu_read_unlock();
render_cap_t(m, "CapInh:\t", &cap_inheritable);
render_cap_t(m, "CapPrm:\t", &cap_permitted);
render_cap_t(m, "CapEff:\t", &cap_effective);
render_cap_t(m, "CapBnd:\t", &cap_bset);
+ render_cap_t(m, "CapAmb:\t", &cap_ambient);
}
static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index aa50d1ac28fc..b25eee4cead5 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1230,10 +1230,9 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
size_t count, loff_t *ppos)
{
struct inode * inode = file_inode(file);
- char *page, *tmp;
- ssize_t length;
uid_t loginuid;
kuid_t kloginuid;
+ int rv;
rcu_read_lock();
if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
@@ -1242,46 +1241,28 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
}
rcu_read_unlock();
- if (count >= PAGE_SIZE)
- count = PAGE_SIZE - 1;
-
if (*ppos != 0) {
/* No partial writes. */
return -EINVAL;
}
- page = (char*)__get_free_page(GFP_TEMPORARY);
- if (!page)
- return -ENOMEM;
- length = -EFAULT;
- if (copy_from_user(page, buf, count))
- goto out_free_page;
-
- page[count] = '\0';
- loginuid = simple_strtoul(page, &tmp, 10);
- if (tmp == page) {
- length = -EINVAL;
- goto out_free_page;
- }
+ rv = kstrtou32_from_user(buf, count, 10, &loginuid);
+ if (rv < 0)
+ return rv;
/* is userspace tring to explicitly UNSET the loginuid? */
if (loginuid == AUDIT_UID_UNSET) {
kloginuid = INVALID_UID;
} else {
kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
- if (!uid_valid(kloginuid)) {
- length = -EINVAL;
- goto out_free_page;
- }
+ if (!uid_valid(kloginuid))
+ return -EINVAL;
}
- length = audit_set_loginuid(kloginuid);
- if (likely(length == 0))
- length = count;
-
-out_free_page:
- free_page((unsigned long) page);
- return length;
+ rv = audit_set_loginuid(kloginuid);
+ if (rv < 0)
+ return rv;
+ return count;
}
static const struct file_operations proc_loginuid_operations = {
@@ -1335,8 +1316,9 @@ static ssize_t proc_fault_inject_write(struct file * file,
const char __user * buf, size_t count, loff_t *ppos)
{
struct task_struct *task;
- char buffer[PROC_NUMBUF], *end;
+ char buffer[PROC_NUMBUF];
int make_it_fail;
+ int rv;
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
@@ -1345,9 +1327,9 @@ static ssize_t proc_fault_inject_write(struct file * file,
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
return -EFAULT;
- make_it_fail = simple_strtol(strstrip(buffer), &end, 0);
- if (*end)
- return -EINVAL;
+ rv = kstrtoint(strstrip(buffer), 0, &make_it_fail);
+ if (rv < 0)
+ return rv;
if (make_it_fail < 0 || make_it_fail > 1)
return -EINVAL;
@@ -1836,8 +1818,6 @@ end_instantiate:
return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
}
-#ifdef CONFIG_CHECKPOINT_RESTORE
-
/*
* dname_to_vma_addr - maps a dentry name into two unsigned longs
* which represent vma start and end addresses.
@@ -1864,11 +1844,6 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
if (flags & LOOKUP_RCU)
return -ECHILD;
- if (!capable(CAP_SYS_ADMIN)) {
- status = -EPERM;
- goto out_notask;
- }
-
inode = d_inode(dentry);
task = get_proc_task(inode);
if (!task)
@@ -1957,6 +1932,29 @@ struct map_files_info {
unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
};
+/*
+ * Only allow CAP_SYS_ADMIN to follow the links, due to concerns about how the
+ * symlinks may be used to bypass permissions on ancestor directories in the
+ * path to the file in question.
+ */
+static const char *
+proc_map_files_follow_link(struct dentry *dentry, void **cookie)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ return proc_pid_follow_link(dentry, NULL);
+}
+
+/*
+ * Identical to proc_pid_link_inode_operations except for follow_link()
+ */
+static const struct inode_operations proc_map_files_link_inode_operations = {
+ .readlink = proc_pid_readlink,
+ .follow_link = proc_map_files_follow_link,
+ .setattr = proc_setattr,
+};
+
static int
proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
struct task_struct *task, const void *ptr)
@@ -1972,7 +1970,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
ei = PROC_I(inode);
ei->op.proc_get_link = proc_map_files_get_link;
- inode->i_op = &proc_pid_link_inode_operations;
+ inode->i_op = &proc_map_files_link_inode_operations;
inode->i_size = 64;
inode->i_mode = S_IFLNK;
@@ -1996,10 +1994,6 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
int result;
struct mm_struct *mm;
- result = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
- goto out;
-
result = -ENOENT;
task = get_proc_task(dir);
if (!task)
@@ -2053,10 +2047,6 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
struct map_files_info *p;
int ret;
- ret = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
- goto out;
-
ret = -ENOENT;
task = get_proc_task(file_inode(file));
if (!task)
@@ -2245,7 +2235,6 @@ static const struct file_operations proc_timers_operations = {
.llseek = seq_lseek,
.release = seq_release_private,
};
-#endif /* CONFIG_CHECKPOINT_RESTORE */
static int proc_pident_instantiate(struct inode *dir,
struct dentry *dentry, struct task_struct *task, const void *ptr)
@@ -2481,32 +2470,20 @@ static ssize_t proc_coredump_filter_write(struct file *file,
{
struct task_struct *task;
struct mm_struct *mm;
- char buffer[PROC_NUMBUF], *end;
unsigned int val;
int ret;
int i;
unsigned long mask;
- ret = -EFAULT;
- memset(buffer, 0, sizeof(buffer));
- if (count > sizeof(buffer) - 1)
- count = sizeof(buffer) - 1;
- if (copy_from_user(buffer, buf, count))
- goto out_no_task;
-
- ret = -EINVAL;
- val = (unsigned int)simple_strtoul(buffer, &end, 0);
- if (*end == '\n')
- end++;
- if (end - buffer == 0)
- goto out_no_task;
+ ret = kstrtouint_from_user(buf, count, 0, &val);
+ if (ret < 0)
+ return ret;
ret = -ESRCH;
task = get_proc_task(file_inode(file));
if (!task)
goto out_no_task;
- ret = end - buffer;
mm = get_task_mm(task);
if (!mm)
goto out_no_mm;
@@ -2522,7 +2499,9 @@ static ssize_t proc_coredump_filter_write(struct file *file,
out_no_mm:
put_task_struct(task);
out_no_task:
- return ret;
+ if (ret < 0)
+ return ret;
+ return count;
}
static const struct file_operations proc_coredump_filter_operations = {
@@ -2744,9 +2723,7 @@ static const struct inode_operations proc_task_inode_operations;
static const struct pid_entry tgid_base_stuff[] = {
DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
-#ifdef CONFIG_CHECKPOINT_RESTORE
DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
-#endif
DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
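The proc write handlers above drop the temporary page plus simple_strtoul() parsing in favour of the kstrto*() family, which catches trailing junk and range errors in one call. A minimal sketch of a write handler in that style, for a hypothetical boolean attribute:

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>

static ssize_t example_write(struct file *file, const char __user *buf,
			     size_t count, loff_t *ppos)
{
	unsigned int val;
	int ret;

	/* Parses straight from the user buffer, no bounce page needed. */
	ret = kstrtouint_from_user(buf, count, 0, &val);
	if (ret < 0)
		return ret;
	if (val > 1)
		return -EINVAL;
	/* ... apply val to the hypothetical attribute here ... */
	return count;
}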
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index e5dee5c3188e..ff3ffc76a937 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -26,7 +26,7 @@
#include "internal.h"
-static DEFINE_SPINLOCK(proc_subdir_lock);
+static DEFINE_RWLOCK(proc_subdir_lock);
static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de)
{
@@ -172,9 +172,9 @@ static int xlate_proc_name(const char *name, struct proc_dir_entry **ret,
{
int rv;
- spin_lock(&proc_subdir_lock);
+ read_lock(&proc_subdir_lock);
rv = __xlate_proc_name(name, ret, residual);
- spin_unlock(&proc_subdir_lock);
+ read_unlock(&proc_subdir_lock);
return rv;
}
@@ -231,11 +231,11 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
{
struct inode *inode;
- spin_lock(&proc_subdir_lock);
+ read_lock(&proc_subdir_lock);
de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len);
if (de) {
pde_get(de);
- spin_unlock(&proc_subdir_lock);
+ read_unlock(&proc_subdir_lock);
inode = proc_get_inode(dir->i_sb, de);
if (!inode)
return ERR_PTR(-ENOMEM);
@@ -243,7 +243,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
d_add(dentry, inode);
return NULL;
}
- spin_unlock(&proc_subdir_lock);
+ read_unlock(&proc_subdir_lock);
return ERR_PTR(-ENOENT);
}
@@ -270,12 +270,12 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
if (!dir_emit_dots(file, ctx))
return 0;
- spin_lock(&proc_subdir_lock);
+ read_lock(&proc_subdir_lock);
de = pde_subdir_first(de);
i = ctx->pos - 2;
for (;;) {
if (!de) {
- spin_unlock(&proc_subdir_lock);
+ read_unlock(&proc_subdir_lock);
return 0;
}
if (!i)
@@ -287,19 +287,19 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
do {
struct proc_dir_entry *next;
pde_get(de);
- spin_unlock(&proc_subdir_lock);
+ read_unlock(&proc_subdir_lock);
if (!dir_emit(ctx, de->name, de->namelen,
de->low_ino, de->mode >> 12)) {
pde_put(de);
return 0;
}
- spin_lock(&proc_subdir_lock);
+ read_lock(&proc_subdir_lock);
ctx->pos++;
next = pde_subdir_next(de);
pde_put(de);
de = next;
} while (de);
- spin_unlock(&proc_subdir_lock);
+ read_unlock(&proc_subdir_lock);
return 1;
}
@@ -338,16 +338,16 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
if (ret)
return ret;
- spin_lock(&proc_subdir_lock);
+ write_lock(&proc_subdir_lock);
dp->parent = dir;
if (pde_subdir_insert(dir, dp) == false) {
WARN(1, "proc_dir_entry '%s/%s' already registered\n",
dir->name, dp->name);
- spin_unlock(&proc_subdir_lock);
+ write_unlock(&proc_subdir_lock);
proc_free_inum(dp->low_ino);
return -EEXIST;
}
- spin_unlock(&proc_subdir_lock);
+ write_unlock(&proc_subdir_lock);
return 0;
}
@@ -549,9 +549,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
const char *fn = name;
unsigned int len;
- spin_lock(&proc_subdir_lock);
+ write_lock(&proc_subdir_lock);
if (__xlate_proc_name(name, &parent, &fn) != 0) {
- spin_unlock(&proc_subdir_lock);
+ write_unlock(&proc_subdir_lock);
return;
}
len = strlen(fn);
@@ -559,7 +559,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
de = pde_subdir_find(parent, fn, len);
if (de)
rb_erase(&de->subdir_node, &parent->subdir);
- spin_unlock(&proc_subdir_lock);
+ write_unlock(&proc_subdir_lock);
if (!de) {
WARN(1, "name '%s'\n", name);
return;
@@ -583,16 +583,16 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
const char *fn = name;
unsigned int len;
- spin_lock(&proc_subdir_lock);
+ write_lock(&proc_subdir_lock);
if (__xlate_proc_name(name, &parent, &fn) != 0) {
- spin_unlock(&proc_subdir_lock);
+ write_unlock(&proc_subdir_lock);
return -ENOENT;
}
len = strlen(fn);
root = pde_subdir_find(parent, fn, len);
if (!root) {
- spin_unlock(&proc_subdir_lock);
+ write_unlock(&proc_subdir_lock);
return -ENOENT;
}
rb_erase(&root->subdir_node, &parent->subdir);
@@ -605,7 +605,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
de = next;
continue;
}
- spin_unlock(&proc_subdir_lock);
+ write_unlock(&proc_subdir_lock);
proc_entry_rundown(de);
next = de->parent;
@@ -616,7 +616,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
break;
pde_put(de);
- spin_lock(&proc_subdir_lock);
+ write_lock(&proc_subdir_lock);
de = next;
}
pde_put(root);
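proc_subdir_lock becomes an rwlock so that the hot paths (lookup and readdir) can take it shared and run concurrently, while register/remove still take it exclusively. The same pattern in a self-contained sketch, with hypothetical names:

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/string.h>

struct example_entry {
	struct list_head node;
	const char *name;
};

static LIST_HEAD(example_list);
static DEFINE_RWLOCK(example_lock);	/* analogous to proc_subdir_lock */

/* Lookups take the lock shared and may run in parallel. */
static struct example_entry *example_find(const char *name)
{
	struct example_entry *e, *found = NULL;

	read_lock(&example_lock);
	list_for_each_entry(e, &example_list, node) {
		if (!strcmp(e->name, name)) {
			found = e;
			break;
		}
	}
	read_unlock(&example_lock);
	return found;
}

/* Insertions still exclude both readers and other writers. */
static void example_insert(struct example_entry *e)
{
	write_lock(&example_lock);
	list_add(&e->node, &example_list);
	write_unlock(&example_lock);
}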
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 7eee2d8b97d9..93484034a03d 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -9,12 +9,16 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/hugetlb.h>
+#include <linux/memcontrol.h>
+#include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
#include <linux/kernel-page-flags.h>
#include <asm/uaccess.h>
#include "internal.h"
#define KPMSIZE sizeof(u64)
#define KPMMASK (KPMSIZE - 1)
+#define KPMBITS (KPMSIZE * BITS_PER_BYTE)
/* /proc/kpagecount - an array exposing page counts
*
@@ -54,6 +58,8 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
pfn++;
out++;
count -= KPMSIZE;
+
+ cond_resched();
}
*ppos += (char __user *)out - buf;
@@ -146,6 +152,9 @@ u64 stable_page_flags(struct page *page)
if (PageBalloon(page))
u |= 1 << KPF_BALLOON;
+ if (page_is_idle(page))
+ u |= 1 << KPF_IDLE;
+
u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
@@ -212,6 +221,8 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
pfn++;
out++;
count -= KPMSIZE;
+
+ cond_resched();
}
*ppos += (char __user *)out - buf;
@@ -225,10 +236,64 @@ static const struct file_operations proc_kpageflags_operations = {
.read = kpageflags_read,
};
+#ifdef CONFIG_MEMCG
+static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ u64 __user *out = (u64 __user *)buf;
+ struct page *ppage;
+ unsigned long src = *ppos;
+ unsigned long pfn;
+ ssize_t ret = 0;
+ u64 ino;
+
+ pfn = src / KPMSIZE;
+ count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
+ if (src & KPMMASK || count & KPMMASK)
+ return -EINVAL;
+
+ while (count > 0) {
+ if (pfn_valid(pfn))
+ ppage = pfn_to_page(pfn);
+ else
+ ppage = NULL;
+
+ if (ppage)
+ ino = page_cgroup_ino(ppage);
+ else
+ ino = 0;
+
+ if (put_user(ino, out)) {
+ ret = -EFAULT;
+ break;
+ }
+
+ pfn++;
+ out++;
+ count -= KPMSIZE;
+
+ cond_resched();
+ }
+
+ *ppos += (char __user *)out - buf;
+ if (!ret)
+ ret = (char __user *)out - buf;
+ return ret;
+}
+
+static const struct file_operations proc_kpagecgroup_operations = {
+ .llseek = mem_lseek,
+ .read = kpagecgroup_read,
+};
+#endif /* CONFIG_MEMCG */
+
static int __init proc_page_init(void)
{
proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations);
proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations);
+#ifdef CONFIG_MEMCG
+ proc_create("kpagecgroup", S_IRUSR, NULL, &proc_kpagecgroup_operations);
+#endif
return 0;
}
fs_initcall(proc_page_init);
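From the implementation above, /proc/kpagecgroup exposes one u64 per PFN holding the inode number of the memory cgroup the page is charged to (0 for unused or invalid PFNs), and like kpagecount/kpageflags it is addressed by seeking to PFN * 8. A userspace sketch under that assumption:

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

/* Read the memcg inode number for one PFN; returns 0 on success. */
static int read_kpagecgroup(unsigned long pfn, uint64_t *ino)
{
	int fd = open("/proc/kpagecgroup", O_RDONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = pread(fd, ino, sizeof(*ino), (off_t)pfn * sizeof(*ino));
	close(fd);
	return n == sizeof(*ino) ? 0 : -1;
}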
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ca1e091881d4..66954f5d753c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -13,6 +13,7 @@
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
#include <asm/elf.h>
#include <asm/uaccess.h>
@@ -69,6 +70,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
ptes >> 10,
pmds >> 10,
swap << (PAGE_SHIFT-10));
+ hugetlb_report_usage(m, mm);
}
unsigned long task_vsize(struct mm_struct *mm)
@@ -445,7 +447,9 @@ struct mem_size_stats {
unsigned long anonymous;
unsigned long anonymous_thp;
unsigned long swap;
+ unsigned long hugetlb;
u64 pss;
+ u64 swap_pss;
};
static void smaps_account(struct mem_size_stats *mss, struct page *page,
@@ -458,7 +462,7 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
mss->resident += size;
/* Accumulate the size in pages that have been accessed. */
- if (young || PageReferenced(page))
+ if (young || page_is_young(page) || PageReferenced(page))
mss->referenced += size;
mapcount = page_mapcount(page);
if (mapcount >= 2) {
@@ -492,9 +496,20 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
} else if (is_swap_pte(*pte)) {
swp_entry_t swpent = pte_to_swp_entry(*pte);
- if (!non_swap_entry(swpent))
+ if (!non_swap_entry(swpent)) {
+ int mapcount;
+
mss->swap += PAGE_SIZE;
- else if (is_migration_entry(swpent))
+ mapcount = swp_swapcount(swpent);
+ if (mapcount >= 2) {
+ u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;
+
+ do_div(pss_delta, mapcount);
+ mss->swap_pss += pss_delta;
+ } else {
+ mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
+ }
+ } else if (is_migration_entry(swpent))
page = migration_entry_to_page(swpent);
}
@@ -585,6 +600,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_RAND_READ)] = "rr",
[ilog2(VM_DONTCOPY)] = "dc",
[ilog2(VM_DONTEXPAND)] = "de",
+ [ilog2(VM_LOCKONFAULT)] = "lf",
[ilog2(VM_ACCOUNT)] = "ac",
[ilog2(VM_NORESERVE)] = "nr",
[ilog2(VM_HUGETLB)] = "ht",
@@ -597,6 +613,8 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_HUGEPAGE)] = "hg",
[ilog2(VM_NOHUGEPAGE)] = "nh",
[ilog2(VM_MERGEABLE)] = "mg",
+ [ilog2(VM_UFFD_MISSING)]= "um",
+ [ilog2(VM_UFFD_WP)] = "uw",
};
size_t i;
@@ -610,12 +628,38 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
seq_putc(m, '\n');
}
+#ifdef CONFIG_HUGETLB_PAGE
+static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct mem_size_stats *mss = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ struct page *page = NULL;
+
+ if (pte_present(*pte)) {
+ page = vm_normal_page(vma, addr, *pte);
+ } else if (is_swap_pte(*pte)) {
+ swp_entry_t swpent = pte_to_swp_entry(*pte);
+
+ if (is_migration_entry(swpent))
+ page = migration_entry_to_page(swpent);
+ }
+ if (page)
+ mss->hugetlb += huge_page_size(hstate_vma(vma));
+ return 0;
+}
+#endif /* HUGETLB_PAGE */
+
static int show_smap(struct seq_file *m, void *v, int is_pid)
{
struct vm_area_struct *vma = v;
struct mem_size_stats mss;
struct mm_walk smaps_walk = {
.pmd_entry = smaps_pte_range,
+#ifdef CONFIG_HUGETLB_PAGE
+ .hugetlb_entry = smaps_hugetlb_range,
+#endif
.mm = vma->vm_mm,
.private = &mss,
};
@@ -637,7 +681,9 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
"Referenced: %8lu kB\n"
"Anonymous: %8lu kB\n"
"AnonHugePages: %8lu kB\n"
+ "HugetlbPages: %8lu kB\n"
"Swap: %8lu kB\n"
+ "SwapPss: %8lu kB\n"
"KernelPageSize: %8lu kB\n"
"MMUPageSize: %8lu kB\n"
"Locked: %8lu kB\n",
@@ -651,7 +697,9 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
mss.referenced >> 10,
mss.anonymous >> 10,
mss.anonymous_thp >> 10,
+ mss.hugetlb >> 10,
mss.swap >> 10,
+ (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)),
vma_kernel_pagesize(vma) >> 10,
vma_mmu_pagesize(vma) >> 10,
(vma->vm_flags & VM_LOCKED) ?
@@ -710,23 +758,6 @@ const struct file_operations proc_tid_smaps_operations = {
.release = proc_map_release,
};
-/*
- * We do not want to have constant page-shift bits sitting in
- * pagemap entries and are about to reuse them some time soon.
- *
- * Here's the "migration strategy":
- * 1. when the system boots these bits remain what they are,
- * but a warning about future change is printed in log;
- * 2. once anyone clears soft-dirty bits via clear_refs file,
- * these flag is set to denote, that user is aware of the
- * new API and those page-shift bits change their meaning.
- * The respective warning is printed in dmesg;
- * 3. In a couple of releases we will remove all the mentions
- * of page-shift in pagemap entries.
- */
-
-static bool soft_dirty_cleared __read_mostly;
-
enum clear_refs_types {
CLEAR_REFS_ALL = 1,
CLEAR_REFS_ANON,
@@ -808,6 +839,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
/* Clear accessed and referenced bits. */
pmdp_test_and_clear_young(vma, addr, pmd);
+ test_and_clear_page_young(page);
ClearPageReferenced(page);
out:
spin_unlock(ptl);
@@ -835,6 +867,7 @@ out:
/* Clear accessed and referenced bits. */
ptep_test_and_clear_young(vma, addr, pte);
+ test_and_clear_page_young(page);
ClearPageReferenced(page);
}
pte_unmap_unlock(pte - 1, ptl);
@@ -887,13 +920,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
return -EINVAL;
- if (type == CLEAR_REFS_SOFT_DIRTY) {
- soft_dirty_cleared = true;
- pr_warn_once("The pagemap bits 55-60 has changed their meaning!"
- " See the linux/Documentation/vm/pagemap.txt for "
- "details.\n");
- }
-
task = get_proc_task(file_inode(file));
if (!task)
return -ESRCH;
@@ -961,36 +987,26 @@ typedef struct {
struct pagemapread {
int pos, len; /* units: PM_ENTRY_BYTES, not bytes */
pagemap_entry_t *buffer;
- bool v2;
+ bool show_pfn;
};
#define PAGEMAP_WALK_SIZE (PMD_SIZE)
#define PAGEMAP_WALK_MASK (PMD_MASK)
-#define PM_ENTRY_BYTES sizeof(pagemap_entry_t)
-#define PM_STATUS_BITS 3
-#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS)
-#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
-#define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
-#define PM_PSHIFT_BITS 6
-#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
-#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
-#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
-#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1)
-#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
-/* in "new" pagemap pshift bits are occupied with more status bits */
-#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT))
-
-#define __PM_SOFT_DIRTY (1LL)
-#define PM_PRESENT PM_STATUS(4LL)
-#define PM_SWAP PM_STATUS(2LL)
-#define PM_FILE PM_STATUS(1LL)
-#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0)
+#define PM_ENTRY_BYTES sizeof(pagemap_entry_t)
+#define PM_PFRAME_BITS 55
+#define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0)
+#define PM_SOFT_DIRTY BIT_ULL(55)
+#define PM_MMAP_EXCLUSIVE BIT_ULL(56)
+#define PM_FILE BIT_ULL(61)
+#define PM_SWAP BIT_ULL(62)
+#define PM_PRESENT BIT_ULL(63)
+
#define PM_END_OF_BUFFER 1
-static inline pagemap_entry_t make_pme(u64 val)
+static inline pagemap_entry_t make_pme(u64 frame, u64 flags)
{
- return (pagemap_entry_t) { .pme = val };
+ return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags };
}
static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
@@ -1011,7 +1027,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
while (addr < end) {
struct vm_area_struct *vma = find_vma(walk->mm, addr);
- pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
+ pagemap_entry_t pme = make_pme(0, 0);
/* End of address space hole, which we mark as non-present. */
unsigned long hole_end;
@@ -1031,7 +1047,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
/* Addresses in the VMA. */
if (vma->vm_flags & VM_SOFTDIRTY)
- pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY);
+ pme = make_pme(0, PM_SOFT_DIRTY);
for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
err = add_to_pagemap(addr, &pme, pm);
if (err)
@@ -1042,67 +1058,42 @@ out:
return err;
}
-static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
+static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
struct vm_area_struct *vma, unsigned long addr, pte_t pte)
{
- u64 frame, flags;
+ u64 frame = 0, flags = 0;
struct page *page = NULL;
- int flags2 = 0;
if (pte_present(pte)) {
- frame = pte_pfn(pte);
- flags = PM_PRESENT;
+ if (pm->show_pfn)
+ frame = pte_pfn(pte);
+ flags |= PM_PRESENT;
page = vm_normal_page(vma, addr, pte);
if (pte_soft_dirty(pte))
- flags2 |= __PM_SOFT_DIRTY;
+ flags |= PM_SOFT_DIRTY;
} else if (is_swap_pte(pte)) {
swp_entry_t entry;
if (pte_swp_soft_dirty(pte))
- flags2 |= __PM_SOFT_DIRTY;
+ flags |= PM_SOFT_DIRTY;
entry = pte_to_swp_entry(pte);
frame = swp_type(entry) |
(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
- flags = PM_SWAP;
+ flags |= PM_SWAP;
if (is_migration_entry(entry))
page = migration_entry_to_page(entry);
- } else {
- if (vma->vm_flags & VM_SOFTDIRTY)
- flags2 |= __PM_SOFT_DIRTY;
- *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
- return;
}
if (page && !PageAnon(page))
flags |= PM_FILE;
- if ((vma->vm_flags & VM_SOFTDIRTY))
- flags2 |= __PM_SOFT_DIRTY;
-
- *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
-}
+ if (page && page_mapcount(page) == 1)
+ flags |= PM_MMAP_EXCLUSIVE;
+ if (vma->vm_flags & VM_SOFTDIRTY)
+ flags |= PM_SOFT_DIRTY;
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
- pmd_t pmd, int offset, int pmd_flags2)
-{
- /*
- * Currently pmd for thp is always present because thp can not be
- * swapped-out, migrated, or HWPOISONed (split in such cases instead.)
- * This if-check is just to prepare for future implementation.
- */
- if (pmd_present(pmd))
- *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
- | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
- else
- *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2));
-}
-#else
-static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
- pmd_t pmd, int offset, int pmd_flags2)
-{
+ return make_pme(frame, flags);
}
-#endif
-static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->vma;
@@ -1111,41 +1102,58 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t *pte, *orig_pte;
int err = 0;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
- int pmd_flags2;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) {
+ u64 flags = 0, frame = 0;
+ pmd_t pmd = *pmdp;
- if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
- pmd_flags2 = __PM_SOFT_DIRTY;
- else
- pmd_flags2 = 0;
+ if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd))
+ flags |= PM_SOFT_DIRTY;
+
+ /*
+ * Currently pmd for thp is always present because thp
+ * can not be swapped-out, migrated, or HWPOISONed
+ * (split in such cases instead.)
+ * This if-check is just to prepare for future implementation.
+ */
+ if (pmd_present(pmd)) {
+ struct page *page = pmd_page(pmd);
+
+ if (page_mapcount(page) == 1)
+ flags |= PM_MMAP_EXCLUSIVE;
+
+ flags |= PM_PRESENT;
+ if (pm->show_pfn)
+ frame = pmd_pfn(pmd) +
+ ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ }
for (; addr != end; addr += PAGE_SIZE) {
- unsigned long offset;
- pagemap_entry_t pme;
+ pagemap_entry_t pme = make_pme(frame, flags);
- offset = (addr & ~PAGEMAP_WALK_MASK) >>
- PAGE_SHIFT;
- thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2);
err = add_to_pagemap(addr, &pme, pm);
if (err)
break;
+ if (pm->show_pfn && (flags & PM_PRESENT))
+ frame++;
}
spin_unlock(ptl);
return err;
}
- if (pmd_trans_unstable(pmd))
+ if (pmd_trans_unstable(pmdp))
return 0;
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
* We can assume that @vma always points to a valid one and @end never
* goes beyond vma->vm_end.
*/
- orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl);
for (; addr < end; pte++, addr += PAGE_SIZE) {
pagemap_entry_t pme;
- pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
+ pme = pte_to_pagemap_entry(pm, vma, addr, *pte);
err = add_to_pagemap(addr, &pme, pm);
if (err)
break;
@@ -1158,40 +1166,44 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
}
#ifdef CONFIG_HUGETLB_PAGE
-static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
- pte_t pte, int offset, int flags2)
-{
- if (pte_present(pte))
- *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) |
- PM_STATUS2(pm->v2, flags2) |
- PM_PRESENT);
- else
- *pme = make_pme(PM_NOT_PRESENT(pm->v2) |
- PM_STATUS2(pm->v2, flags2));
-}
-
/* This function walks within one hugetlb entry in the single call */
-static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
+static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
struct pagemapread *pm = walk->private;
struct vm_area_struct *vma = walk->vma;
+ u64 flags = 0, frame = 0;
int err = 0;
- int flags2;
- pagemap_entry_t pme;
+ pte_t pte;
if (vma->vm_flags & VM_SOFTDIRTY)
- flags2 = __PM_SOFT_DIRTY;
- else
- flags2 = 0;
+ flags |= PM_SOFT_DIRTY;
+
+ pte = huge_ptep_get(ptep);
+ if (pte_present(pte)) {
+ struct page *page = pte_page(pte);
+
+ if (!PageAnon(page))
+ flags |= PM_FILE;
+
+ if (page_mapcount(page) == 1)
+ flags |= PM_MMAP_EXCLUSIVE;
+
+ flags |= PM_PRESENT;
+ if (pm->show_pfn)
+ frame = pte_pfn(pte) +
+ ((addr & ~hmask) >> PAGE_SHIFT);
+ }
for (; addr != end; addr += PAGE_SIZE) {
- int offset = (addr & ~hmask) >> PAGE_SHIFT;
- huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2);
+ pagemap_entry_t pme = make_pme(frame, flags);
+
err = add_to_pagemap(addr, &pme, pm);
if (err)
return err;
+ if (pm->show_pfn && (flags & PM_PRESENT))
+ frame++;
}
cond_resched();
@@ -1209,7 +1221,9 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
* Bits 0-54 page frame number (PFN) if present
* Bits 0-4 swap type if swapped
* Bits 5-54 swap offset if swapped
- * Bits 55-60 page shift (page size = 1<<page shift)
+ * Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt)
+ * Bit 56 page exclusively mapped
+ * Bits 57-60 zero
* Bit 61 page is file-page or shared-anon
* Bit 62 page swapped
* Bit 63 page present
@@ -1227,42 +1241,37 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
static ssize_t pagemap_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
- struct task_struct *task = get_proc_task(file_inode(file));
- struct mm_struct *mm;
+ struct mm_struct *mm = file->private_data;
struct pagemapread pm;
- int ret = -ESRCH;
struct mm_walk pagemap_walk = {};
unsigned long src;
unsigned long svpfn;
unsigned long start_vaddr;
unsigned long end_vaddr;
- int copied = 0;
+ int ret = 0, copied = 0;
- if (!task)
+ if (!mm || !atomic_inc_not_zero(&mm->mm_users))
goto out;
ret = -EINVAL;
/* file position must be aligned */
if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
- goto out_task;
+ goto out_mm;
ret = 0;
if (!count)
- goto out_task;
+ goto out_mm;
+
+ /* do not disclose physical addresses: attack vector */
+ pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN);
- pm.v2 = soft_dirty_cleared;
pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY);
ret = -ENOMEM;
if (!pm.buffer)
- goto out_task;
-
- mm = mm_access(task, PTRACE_MODE_READ);
- ret = PTR_ERR(mm);
- if (!mm || IS_ERR(mm))
- goto out_free;
+ goto out_mm;
- pagemap_walk.pmd_entry = pagemap_pte_range;
+ pagemap_walk.pmd_entry = pagemap_pmd_range;
pagemap_walk.pte_hole = pagemap_pte_hole;
#ifdef CONFIG_HUGETLB_PAGE
pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
@@ -1273,10 +1282,10 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
src = *ppos;
svpfn = src / PM_ENTRY_BYTES;
start_vaddr = svpfn << PAGE_SHIFT;
- end_vaddr = TASK_SIZE_OF(task);
+ end_vaddr = mm->task_size;
/* watch out for wraparound */
- if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT)
+ if (svpfn > mm->task_size >> PAGE_SHIFT)
start_vaddr = end_vaddr;
/*
@@ -1303,7 +1312,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
len = min(count, PM_ENTRY_BYTES * pm.pos);
if (copy_to_user(buf, pm.buffer, len)) {
ret = -EFAULT;
- goto out_mm;
+ goto out_free;
}
copied += len;
buf += len;
@@ -1313,24 +1322,31 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
if (!ret || ret == PM_END_OF_BUFFER)
ret = copied;
-out_mm:
- mmput(mm);
out_free:
kfree(pm.buffer);
-out_task:
- put_task_struct(task);
+out_mm:
+ mmput(mm);
out:
return ret;
}
static int pagemap_open(struct inode *inode, struct file *file)
{
- /* do not disclose physical addresses: attack vector */
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
- "to stop being page-shift some time soon. See the "
- "linux/Documentation/vm/pagemap.txt for details.\n");
+ struct mm_struct *mm;
+
+ mm = proc_mem_open(inode, PTRACE_MODE_READ);
+ if (IS_ERR(mm))
+ return PTR_ERR(mm);
+ file->private_data = mm;
+ return 0;
+}
+
+static int pagemap_release(struct inode *inode, struct file *file)
+{
+ struct mm_struct *mm = file->private_data;
+
+ if (mm)
+ mmdrop(mm);
return 0;
}
@@ -1338,6 +1354,7 @@ const struct file_operations proc_pagemap_operations = {
.llseek = mem_lseek, /* borrow this */
.read = pagemap_read,
.open = pagemap_open,
+ .release = pagemap_release,
};
#endif /* CONFIG_PROC_PAGE_MONITOR */
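With the bit layout documented in the comment above (PFN or swap info in bits 0-54, soft-dirty in bit 55, exclusive-mapped in bit 56, file/shared-anon in bit 61, swapped in bit 62, present in bit 63), a pagemap entry can be decoded in userspace as sketched below; note that after this change the PFN field reads as zero unless the opener has CAP_SYS_ADMIN.

#include <stdint.h>
#include <stdio.h>

#define PM_PFRAME_MASK		((UINT64_C(1) << 55) - 1)	/* bits 0-54 */
#define PM_SOFT_DIRTY		(UINT64_C(1) << 55)
#define PM_MMAP_EXCLUSIVE	(UINT64_C(1) << 56)
#define PM_FILE			(UINT64_C(1) << 61)
#define PM_SWAP			(UINT64_C(1) << 62)
#define PM_PRESENT		(UINT64_C(1) << 63)

/* Userspace sketch: print one /proc/PID/pagemap entry. */
static void decode_pagemap_entry(uint64_t pme)
{
	uint64_t frame = pme & PM_PFRAME_MASK;	/* zero without CAP_SYS_ADMIN */

	if (pme & PM_PRESENT)
		printf("pfn=%llu%s%s%s\n", (unsigned long long)frame,
		       (pme & PM_MMAP_EXCLUSIVE) ? " exclusive" : "",
		       (pme & PM_SOFT_DIRTY) ? " soft-dirty" : "",
		       (pme & PM_FILE) ? " file" : "");
	else if (pme & PM_SWAP)
		printf("swap type=%llu offset=%llu\n",
		       (unsigned long long)(frame & 0x1f),
		       (unsigned long long)(frame >> 5));
	else
		printf("not present\n");
}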
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 0e4cf728126f..4a62fe8cc3bf 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -714,18 +714,20 @@ static int reiserfs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",acl");
if (REISERFS_SB(s)->s_jdev)
- seq_printf(seq, ",jdev=%s", REISERFS_SB(s)->s_jdev);
+ seq_show_option(seq, "jdev", REISERFS_SB(s)->s_jdev);
if (journal->j_max_commit_age != journal->j_default_max_commit_age)
seq_printf(seq, ",commit=%d", journal->j_max_commit_age);
#ifdef CONFIG_QUOTA
if (REISERFS_SB(s)->s_qf_names[USRQUOTA])
- seq_printf(seq, ",usrjquota=%s", REISERFS_SB(s)->s_qf_names[USRQUOTA]);
+ seq_show_option(seq, "usrjquota",
+ REISERFS_SB(s)->s_qf_names[USRQUOTA]);
else if (opts & (1 << REISERFS_USRQUOTA))
seq_puts(seq, ",usrquota");
if (REISERFS_SB(s)->s_qf_names[GRPQUOTA])
- seq_printf(seq, ",grpjquota=%s", REISERFS_SB(s)->s_qf_names[GRPQUOTA]);
+ seq_show_option(seq, "grpjquota",
+ REISERFS_SB(s)->s_qf_names[GRPQUOTA]);
else if (opts & (1 << REISERFS_GRPQUOTA))
seq_puts(seq, ",grpquota");
if (REISERFS_SB(s)->s_jquota_fmt) {
diff --git a/fs/seq_file.c b/fs/seq_file.c
index ce9e39fd5daf..263b125dbcf4 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -12,6 +12,7 @@
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/mm.h>
+#include <linux/printk.h>
#include <asm/uaccess.h>
#include <asm/page.h>
@@ -773,6 +774,47 @@ void seq_pad(struct seq_file *m, char c)
}
EXPORT_SYMBOL(seq_pad);
+/* A complete analogue of print_hex_dump() */
+void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
+ int rowsize, int groupsize, const void *buf, size_t len,
+ bool ascii)
+{
+ const u8 *ptr = buf;
+ int i, linelen, remaining = len;
+ int ret;
+
+ if (rowsize != 16 && rowsize != 32)
+ rowsize = 16;
+
+ for (i = 0; i < len && !seq_has_overflowed(m); i += rowsize) {
+ linelen = min(remaining, rowsize);
+ remaining -= rowsize;
+
+ switch (prefix_type) {
+ case DUMP_PREFIX_ADDRESS:
+ seq_printf(m, "%s%p: ", prefix_str, ptr + i);
+ break;
+ case DUMP_PREFIX_OFFSET:
+ seq_printf(m, "%s%.8x: ", prefix_str, i);
+ break;
+ default:
+ seq_printf(m, "%s", prefix_str);
+ break;
+ }
+
+ ret = hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize,
+ m->buf + m->count, m->size - m->count,
+ ascii);
+ if (ret >= m->size - m->count) {
+ seq_set_overflow(m);
+ } else {
+ m->count += ret;
+ seq_putc(m, '\n');
+ }
+ }
+}
+EXPORT_SYMBOL(seq_hex_dump);
+
struct list_head *seq_list_start(struct list_head *head, loff_t pos)
{
struct list_head *lh;
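seq_hex_dump() added above mirrors print_hex_dump() but writes into the seq_file buffer and marks the file as overflowed when a line does not fit. A hedged sketch of a show() callback using it, with a hypothetical device structure:

#include <linux/printk.h>	/* DUMP_PREFIX_OFFSET */
#include <linux/seq_file.h>

struct example_dev {			/* hypothetical driver state */
	const void *fw_buf;
	size_t fw_len;
};

static int example_fw_show(struct seq_file *m, void *v)
{
	struct example_dev *dev = m->private;

	seq_printf(m, "firmware (%zu bytes):\n", dev->fw_len);
	seq_hex_dump(m, "  ", DUMP_PREFIX_OFFSET, 16, 1,
		     dev->fw_buf, dev->fw_len, true);
	return 0;
}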
diff --git a/fs/super.c b/fs/super.c
index 954aeb80e202..3f3cb04c9aed 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -49,8 +49,8 @@ static char *sb_writers_name[SB_FREEZE_LEVELS] = {
* One thing we have to be careful of with a per-sb shrinker is that we don't
* drop the last active reference to the superblock from within the shrinker.
* If that happens we could trigger unregistering the shrinker from within the
- * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we
- * take a passive reference to the superblock to avoid this from occurring.
+ * shrinker path. Hence we take a passive reference to the superblock to avoid
+ * this from occurring.
*/
static unsigned long super_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
@@ -121,8 +121,8 @@ static unsigned long super_cache_count(struct shrinker *shrink,
* Don't call trylock_super as it is a potential
* scalability bottleneck. The counts could get updated
* between super_cache_count and super_cache_scan anyway.
- * Call to super_cache_count with shrinker_rwsem held
- * ensures the safety of call to list_lru_shrink_count() and
+ * SRCU guarantees object validity across this call -- thus
+ * it is safe to call list_lru_shrink_count() and
* s_op->nr_cached_objects().
*/
if (sb->s_op && sb->s_op->nr_cached_objects)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
new file mode 100644
index 000000000000..634e676072cb
--- /dev/null
+++ b/fs/userfaultfd.c
@@ -0,0 +1,1330 @@
+/*
+ * fs/userfaultfd.c
+ *
+ * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
+ * Copyright (C) 2008-2009 Red Hat, Inc.
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Some part derived from fs/eventfd.c (anon inode setup) and
+ * mm/ksm.c (mm hashing).
+ */
+
+#include <linux/hashtable.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/file.h>
+#include <linux/bug.h>
+#include <linux/anon_inodes.h>
+#include <linux/syscalls.h>
+#include <linux/userfaultfd_k.h>
+#include <linux/mempolicy.h>
+#include <linux/ioctl.h>
+#include <linux/security.h>
+
+static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
+
+enum userfaultfd_state {
+ UFFD_STATE_WAIT_API,
+ UFFD_STATE_RUNNING,
+};
+
+/*
+ * Start with fault_pending_wqh and fault_wqh so they're more likely
+ * to be in the same cacheline.
+ */
+struct userfaultfd_ctx {
+ /* waitqueue head for the pending (i.e. not read) userfaults */
+ wait_queue_head_t fault_pending_wqh;
+ /* waitqueue head for the userfaults */
+ wait_queue_head_t fault_wqh;
+ /* waitqueue head for the pseudo fd to wakeup poll/read */
+ wait_queue_head_t fd_wqh;
+ /* a refile sequence protected by fault_pending_wqh lock */
+ struct seqcount refile_seq;
+ /* pseudo fd refcounting */
+ atomic_t refcount;
+ /* userfaultfd syscall flags */
+ unsigned int flags;
+ /* state machine */
+ enum userfaultfd_state state;
+ /* released */
+ bool released;
+ /* mm with one or more vmas attached to this userfaultfd_ctx */
+ struct mm_struct *mm;
+};
+
+struct userfaultfd_wait_queue {
+ struct uffd_msg msg;
+ wait_queue_t wq;
+ struct userfaultfd_ctx *ctx;
+};
+
+struct userfaultfd_wake_range {
+ unsigned long start;
+ unsigned long len;
+};
+
+static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode,
+ int wake_flags, void *key)
+{
+ struct userfaultfd_wake_range *range = key;
+ int ret;
+ struct userfaultfd_wait_queue *uwq;
+ unsigned long start, len;
+
+ uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
+ ret = 0;
+ /* len == 0 means wake all */
+ start = range->start;
+ len = range->len;
+ if (len && (start > uwq->msg.arg.pagefault.address ||
+ start + len <= uwq->msg.arg.pagefault.address))
+ goto out;
+ ret = wake_up_state(wq->private, mode);
+ if (ret)
+ /*
+ * Wake only once, autoremove behavior.
+ *
+ * After the effect of list_del_init is visible to the
+ * other CPUs, the waitqueue may disappear from under
+ * us, see the !list_empty_careful() in
+ * handle_userfault(). try_to_wake_up() has an
+ * implicit smp_mb__before_spinlock, and the
+ * wq->private is read before calling the extern
+ * function "wake_up_state" (which in turn calls
+ * try_to_wake_up). While the spin_lock;spin_unlock;
+ * wouldn't be enough, the smp_mb__before_spinlock is
+ * enough to avoid an explicit smp_mb() here.
+ */
+ list_del_init(&wq->task_list);
+out:
+ return ret;
+}
+
+/**
+ * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
+ * context.
+ * @ctx: [in] Pointer to the userfaultfd context.
+ *
+ * The context refcount must already be non-zero; this helper only increments it.
+ */
+static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
+{
+ if (!atomic_inc_not_zero(&ctx->refcount))
+ BUG();
+}
+
+/**
+ * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
+ * context.
+ * @ctx: [in] Pointer to userfaultfd context.
+ *
+ * The userfaultfd context reference must have been previously acquired either
+ * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget().
+ */
+static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
+{
+ if (atomic_dec_and_test(&ctx->refcount)) {
+ VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
+ VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
+ VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
+ VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
+ VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
+ VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
+ mmput(ctx->mm);
+ kmem_cache_free(userfaultfd_ctx_cachep, ctx);
+ }
+}
+
+static inline void msg_init(struct uffd_msg *msg)
+{
+ BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
+ /*
+ * Must use memset to zero out the paddings or kernel data is
+ * leaked to userland.
+ */
+ memset(msg, 0, sizeof(struct uffd_msg));
+}
+
+static inline struct uffd_msg userfault_msg(unsigned long address,
+ unsigned int flags,
+ unsigned long reason)
+{
+ struct uffd_msg msg;
+ msg_init(&msg);
+ msg.event = UFFD_EVENT_PAGEFAULT;
+ msg.arg.pagefault.address = address;
+ if (flags & FAULT_FLAG_WRITE)
+ /*
+ * If UFFD_FEATURE_PAGEFAULT_FLAG_WRITE was set in the
+ * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE
+ * was not set in a UFFD_EVENT_PAGEFAULT, it means it
+ * was a read fault, otherwise if set it means it's
+ * a write fault.
+ */
+ msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
+ if (reason & VM_UFFD_WP)
+ /*
+ * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
+ * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was
+ * not set in a UFFD_EVENT_PAGEFAULT, it means it was
+ * a missing fault, otherwise if set it means it's a
+ * write protect fault.
+ */
+ msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
+ return msg;
+}
+
+/*
+ * Verify the pagetables are still not ok after having registered into
+ * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
+ * userfault that has already been resolved, if userfaultfd_read and
+ * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
+ * threads.
+ */
+static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
+ unsigned long address,
+ unsigned long flags,
+ unsigned long reason)
+{
+ struct mm_struct *mm = ctx->mm;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd, _pmd;
+ pte_t *pte;
+ bool ret = true;
+
+ VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ goto out;
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ goto out;
+ pmd = pmd_offset(pud, address);
+ /*
+ * READ_ONCE must function as a barrier with narrower scope
+ * and it must be equivalent to:
+ * _pmd = *pmd; barrier();
+ *
+ * This is to deal with the instability (as in
+ * pmd_trans_unstable) of the pmd.
+ */
+ _pmd = READ_ONCE(*pmd);
+ if (!pmd_present(_pmd))
+ goto out;
+
+ ret = false;
+ if (pmd_trans_huge(_pmd))
+ goto out;
+
+ /*
+ * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it
+ * and use the standard pte_offset_map() instead of parsing _pmd.
+ */
+ pte = pte_offset_map(pmd, address);
+ /*
+ * Lockless access: we're in a wait_event so it's ok if it
+ * changes under us.
+ */
+ if (pte_none(*pte))
+ ret = true;
+ pte_unmap(pte);
+
+out:
+ return ret;
+}
+
+/*
+ * The locking rules involved in returning VM_FAULT_RETRY depending on
+ * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
+ * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
+ * recommendation in __lock_page_or_retry is not an understatement.
+ *
+ * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_sem must be released
+ * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
+ * not set.
+ *
+ * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
+ * set, VM_FAULT_RETRY can still be returned if and only if there are
+ * fatal_signal_pending()s, and the mmap_sem must be released before
+ * returning it.
+ */
+int handle_userfault(struct vm_area_struct *vma, unsigned long address,
+ unsigned int flags, unsigned long reason)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct userfaultfd_ctx *ctx;
+ struct userfaultfd_wait_queue uwq;
+ int ret;
+ bool must_wait, return_to_userland;
+
+ BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
+
+ ret = VM_FAULT_SIGBUS;
+ ctx = vma->vm_userfaultfd_ctx.ctx;
+ if (!ctx)
+ goto out;
+
+ BUG_ON(ctx->mm != mm);
+
+ VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
+ VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));
+
+ /*
+ * If it's already released don't get it. This avoids looping
+ * in __get_user_pages if userfaultfd_release waits on the
+ * caller of handle_userfault to release the mmap_sem.
+ */
+ if (unlikely(ACCESS_ONCE(ctx->released)))
+ goto out;
+
+ /*
+ * Check that we can return VM_FAULT_RETRY.
+ *
+ * NOTE: it should become possible to return VM_FAULT_RETRY
+ * even if FAULT_FLAG_TRIED is set without leading to gup()
+ * -EBUSY failures, if the userfaultfd is to be extended for
+ * VM_UFFD_WP tracking and we intend to arm the userfault
+ * without first stopping userland access to the memory. For
+ * VM_UFFD_MISSING userfaults this is enough for now.
+ */
+ if (unlikely(!(flags & FAULT_FLAG_ALLOW_RETRY))) {
+ /*
+ * Validate the invariant that nowait must allow retry
+ * to be sure not to return SIGBUS erroneously on
+ * nowait invocations.
+ */
+ BUG_ON(flags & FAULT_FLAG_RETRY_NOWAIT);
+#ifdef CONFIG_DEBUG_VM
+ if (printk_ratelimit()) {
+ printk(KERN_WARNING
+ "FAULT_FLAG_ALLOW_RETRY missing %x\n", flags);
+ dump_stack();
+ }
+#endif
+ goto out;
+ }
+
+ /*
+ * Handle nowait, not much to do other than tell it to retry
+ * and wait.
+ */
+ ret = VM_FAULT_RETRY;
+ if (flags & FAULT_FLAG_RETRY_NOWAIT)
+ goto out;
+
+ /* take the reference before dropping the mmap_sem */
+ userfaultfd_ctx_get(ctx);
+
+ init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
+ uwq.wq.private = current;
+ uwq.msg = userfault_msg(address, flags, reason);
+ uwq.ctx = ctx;
+
+ return_to_userland = (flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
+ (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE);
+
+ spin_lock(&ctx->fault_pending_wqh.lock);
+ /*
+ * After the __add_wait_queue the uwq is visible to userland
+ * through poll/read().
+ */
+ __add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
+ /*
+ * The smp_mb() after __set_current_state prevents the reads
+ * following the spin_unlock from happening before the list_add in
+ * __add_wait_queue.
+ */
+ set_current_state(return_to_userland ? TASK_INTERRUPTIBLE :
+ TASK_KILLABLE);
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+
+ must_wait = userfaultfd_must_wait(ctx, address, flags, reason);
+ up_read(&mm->mmap_sem);
+
+ if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
+ (return_to_userland ? !signal_pending(current) :
+ !fatal_signal_pending(current)))) {
+ wake_up_poll(&ctx->fd_wqh, POLLIN);
+ schedule();
+ ret |= VM_FAULT_MAJOR;
+ }
+
+ __set_current_state(TASK_RUNNING);
+
+ if (return_to_userland) {
+ if (signal_pending(current) &&
+ !fatal_signal_pending(current)) {
+ /*
+ * If we got a SIGSTOP or SIGCONT and this is
+ * a normal userland page fault, just let
+ * userland return so the signal will be
+ * handled and gdb debugging works. The page
+ * fault code immediately after we return from
+ * this function is going to release the
+ * mmap_sem and it's not depending on it
+ * (unlike gup would if we were not to return
+ * VM_FAULT_RETRY).
+ *
+ * If a fatal signal is pending we still take
+ * the streamlined VM_FAULT_RETRY failure path
+ * and there's no need to retake the mmap_sem
+ * in such case.
+ */
+ down_read(&mm->mmap_sem);
+ ret = 0;
+ }
+ }
+
+ /*
+ * Here we race with the list_del; list_add in
+ * userfaultfd_ctx_read(), however because we don't ever run
+ * list_del_init() to refile across the two lists, the prev
+ * and next pointers will never point to self. list_add also
+ * would never let either of the two pointers point to
+ * self. So list_empty_careful won't risk seeing both pointers
+ * pointing to self at any time during the list refile. The
+ * only case where list_del_init() is called is the full
+ * removal in the wake function and there we don't re-list_add
+ * and it's fine not to block on the spinlock. The uwq on this
+ * kernel stack can be released after the list_del_init.
+ */
+ if (!list_empty_careful(&uwq.wq.task_list)) {
+ spin_lock(&ctx->fault_pending_wqh.lock);
+ /*
+ * No need of list_del_init(), the uwq on the stack
+ * will be freed shortly anyway.
+ */
+ list_del(&uwq.wq.task_list);
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+ }
+
+ /*
+ * ctx may go away after this if the userfault pseudo fd is
+ * already released.
+ */
+ userfaultfd_ctx_put(ctx);
+
+out:
+ return ret;
+}
+
+static int userfaultfd_release(struct inode *inode, struct file *file)
+{
+ struct userfaultfd_ctx *ctx = file->private_data;
+ struct mm_struct *mm = ctx->mm;
+ struct vm_area_struct *vma, *prev;
+ /* len == 0 means wake all */
+ struct userfaultfd_wake_range range = { .len = 0, };
+ unsigned long new_flags;
+
+ ACCESS_ONCE(ctx->released) = true;
+
+ /*
+ * Flush page faults out of all CPUs. NOTE: all page faults
+ * must be retried without returning VM_FAULT_SIGBUS if
+ * userfaultfd_ctx_get() succeeds but vma->vm_userfaultfd_ctx
+ * changes while handle_userfault released the mmap_sem. So
+ * it's critical that released is set to true (above), before
+ * taking the mmap_sem for writing.
+ */
+ down_write(&mm->mmap_sem);
+ prev = NULL;
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ cond_resched();
+ BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
+ !!(vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
+ if (vma->vm_userfaultfd_ctx.ctx != ctx) {
+ prev = vma;
+ continue;
+ }
+ new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
+ prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
+ new_flags, vma->anon_vma,
+ vma->vm_file, vma->vm_pgoff,
+ vma_policy(vma),
+ NULL_VM_UFFD_CTX);
+ if (prev)
+ vma = prev;
+ else
+ prev = vma;
+ vma->vm_flags = new_flags;
+ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+ }
+ up_write(&mm->mmap_sem);
+
+ /*
+ * After no new page faults can wait on this fault_*wqh, flush
+ * the last page faults that may have been already waiting on
+ * the fault_*wqh.
+ */
+ spin_lock(&ctx->fault_pending_wqh.lock);
+ __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0, &range);
+ __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, &range);
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+
+ wake_up_poll(&ctx->fd_wqh, POLLHUP);
+ userfaultfd_ctx_put(ctx);
+ return 0;
+}
+
+/* fault_pending_wqh.lock must be held by the caller */
+static inline struct userfaultfd_wait_queue *find_userfault(
+ struct userfaultfd_ctx *ctx)
+{
+ wait_queue_t *wq;
+ struct userfaultfd_wait_queue *uwq;
+
+ VM_BUG_ON(!spin_is_locked(&ctx->fault_pending_wqh.lock));
+
+ uwq = NULL;
+ if (!waitqueue_active(&ctx->fault_pending_wqh))
+ goto out;
+ /* walk in reverse to provide FIFO behavior to read userfaults */
+ wq = list_last_entry(&ctx->fault_pending_wqh.task_list,
+ typeof(*wq), task_list);
+ uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
+out:
+ return uwq;
+}
+
+static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
+{
+ struct userfaultfd_ctx *ctx = file->private_data;
+ unsigned int ret;
+
+ poll_wait(file, &ctx->fd_wqh, wait);
+
+ switch (ctx->state) {
+ case UFFD_STATE_WAIT_API:
+ return POLLERR;
+ case UFFD_STATE_RUNNING:
+ /*
+ * poll() never guarantees that read won't block.
+ * userfaults can be woken before they're read().
+ */
+ if (unlikely(!(file->f_flags & O_NONBLOCK)))
+ return POLLERR;
+ /*
+ * Lockless access to see if there are pending faults;
+ * __pollwait's last action is the add_wait_queue but
+ * the spin_unlock would allow the waitqueue_active to
+ * pass above the actual list_add inside
+ * add_wait_queue critical section. So use a full
+ * memory barrier to serialize the list_add write of
+ * add_wait_queue() with the waitqueue_active read
+ * below.
+ */
+ ret = 0;
+ smp_mb();
+ if (waitqueue_active(&ctx->fault_pending_wqh))
+ ret = POLLIN;
+ return ret;
+ default:
+ BUG();
+ }
+}
+
+static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
+ struct uffd_msg *msg)
+{
+ ssize_t ret;
+ DECLARE_WAITQUEUE(wait, current);
+ struct userfaultfd_wait_queue *uwq;
+
+ /* always take the fd_wqh lock before the fault_pending_wqh lock */
+ spin_lock(&ctx->fd_wqh.lock);
+ __add_wait_queue(&ctx->fd_wqh, &wait);
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ spin_lock(&ctx->fault_pending_wqh.lock);
+ uwq = find_userfault(ctx);
+ if (uwq) {
+ /*
+ * Use a seqcount to repeat the lockless check
+ * in wake_userfault() to avoid missing
+ * wakeups because during the refile both
+ * waitqueue could become empty if this is the
+ * only userfault.
+ */
+ write_seqcount_begin(&ctx->refile_seq);
+
+ /*
+ * The fault_pending_wqh.lock prevents the uwq
+ * from disappearing from under us.
+ *
+ * Refile this userfault from
+ * fault_pending_wqh to fault_wqh, it's not
+ * pending anymore after we read it.
+ *
+ * Use list_del() by hand (as
+ * userfaultfd_wake_function also uses
+ * list_del_init() by hand) to be sure nobody
+ * changes __remove_wait_queue() to use
+ * list_del_init() in turn breaking the
+ * !list_empty_careful() check in
+ * handle_userfault(). The uwq->wq.task_list
+ * must never be empty at any time during the
+ * refile, or the waitqueue could disappear
+ * from under us. The "wait_queue_head_t"
+ * parameter of __remove_wait_queue() is unused
+ * anyway.
+ */
+ list_del(&uwq->wq.task_list);
+ __add_wait_queue(&ctx->fault_wqh, &uwq->wq);
+
+ write_seqcount_end(&ctx->refile_seq);
+
+ /* careful to always initialize msg if ret == 0 */
+ *msg = uwq->msg;
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+ ret = 0;
+ break;
+ }
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+ if (no_wait) {
+ ret = -EAGAIN;
+ break;
+ }
+ spin_unlock(&ctx->fd_wqh.lock);
+ schedule();
+ spin_lock(&ctx->fd_wqh.lock);
+ }
+ __remove_wait_queue(&ctx->fd_wqh, &wait);
+ __set_current_state(TASK_RUNNING);
+ spin_unlock(&ctx->fd_wqh.lock);
+
+ return ret;
+}
+
+static ssize_t userfaultfd_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct userfaultfd_ctx *ctx = file->private_data;
+ ssize_t _ret, ret = 0;
+ struct uffd_msg msg;
+ int no_wait = file->f_flags & O_NONBLOCK;
+
+ if (ctx->state == UFFD_STATE_WAIT_API)
+ return -EINVAL;
+
+ for (;;) {
+ if (count < sizeof(msg))
+ return ret ? ret : -EINVAL;
+ _ret = userfaultfd_ctx_read(ctx, no_wait, &msg);
+ if (_ret < 0)
+ return ret ? ret : _ret;
+ if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
+ return ret ? ret : -EFAULT;
+ ret += sizeof(msg);
+ buf += sizeof(msg);
+ count -= sizeof(msg);
+ /*
+ * Allow reading more than one fault at a time but only
+ * block if waiting for the very first one.
+ */
+ no_wait = O_NONBLOCK;
+ }
+}
+
+static void __wake_userfault(struct userfaultfd_ctx *ctx,
+ struct userfaultfd_wake_range *range)
+{
+ unsigned long start, end;
+
+ start = range->start;
+ end = range->start + range->len;
+
+ spin_lock(&ctx->fault_pending_wqh.lock);
+ /* wake all in the range and autoremove */
+ if (waitqueue_active(&ctx->fault_pending_wqh))
+ __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0,
+ range);
+ if (waitqueue_active(&ctx->fault_wqh))
+ __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, range);
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+}
+
+static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
+ struct userfaultfd_wake_range *range)
+{
+ unsigned seq;
+ bool need_wakeup;
+
+ /*
+ * To be sure waitqueue_active() is not reordered by the CPU
+ * before the pagetable update, use an explicit SMP memory
+ * barrier here. PT lock release or up_read(mmap_sem) only
+ * have release semantics, which can still allow the
+ * waitqueue_active() to be reordered before the pte update.
+ */
+ smp_mb();
+
+ /*
+ * Use waitqueue_active because it's very common to change
+ * the address space atomically even when there are no
+ * userfaults yet. So we take the spinlock only when we're
+ * sure we have userfaults to wake.
+ */
+ do {
+ seq = read_seqcount_begin(&ctx->refile_seq);
+ need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) ||
+ waitqueue_active(&ctx->fault_wqh);
+ cond_resched();
+ } while (read_seqcount_retry(&ctx->refile_seq, seq));
+ if (need_wakeup)
+ __wake_userfault(ctx, range);
+}
+
+static __always_inline int validate_range(struct mm_struct *mm,
+ __u64 start, __u64 len)
+{
+ __u64 task_size = mm->task_size;
+
+ if (start & ~PAGE_MASK)
+ return -EINVAL;
+ if (len & ~PAGE_MASK)
+ return -EINVAL;
+ if (!len)
+ return -EINVAL;
+ if (start < mmap_min_addr)
+ return -EINVAL;
+ if (start >= task_size)
+ return -EINVAL;
+ if (len > task_size - start)
+ return -EINVAL;
+ return 0;
+}
+
+static int userfaultfd_register(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ struct mm_struct *mm = ctx->mm;
+ struct vm_area_struct *vma, *prev, *cur;
+ int ret;
+ struct uffdio_register uffdio_register;
+ struct uffdio_register __user *user_uffdio_register;
+ unsigned long vm_flags, new_flags;
+ bool found;
+ unsigned long start, end, vma_end;
+
+ user_uffdio_register = (struct uffdio_register __user *) arg;
+
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_register, user_uffdio_register,
+ sizeof(uffdio_register)-sizeof(__u64)))
+ goto out;
+
+ ret = -EINVAL;
+ if (!uffdio_register.mode)
+ goto out;
+ if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING|
+ UFFDIO_REGISTER_MODE_WP))
+ goto out;
+ vm_flags = 0;
+ if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
+ vm_flags |= VM_UFFD_MISSING;
+ if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
+ vm_flags |= VM_UFFD_WP;
+ /*
+ * FIXME: remove the below error constraint by
+ * implementing the wprotect tracking mode.
+ */
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = validate_range(mm, uffdio_register.range.start,
+ uffdio_register.range.len);
+ if (ret)
+ goto out;
+
+ start = uffdio_register.range.start;
+ end = start + uffdio_register.range.len;
+
+ down_write(&mm->mmap_sem);
+ vma = find_vma_prev(mm, start, &prev);
+
+ ret = -ENOMEM;
+ if (!vma)
+ goto out_unlock;
+
+ /* check that there's at least one vma in the range */
+ ret = -EINVAL;
+ if (vma->vm_start >= end)
+ goto out_unlock;
+
+ /*
+ * Search for incompatible vmas.
+ *
+ * FIXME: this shall be relaxed later so that it doesn't fail
+ * on tmpfs backed vmas (in addition to the current allowance
+ * on anonymous vmas).
+ */
+ found = false;
+ for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
+ cond_resched();
+
+ BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
+ !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
+
+ /* check for incompatible vmas */
+ ret = -EINVAL;
+ if (cur->vm_ops)
+ goto out_unlock;
+
+ /*
+ * Check that this vma isn't already owned by a
+ * different userfaultfd. We can't allow more than one
+ * userfaultfd to own a single vma simultaneously or we
+ * wouldn't know which one to deliver the userfaults to.
+ */
+ ret = -EBUSY;
+ if (cur->vm_userfaultfd_ctx.ctx &&
+ cur->vm_userfaultfd_ctx.ctx != ctx)
+ goto out_unlock;
+
+ found = true;
+ }
+ BUG_ON(!found);
+
+ if (vma->vm_start < start)
+ prev = vma;
+
+ ret = 0;
+ do {
+ cond_resched();
+
+ BUG_ON(vma->vm_ops);
+ BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
+ vma->vm_userfaultfd_ctx.ctx != ctx);
+
+ /*
+ * Nothing to do: this vma is already registered into this
+ * userfaultfd and with the right tracking mode too.
+ */
+ if (vma->vm_userfaultfd_ctx.ctx == ctx &&
+ (vma->vm_flags & vm_flags) == vm_flags)
+ goto skip;
+
+ if (vma->vm_start > start)
+ start = vma->vm_start;
+ vma_end = min(end, vma->vm_end);
+
+ new_flags = (vma->vm_flags & ~vm_flags) | vm_flags;
+ prev = vma_merge(mm, prev, start, vma_end, new_flags,
+ vma->anon_vma, vma->vm_file, vma->vm_pgoff,
+ vma_policy(vma),
+ ((struct vm_userfaultfd_ctx){ ctx }));
+ if (prev) {
+ vma = prev;
+ goto next;
+ }
+ if (vma->vm_start < start) {
+ ret = split_vma(mm, vma, start, 1);
+ if (ret)
+ break;
+ }
+ if (vma->vm_end > end) {
+ ret = split_vma(mm, vma, end, 0);
+ if (ret)
+ break;
+ }
+ next:
+ /*
+ * In the vma_merge() successful mprotect-like case 8:
+ * the next vma was merged into the current one and
+ * the current one has not been updated yet.
+ */
+ vma->vm_flags = new_flags;
+ vma->vm_userfaultfd_ctx.ctx = ctx;
+
+ skip:
+ prev = vma;
+ start = vma->vm_end;
+ vma = vma->vm_next;
+ } while (vma && vma->vm_start < end);
+out_unlock:
+ up_write(&mm->mmap_sem);
+ if (!ret) {
+ /*
+ * Now that we scanned all vmas we can already tell
+ * userland which ioctl methods are guaranteed to
+ * succeed on this range.
+ */
+ if (put_user(UFFD_API_RANGE_IOCTLS,
+ &user_uffdio_register->ioctls))
+ ret = -EFAULT;
+ }
+out:
+ return ret;
+}
+
+static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ struct mm_struct *mm = ctx->mm;
+ struct vm_area_struct *vma, *prev, *cur;
+ int ret;
+ struct uffdio_range uffdio_unregister;
+ unsigned long new_flags;
+ bool found;
+ unsigned long start, end, vma_end;
+ const void __user *buf = (void __user *)arg;
+
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
+ goto out;
+
+ ret = validate_range(mm, uffdio_unregister.start,
+ uffdio_unregister.len);
+ if (ret)
+ goto out;
+
+ start = uffdio_unregister.start;
+ end = start + uffdio_unregister.len;
+
+ down_write(&mm->mmap_sem);
+ vma = find_vma_prev(mm, start, &prev);
+
+ ret = -ENOMEM;
+ if (!vma)
+ goto out_unlock;
+
+ /* check that there's at least one vma in the range */
+ ret = -EINVAL;
+ if (vma->vm_start >= end)
+ goto out_unlock;
+
+ /*
+ * Search for incompatible vmas.
+ *
+ * FIXME: this shall be relaxed later so that it doesn't fail
+ * on tmpfs backed vmas (in addition to the current allowance
+ * on anonymous vmas).
+ */
+ found = false;
+ ret = -EINVAL;
+ for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
+ cond_resched();
+
+ BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
+ !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
+
+ /*
+ * Check for incompatible vmas. Not strictly required
+ * here, as incompatible vmas cannot have a
+ * userfaultfd_ctx registered on them, but this
+ * provides stricter behavior to catch unregistration
+ * errors.
+ */
+ if (cur->vm_ops)
+ goto out_unlock;
+
+ found = true;
+ }
+ BUG_ON(!found);
+
+ if (vma->vm_start < start)
+ prev = vma;
+
+ ret = 0;
+ do {
+ cond_resched();
+
+ BUG_ON(vma->vm_ops);
+
+ /*
+ * Nothing to do: this vma is not registered with any
+ * userfaultfd, so there is nothing to unregister from it.
+ */
+ if (!vma->vm_userfaultfd_ctx.ctx)
+ goto skip;
+
+ if (vma->vm_start > start)
+ start = vma->vm_start;
+ vma_end = min(end, vma->vm_end);
+
+ new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
+ prev = vma_merge(mm, prev, start, vma_end, new_flags,
+ vma->anon_vma, vma->vm_file, vma->vm_pgoff,
+ vma_policy(vma),
+ NULL_VM_UFFD_CTX);
+ if (prev) {
+ vma = prev;
+ goto next;
+ }
+ if (vma->vm_start < start) {
+ ret = split_vma(mm, vma, start, 1);
+ if (ret)
+ break;
+ }
+ if (vma->vm_end > end) {
+ ret = split_vma(mm, vma, end, 0);
+ if (ret)
+ break;
+ }
+ next:
+ /*
+ * In the vma_merge() successful mprotect-like case 8:
+ * the next vma was merged into the current one and
+ * the current one has not been updated yet.
+ */
+ vma->vm_flags = new_flags;
+ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+
+ skip:
+ prev = vma;
+ start = vma->vm_end;
+ vma = vma->vm_next;
+ } while (vma && vma->vm_start < end);
+out_unlock:
+ up_write(&mm->mmap_sem);
+out:
+ return ret;
+}
+
+/*
+ * userfaultfd_wake may be used in combination with the
+ * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
+ */
+static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ int ret;
+ struct uffdio_range uffdio_wake;
+ struct userfaultfd_wake_range range;
+ const void __user *buf = (void __user *)arg;
+
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
+ goto out;
+
+ ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
+ if (ret)
+ goto out;
+
+ range.start = uffdio_wake.start;
+ range.len = uffdio_wake.len;
+
+ /*
+ * len == 0 means wake all and we don't want to wake all here,
+ * so check it again to be sure.
+ */
+ VM_BUG_ON(!range.len);
+
+ wake_userfault(ctx, &range);
+ ret = 0;
+
+out:
+ return ret;
+}
+
+static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ __s64 ret;
+ struct uffdio_copy uffdio_copy;
+ struct uffdio_copy __user *user_uffdio_copy;
+ struct userfaultfd_wake_range range;
+
+ user_uffdio_copy = (struct uffdio_copy __user *) arg;
+
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_copy, user_uffdio_copy,
+ /* don't copy "copy" last field */
+ sizeof(uffdio_copy)-sizeof(__s64)))
+ goto out;
+
+ ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
+ if (ret)
+ goto out;
+ /*
+ * double check for wraparound just in case. copy_from_user()
+ * will later check uffdio_copy.src + uffdio_copy.len to fit
+ * in the userland range.
+ */
+ ret = -EINVAL;
+ if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
+ goto out;
+ if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE)
+ goto out;
+
+ ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
+ uffdio_copy.len);
+ if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
+ return -EFAULT;
+ if (ret < 0)
+ goto out;
+ BUG_ON(!ret);
+ /* len == 0 would wake all */
+ range.len = ret;
+ if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
+ range.start = uffdio_copy.dst;
+ wake_userfault(ctx, &range);
+ }
+ ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
+out:
+ return ret;
+}
+
+static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ __s64 ret;
+ struct uffdio_zeropage uffdio_zeropage;
+ struct uffdio_zeropage __user *user_uffdio_zeropage;
+ struct userfaultfd_wake_range range;
+
+ user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
+
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
+ /* don't copy "zeropage" last field */
+ sizeof(uffdio_zeropage)-sizeof(__s64)))
+ goto out;
+
+ ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
+ uffdio_zeropage.range.len);
+ if (ret)
+ goto out;
+ ret = -EINVAL;
+ if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
+ goto out;
+
+ ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
+ uffdio_zeropage.range.len);
+ if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
+ return -EFAULT;
+ if (ret < 0)
+ goto out;
+ /* len == 0 would wake all */
+ BUG_ON(!ret);
+ range.len = ret;
+ if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
+ range.start = uffdio_zeropage.range.start;
+ wake_userfault(ctx, &range);
+ }
+ ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
+out:
+ return ret;
+}
+
+/*
+ * userland asks for a certain API version and we return which bits
+ * and ioctl commands are implemented in this kernel for that API
+ * version, or -EINVAL if it is unknown.
+ */
+static int userfaultfd_api(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ struct uffdio_api uffdio_api;
+ void __user *buf = (void __user *)arg;
+ int ret;
+
+ ret = -EINVAL;
+ if (ctx->state != UFFD_STATE_WAIT_API)
+ goto out;
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
+ goto out;
+ if (uffdio_api.api != UFFD_API || uffdio_api.features) {
+ memset(&uffdio_api, 0, sizeof(uffdio_api));
+ if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
+ goto out;
+ ret = -EINVAL;
+ goto out;
+ }
+ uffdio_api.features = UFFD_API_FEATURES;
+ uffdio_api.ioctls = UFFD_API_IOCTLS;
+ ret = -EFAULT;
+ if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
+ goto out;
+ ctx->state = UFFD_STATE_RUNNING;
+ ret = 0;
+out:
+ return ret;
+}
+
+static long userfaultfd_ioctl(struct file *file, unsigned cmd,
+ unsigned long arg)
+{
+ int ret = -EINVAL;
+ struct userfaultfd_ctx *ctx = file->private_data;
+
+ if (cmd != UFFDIO_API && ctx->state == UFFD_STATE_WAIT_API)
+ return -EINVAL;
+
+ switch(cmd) {
+ case UFFDIO_API:
+ ret = userfaultfd_api(ctx, arg);
+ break;
+ case UFFDIO_REGISTER:
+ ret = userfaultfd_register(ctx, arg);
+ break;
+ case UFFDIO_UNREGISTER:
+ ret = userfaultfd_unregister(ctx, arg);
+ break;
+ case UFFDIO_WAKE:
+ ret = userfaultfd_wake(ctx, arg);
+ break;
+ case UFFDIO_COPY:
+ ret = userfaultfd_copy(ctx, arg);
+ break;
+ case UFFDIO_ZEROPAGE:
+ ret = userfaultfd_zeropage(ctx, arg);
+ break;
+ }
+ return ret;
+}
+
+#ifdef CONFIG_PROC_FS
+static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
+{
+ struct userfaultfd_ctx *ctx = f->private_data;
+ wait_queue_t *wq;
+ struct userfaultfd_wait_queue *uwq;
+ unsigned long pending = 0, total = 0;
+
+ spin_lock(&ctx->fault_pending_wqh.lock);
+ list_for_each_entry(wq, &ctx->fault_pending_wqh.task_list, task_list) {
+ uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
+ pending++;
+ total++;
+ }
+ list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) {
+ uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
+ total++;
+ }
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+
+ /*
+ * If more protocols are added in the future, they will all be shown
+ * separated by a space, like this:
+ * protocols: aa:... bb:...
+ */
+ seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
+ pending, total, UFFD_API, UFFD_API_FEATURES,
+ UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
+}
+#endif
+
+static const struct file_operations userfaultfd_fops = {
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = userfaultfd_show_fdinfo,
+#endif
+ .release = userfaultfd_release,
+ .poll = userfaultfd_poll,
+ .read = userfaultfd_read,
+ .unlocked_ioctl = userfaultfd_ioctl,
+ .compat_ioctl = userfaultfd_ioctl,
+ .llseek = noop_llseek,
+};
+
+static void init_once_userfaultfd_ctx(void *mem)
+{
+ struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem;
+
+ init_waitqueue_head(&ctx->fault_pending_wqh);
+ init_waitqueue_head(&ctx->fault_wqh);
+ init_waitqueue_head(&ctx->fd_wqh);
+ seqcount_init(&ctx->refile_seq);
+}
+
+/**
+ * userfaultfd_file_create - Creates a userfaultfd file pointer.
+ * @flags: Flags for the userfaultfd file.
+ *
+ * This function creates a userfaultfd file pointer, without installing
+ * it into the fd table. This is useful when the userfaultfd file is
+ * used during the initialization of data structures that require
+ * extra setup after the userfaultfd creation. So the userfaultfd
+ * creation is split into the file pointer creation phase, and the
+ * file descriptor installation phase. In this way races with
+ * userspace closing the newly installed file descriptor can be
+ * avoided. Returns a userfaultfd file pointer, or a proper error
+ * pointer.
+ */
+static struct file *userfaultfd_file_create(int flags)
+{
+ struct file *file;
+ struct userfaultfd_ctx *ctx;
+
+ BUG_ON(!current->mm);
+
+ /* Check the UFFD_* constants for consistency. */
+ BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
+ BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);
+
+ file = ERR_PTR(-EINVAL);
+ if (flags & ~UFFD_SHARED_FCNTL_FLAGS)
+ goto out;
+
+ file = ERR_PTR(-ENOMEM);
+ ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
+ if (!ctx)
+ goto out;
+
+ atomic_set(&ctx->refcount, 1);
+ ctx->flags = flags;
+ ctx->state = UFFD_STATE_WAIT_API;
+ ctx->released = false;
+ ctx->mm = current->mm;
+ /* prevent the mm struct from being freed */
+ atomic_inc(&ctx->mm->mm_users);
+
+ file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
+ O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
+ if (IS_ERR(file))
+ kmem_cache_free(userfaultfd_ctx_cachep, ctx);
+out:
+ return file;
+}
+
+SYSCALL_DEFINE1(userfaultfd, int, flags)
+{
+ int fd, error;
+ struct file *file;
+
+ error = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS);
+ if (error < 0)
+ return error;
+ fd = error;
+
+ file = userfaultfd_file_create(flags);
+ if (IS_ERR(file)) {
+ error = PTR_ERR(file);
+ goto err_put_unused_fd;
+ }
+ fd_install(fd, file);
+
+ return fd;
+
+err_put_unused_fd:
+ put_unused_fd(fd);
+
+ return error;
+}
+
+static int __init userfaultfd_init(void)
+{
+ userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
+ sizeof(struct userfaultfd_ctx),
+ 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+ init_once_userfaultfd_ctx);
+ return 0;
+}
+__initcall(userfaultfd_init);
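A hedged userspace sketch of the basic flow the new fs/userfaultfd.c code above enables: handshake with UFFDIO_API, register an anonymous region for missing faults, read one uffd_msg and resolve it with UFFDIO_COPY. The program layout, buffer contents and lack of a separate fault-handling thread are illustrative simplifications, and the raw syscall() is used because __NR_userfaultfd is only wired up on some architectures in this series.

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	struct uffdio_api api = { .api = UFFD_API };
	struct uffdio_register reg;
	struct uffdio_copy copy;
	struct uffd_msg msg;
	void *area, *src;

	/* handshake: features must be 0 for UFFD_API in this version */
	if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api))
		return 1;

	area = mmap(NULL, page, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	reg.range.start = (unsigned long)area;
	reg.range.len = page;
	reg.mode = UFFDIO_REGISTER_MODE_MISSING;
	if (ioctl(uffd, UFFDIO_REGISTER, &reg))
		return 1;

	/* a real fault-handling thread would poll() here for POLLIN */
	if (read(uffd, &msg, sizeof(msg)) == sizeof(msg) &&
	    msg.event == UFFD_EVENT_PAGEFAULT) {
		src = mmap(NULL, page, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		memset(src, 0xaa, page);
		copy.dst = msg.arg.pagefault.address & ~(page - 1);
		copy.src = (unsigned long)src;
		copy.len = page;
		copy.mode = 0;
		ioctl(uffd, UFFDIO_COPY, &copy);	/* resolves the fault */
	}
	return 0;
}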
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 331c1ccf8264..c79b717d9b88 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -23,6 +23,7 @@
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/fs.h>
+#include <linux/dax.h>
#include <linux/buffer_head.h>
#include <linux/uio.h>
#include <linux/list_lru.h>
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index de2c2376242b..e78feb400e22 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1546,8 +1546,36 @@ xfs_filemap_fault(
return ret;
}
+STATIC int
+xfs_filemap_pmd_fault(
+ struct vm_area_struct *vma,
+ unsigned long addr,
+ pmd_t *pmd,
+ unsigned int flags)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ struct xfs_inode *ip = XFS_I(inode);
+ int ret;
+
+ if (!IS_DAX(inode))
+ return VM_FAULT_FALLBACK;
+
+ trace_xfs_filemap_pmd_fault(ip);
+
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_direct,
+ xfs_end_io_dax_write);
+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ sb_end_pagefault(inode->i_sb);
+
+ return ret;
+}
+
static const struct vm_operations_struct xfs_file_vm_ops = {
.fault = xfs_filemap_fault,
+ .pmd_fault = xfs_filemap_pmd_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = xfs_filemap_page_mkwrite,
};
@@ -1560,7 +1588,7 @@ xfs_file_mmap(
file_accessed(filp);
vma->vm_ops = &xfs_file_vm_ops;
if (IS_DAX(file_inode(filp)))
- vma->vm_flags |= VM_MIXEDMAP;
+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
return 0;
}
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index f98ce83b7bc4..507b269e6c56 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -511,9 +511,9 @@ xfs_showargs(
seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10);
if (mp->m_logname)
- seq_printf(m, "," MNTOPT_LOGDEV "=%s", mp->m_logname);
+ seq_show_option(m, MNTOPT_LOGDEV, mp->m_logname);
if (mp->m_rtname)
- seq_printf(m, "," MNTOPT_RTDEV "=%s", mp->m_rtname);
+ seq_show_option(m, MNTOPT_RTDEV, mp->m_rtname);
if (mp->m_dalign > 0)
seq_printf(m, "," MNTOPT_SUNIT "=%d",
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 9aeeb21bc3d0..5ed36b1e04c1 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -687,6 +687,7 @@ DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag);
DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
DEFINE_INODE_EVENT(xfs_filemap_fault);
+DEFINE_INODE_EVENT(xfs_filemap_pmd_fault);
DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite);
DECLARE_EVENT_CLASS(xfs_iref_class,
diff --git a/include/asm-generic/early_ioremap.h b/include/asm-generic/early_ioremap.h
index 316bd043319e..734ad4db388c 100644
--- a/include/asm-generic/early_ioremap.h
+++ b/include/asm-generic/early_ioremap.h
@@ -35,6 +35,12 @@ extern void early_ioremap_setup(void);
*/
extern void early_ioremap_reset(void);
+/*
+ * Early copy from unmapped memory to kernel mapped memory.
+ */
+extern void copy_from_early_mem(void *dest, phys_addr_t src,
+ unsigned long size);
+
#else
static inline void early_ioremap_init(void) { }
static inline void early_ioremap_setup(void) { }
diff --git a/include/linux/crc64_ecma.h b/include/linux/crc64_ecma.h
new file mode 100644
index 000000000000..bba7a4d692b3
--- /dev/null
+++ b/include/linux/crc64_ecma.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __CRC64_ECMA_H_
+#define __CRC64_ECMA_H_
+
+#include <linux/types.h>
+
+
+#define CRC64_DEFAULT_INITVAL 0xFFFFFFFFFFFFFFFFULL
+
+
+/*
+ * crc64_ecma_seed - Initializes the CRC64 ECMA seed.
+ */
+u64 crc64_ecma_seed(void);
+
+/*
+ * crc64_ecma - Computes the 64 bit ECMA CRC.
+ *
+ * @pdata: pointer to the data to compute checksum for.
+ * @nbytes: number of bytes in data buffer.
+ * @seed: CRC seed.
+ */
+u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed);
+
+#endif /* __CRC64_ECMA_H_ */
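A short sketch of a kernel caller using the two helpers declared above; example_frame_crc() and its buffer are hypothetical, and the matching lib/ implementation is assumed to be provided elsewhere in this series.

#include <linux/crc64_ecma.h>
#include <linux/types.h>

static u64 example_frame_crc(const u8 *frame, u32 len)
{
	/* start from the default seed and fold the whole buffer in one go */
	u64 seed = crc64_ecma_seed();

	return crc64_ecma(frame, len, seed);
}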
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 8b6c083e68a7..8d70e1361ecd 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -137,6 +137,7 @@ struct cred {
kernel_cap_t cap_permitted; /* caps we're permitted */
kernel_cap_t cap_effective; /* caps we can actually use */
kernel_cap_t cap_bset; /* capability bounding set */
+ kernel_cap_t cap_ambient; /* Ambient capability set */
#ifdef CONFIG_KEYS
unsigned char jit_keyring; /* default keyring to attach requested
* keys to */
@@ -212,6 +213,13 @@ static inline void validate_process_creds(void)
}
#endif
+static inline bool cap_ambient_invariant_ok(const struct cred *cred)
+{
+ return cap_issubset(cred->cap_ambient,
+ cap_intersect(cred->cap_permitted,
+ cred->cap_inheritable));
+}
+
/**
* get_new_cred - Get a reference on a new set of credentials
* @cred: The new credentials to reference
diff --git a/include/linux/dax.h b/include/linux/dax.h
new file mode 100644
index 000000000000..b415e521528d
--- /dev/null
+++ b/include/linux/dax.h
@@ -0,0 +1,39 @@
+#ifndef _LINUX_DAX_H
+#define _LINUX_DAX_H
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <asm/pgtable.h>
+
+ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
+ get_block_t, dio_iodone_t, int flags);
+int dax_clear_blocks(struct inode *, sector_t block, long size);
+int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
+int dax_truncate_page(struct inode *, loff_t from, get_block_t);
+int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
+ dax_iodone_t);
+int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
+ dax_iodone_t);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
+ unsigned int flags, get_block_t, dax_iodone_t);
+int __dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
+ unsigned int flags, get_block_t, dax_iodone_t);
+#else
+static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, unsigned int flags, get_block_t gb,
+ dax_iodone_t di)
+{
+ return VM_FAULT_FALLBACK;
+}
+#define __dax_pmd_fault dax_pmd_fault
+#endif
+int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
+#define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod)
+#define __dax_mkwrite(vma, vmf, gb, iod) __dax_fault(vma, vmf, gb, iod)
+
+static inline bool vma_is_dax(struct vm_area_struct *vma)
+{
+ return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host);
+}
+#endif
diff --git a/include/linux/dmapool.h b/include/linux/dmapool.h
index e1043f79122f..53ba737505df 100644
--- a/include/linux/dmapool.h
+++ b/include/linux/dmapool.h
@@ -24,6 +24,12 @@ void dma_pool_destroy(struct dma_pool *pool);
void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
dma_addr_t *handle);
+static inline void *dma_pool_zalloc(struct dma_pool *pool, gfp_t mem_flags,
+ dma_addr_t *handle)
+{
+ return dma_pool_alloc(pool, mem_flags | __GFP_ZERO, handle);
+}
+
void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t addr);
/*
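A minimal sketch of the new dma_pool_zalloc() helper in use, following the usual dma_pool_alloc() pattern; the my_desc descriptor and my_desc_get() helper are hypothetical.

#include <linux/dmapool.h>

struct my_desc {			/* hypothetical hardware descriptor */
	__le64 addr;
	__le32 len;
	__le32 flags;
};

static struct my_desc *my_desc_get(struct device *dev, struct dma_pool **pool,
				   dma_addr_t *dma)
{
	if (!*pool)
		*pool = dma_pool_create("my_desc", dev, sizeof(struct my_desc),
					__alignof__(struct my_desc), 0);
	if (!*pool)
		return NULL;

	/* zeroed on allocation, so no explicit memset() is needed */
	return dma_pool_zalloc(*pool, GFP_KERNEL, dma);
}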
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 102cf728bee0..72d8a844c692 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -52,7 +52,6 @@ struct swap_info_struct;
struct seq_file;
struct workqueue_struct;
struct iov_iter;
-struct vm_fault;
extern void __init inode_init(void);
extern void __init inode_init_early(void);
@@ -1623,7 +1622,6 @@ struct file_operations {
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *);
- int (*mremap)(struct file *, struct vm_area_struct *);
int (*open) (struct inode *, struct file *);
int (*flush) (struct file *, fl_owner_t id);
int (*release) (struct inode *, struct file *);
@@ -2679,19 +2677,6 @@ extern loff_t fixed_size_llseek(struct file *file, loff_t offset,
extern int generic_file_open(struct inode * inode, struct file * filp);
extern int nonseekable_open(struct inode * inode, struct file * filp);
-ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
- get_block_t, dio_iodone_t, int flags);
-int dax_clear_blocks(struct inode *, sector_t block, long size);
-int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
-int dax_truncate_page(struct inode *, loff_t from, get_block_t);
-int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
- dax_iodone_t);
-int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
- dax_iodone_t);
-int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
-#define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod)
-#define __dax_mkwrite(vma, vmf, gb, iod) __dax_fault(vma, vmf, gb, iod)
-
#ifdef CONFIG_BLOCK
typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,
loff_t file_offset);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 0390ee69c439..533c4408529a 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -195,40 +195,49 @@ struct fsnotify_group {
#define FSNOTIFY_EVENT_INODE 2
/*
- * a mark is simply an object attached to an in core inode which allows an
+ * A mark is simply an object attached to an in core inode which allows an
* fsnotify listener to indicate they are either no longer interested in events
* of a type matching mask or only interested in those events.
*
- * these are flushed when an inode is evicted from core and may be flushed
- * when the inode is modified (as seen by fsnotify_access). Some fsnotify users
- * (such as dnotify) will flush these when the open fd is closed and not at
- * inode eviction or modification.
+ * These are flushed when an inode is evicted from core and may be flushed
+ * when the inode is modified (as seen by fsnotify_access). Some fsnotify
+ * users (such as dnotify) will flush these when the open fd is closed and not
+ * at inode eviction or modification.
+ *
+ * Text in brackets shows the lock(s) protecting modifications of a
+ * particular entry. obj_lock means either inode->i_lock or
+ * mnt->mnt_root->d_lock depending on the mark type.
*/
struct fsnotify_mark {
- __u32 mask; /* mask this mark is for */
- /* we hold ref for each i_list and g_list. also one ref for each 'thing'
+ /* Mask this mark is for [mark->lock, group->mark_mutex] */
+ __u32 mask;
+ /* We hold one for presence in g_list. Also one ref for each 'thing'
* in kernel that found and may be using this mark. */
- atomic_t refcnt; /* active things looking at this mark */
- struct fsnotify_group *group; /* group this mark is for */
- struct list_head g_list; /* list of marks by group->i_fsnotify_marks
- * Also reused for queueing mark into
- * destroy_list when it's waiting for
- * the end of SRCU period before it can
- * be freed */
- spinlock_t lock; /* protect group and inode */
- struct hlist_node obj_list; /* list of marks for inode / vfsmount */
- struct list_head free_list; /* tmp list used when freeing this mark */
- union {
+ atomic_t refcnt;
+ /* Group this mark is for. Set on mark creation, stable until last ref
+ * is dropped */
+ struct fsnotify_group *group;
+ /* List of marks by group->i_fsnotify_marks. Also reused for queueing
+ * mark into destroy_list when it's waiting for the end of SRCU period
+ * before it can be freed. [group->mark_mutex] */
+ struct list_head g_list;
+ /* Protects inode / mnt pointers, flags, masks */
+ spinlock_t lock;
+ /* List of marks for inode / vfsmount [obj_lock] */
+ struct hlist_node obj_list;
+ union { /* Object pointer [mark->lock, group->mark_mutex] */
struct inode *inode; /* inode this mark is associated with */
struct vfsmount *mnt; /* vfsmount this mark is associated with */
};
- __u32 ignored_mask; /* events types to ignore */
+ /* Events types to ignore [mark->lock, group->mark_mutex] */
+ __u32 ignored_mask;
#define FSNOTIFY_MARK_FLAG_INODE 0x01
#define FSNOTIFY_MARK_FLAG_VFSMOUNT 0x02
#define FSNOTIFY_MARK_FLAG_OBJECT_PINNED 0x04
#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x08
#define FSNOTIFY_MARK_FLAG_ALIVE 0x10
- unsigned int flags; /* vfsmount or inode mark? */
+#define FSNOTIFY_MARK_FLAG_ATTACHED 0x20
+ unsigned int flags; /* flags [mark->lock] */
void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */
};
@@ -345,8 +354,10 @@ extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark, struct fsnotify_
/* given a group and a mark, flag mark to be freed when all references are dropped */
extern void fsnotify_destroy_mark(struct fsnotify_mark *mark,
struct fsnotify_group *group);
-extern void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
- struct fsnotify_group *group);
+/* detach mark from inode / mount list, group list, drop inode reference */
+extern void fsnotify_detach_mark(struct fsnotify_mark *mark);
+/* free mark */
+extern void fsnotify_free_mark(struct fsnotify_mark *mark);
/* run all the marks in a group, and clear all of the vfsmount marks */
extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group);
/* run all the marks in a group, and clear all of the inode marks */
diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h
index 5383bb1394a1..7ff168d06967 100644
--- a/include/linux/genalloc.h
+++ b/include/linux/genalloc.h
@@ -59,6 +59,8 @@ struct gen_pool {
genpool_algo_t algo; /* allocation function */
void *data;
+
+ const char *name;
};
/*
@@ -118,8 +120,8 @@ extern unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size,
unsigned long start, unsigned int nr, void *data);
extern struct gen_pool *devm_gen_pool_create(struct device *dev,
- int min_alloc_order, int nid);
-extern struct gen_pool *gen_pool_get(struct device *dev);
+ int min_alloc_order, int nid, const char *name);
+extern struct gen_pool *gen_pool_get(struct device *dev, const char *name);
bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start,
size_t size);
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index ad35f300b9a4..3bd64b115999 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -63,7 +63,10 @@ struct vm_area_struct;
* but it is definitely preferable to use the flag rather than opencode endless
* loop around allocator.
*
- * __GFP_NORETRY: The VM implementation must not retry indefinitely.
+ * __GFP_NORETRY: The VM implementation must not retry indefinitely and will
+ * return NULL when direct reclaim and memory compaction have failed to allow
+ * the allocation to succeed. The OOM killer is not called with the current
+ * implementation.
*
* __GFP_MOVABLE: Flag that this page will be movable by the page migration
* mechanism or reclaimed
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index f10b20f05159..412013b0bf2f 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -19,6 +19,9 @@ extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
unsigned long addr,
pmd_t *pmd,
unsigned int flags);
+extern int madvise_free_huge_pmd(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ pmd_t *pmd, unsigned long addr);
extern int zap_huge_pmd(struct mmu_gather *tlb,
struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr);
@@ -33,6 +36,8 @@ extern int move_huge_pmd(struct vm_area_struct *vma,
extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, pgprot_t newprot,
int prot_numa);
+int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *,
+ unsigned long pfn, bool write);
enum transparent_hugepage_flag {
TRANSPARENT_HUGEPAGE_FLAG,
@@ -56,6 +61,7 @@ extern pmd_t *page_check_address_pmd(struct page *page,
unsigned long address,
enum page_check_address_pmd_flag flag,
spinlock_t **ptl);
+extern int pmd_freeable(pmd_t pmd);
#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
@@ -122,7 +128,7 @@ extern void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
#endif
extern int hugepage_madvise(struct vm_area_struct *vma,
unsigned long *vm_flags, int advice);
-extern void __vma_adjust_trans_huge(struct vm_area_struct *vma,
+extern void vma_adjust_trans_huge(struct vm_area_struct *vma,
unsigned long start,
unsigned long end,
long adjust_next);
@@ -138,15 +144,6 @@ static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
else
return 0;
}
-static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
- unsigned long start,
- unsigned long end,
- long adjust_next)
-{
- if (!vma->anon_vma || vma->vm_ops)
- return;
- __vma_adjust_trans_huge(vma, start, end, adjust_next);
-}
static inline int hpage_nr_pages(struct page *page)
{
if (unlikely(PageTransHuge(page)))
@@ -164,6 +161,13 @@ static inline bool is_huge_zero_page(struct page *page)
return ACCESS_ONCE(huge_zero_page) == page;
}
+static inline bool is_huge_zero_pmd(pmd_t pmd)
+{
+ return is_huge_zero_page(pmd_page(pmd));
+}
+
+struct page *get_huge_zero_page(void);
+
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index d891f949466a..0dfe80d73f07 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -35,6 +35,9 @@ struct resv_map {
struct kref refs;
spinlock_t lock;
struct list_head regions;
+ long adds_in_progress;
+ struct list_head region_cache;
+ long region_cache_count;
};
extern struct resv_map *resv_map_alloc(void);
void resv_map_release(struct kref *ref);
@@ -80,11 +83,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
int hugetlb_reserve_pages(struct inode *inode, long from, long to,
struct vm_area_struct *vma,
vm_flags_t vm_flags);
-void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
+long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
+ long freed);
int dequeue_hwpoisoned_huge_page(struct page *page);
bool isolate_huge_page(struct page *page, struct list_head *list);
void putback_active_hugepage(struct page *page);
void free_huge_page(struct page *page);
+void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve);
+extern struct mutex *hugetlb_fault_mutex_table;
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ struct address_space *mapping,
+ pgoff_t idx, unsigned long address);
#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
@@ -320,9 +330,13 @@ struct huge_bootmem_page {
#endif
};
+struct page *alloc_huge_page(struct vm_area_struct *vma,
+ unsigned long addr, int avoid_reserve);
struct page *alloc_huge_page_node(struct hstate *h, int nid);
struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve);
+int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
+ pgoff_t idx);
/* arch callback */
int __init alloc_bootmem_huge_page(struct hstate *h);
@@ -330,10 +344,6 @@ int __init alloc_bootmem_huge_page(struct hstate *h);
void __init hugetlb_add_hstate(unsigned order);
struct hstate *size_to_hstate(unsigned long size);
-#ifndef HUGE_MAX_HSTATE
-#define HUGE_MAX_HSTATE 1
-#endif
-
extern struct hstate hstates[HUGE_MAX_HSTATE];
extern unsigned int default_hstate_idx;
@@ -469,8 +479,21 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
#define hugepages_supported() (HPAGE_SHIFT != 0)
#endif
+void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm);
+
+static inline void inc_hugetlb_count(struct mm_struct *mm, struct hstate *h)
+{
+ atomic_long_inc(&mm->hugetlb_usage.count[hstate_index(h)]);
+}
+
+static inline void dec_hugetlb_count(struct mm_struct *mm, struct hstate *h)
+{
+ atomic_long_dec(&mm->hugetlb_usage.count[hstate_index(h)]);
+}
+
#else /* CONFIG_HUGETLB_PAGE */
struct hstate {};
+#define alloc_huge_page(v, a, r) NULL
#define alloc_huge_page_node(h, nid) NULL
#define alloc_huge_page_noerr(v, a, r) NULL
#define alloc_bootmem_huge_page(h) NULL
@@ -504,6 +527,14 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
{
return &mm->page_table_lock;
}
+
+static inline void hugetlb_report_usage(struct seq_file *f, struct mm_struct *m)
+{
+}
+
+static inline void dec_hugetlb_count(struct mm_struct *mm, struct hstate *h)
+{
+}
#endif /* CONFIG_HUGETLB_PAGE */
static inline spinlock_t *huge_pte_lock(struct hstate *h,
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 5582410727cb..45a9fdcc0844 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -10,6 +10,7 @@
#include <linux/bitops.h>
#include <linux/log2.h>
#include <linux/typecheck.h>
+#include <linux/parse-integer.h>
#include <linux/printk.h>
#include <linux/dynamic_debug.h>
#include <asm/byteorder.h>
@@ -264,132 +265,10 @@ void do_exit(long error_code)
void complete_and_exit(struct completion *, long)
__noreturn;
-/* Internal, do not use. */
-int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res);
-int __must_check _kstrtol(const char *s, unsigned int base, long *res);
-
-int __must_check kstrtoull(const char *s, unsigned int base, unsigned long long *res);
-int __must_check kstrtoll(const char *s, unsigned int base, long long *res);
-
-/**
- * kstrtoul - convert a string to an unsigned long
- * @s: The start of the string. The string must be null-terminated, and may also
- * include a single newline before its terminating null. The first character
- * may also be a plus sign, but not a minus sign.
- * @base: The number base to use. The maximum supported base is 16. If base is
- * given as 0, then the base of the string is automatically detected with the
- * conventional semantics - If it begins with 0x the number will be parsed as a
- * hexadecimal (case insensitive), if it otherwise begins with 0, it will be
- * parsed as an octal number. Otherwise it will be parsed as a decimal.
- * @res: Where to write the result of the conversion on success.
- *
- * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
- * Used as a replacement for the obsolete simple_strtoull. Return code must
- * be checked.
-*/
-static inline int __must_check kstrtoul(const char *s, unsigned int base, unsigned long *res)
-{
- /*
- * We want to shortcut function call, but
- * __builtin_types_compatible_p(unsigned long, unsigned long long) = 0.
- */
- if (sizeof(unsigned long) == sizeof(unsigned long long) &&
- __alignof__(unsigned long) == __alignof__(unsigned long long))
- return kstrtoull(s, base, (unsigned long long *)res);
- else
- return _kstrtoul(s, base, res);
-}
-
-/**
- * kstrtol - convert a string to a long
- * @s: The start of the string. The string must be null-terminated, and may also
- * include a single newline before its terminating null. The first character
- * may also be a plus sign or a minus sign.
- * @base: The number base to use. The maximum supported base is 16. If base is
- * given as 0, then the base of the string is automatically detected with the
- * conventional semantics - If it begins with 0x the number will be parsed as a
- * hexadecimal (case insensitive), if it otherwise begins with 0, it will be
- * parsed as an octal number. Otherwise it will be parsed as a decimal.
- * @res: Where to write the result of the conversion on success.
- *
- * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
- * Used as a replacement for the obsolete simple_strtoull. Return code must
- * be checked.
+/*
+ * Obsolete, do not use.
+ * Use parse_integer(), kstrto*(), kstrto*_from_user(), sscanf().
*/
-static inline int __must_check kstrtol(const char *s, unsigned int base, long *res)
-{
- /*
- * We want to shortcut function call, but
- * __builtin_types_compatible_p(long, long long) = 0.
- */
- if (sizeof(long) == sizeof(long long) &&
- __alignof__(long) == __alignof__(long long))
- return kstrtoll(s, base, (long long *)res);
- else
- return _kstrtol(s, base, res);
-}
-
-int __must_check kstrtouint(const char *s, unsigned int base, unsigned int *res);
-int __must_check kstrtoint(const char *s, unsigned int base, int *res);
-
-static inline int __must_check kstrtou64(const char *s, unsigned int base, u64 *res)
-{
- return kstrtoull(s, base, res);
-}
-
-static inline int __must_check kstrtos64(const char *s, unsigned int base, s64 *res)
-{
- return kstrtoll(s, base, res);
-}
-
-static inline int __must_check kstrtou32(const char *s, unsigned int base, u32 *res)
-{
- return kstrtouint(s, base, res);
-}
-
-static inline int __must_check kstrtos32(const char *s, unsigned int base, s32 *res)
-{
- return kstrtoint(s, base, res);
-}
-
-int __must_check kstrtou16(const char *s, unsigned int base, u16 *res);
-int __must_check kstrtos16(const char *s, unsigned int base, s16 *res);
-int __must_check kstrtou8(const char *s, unsigned int base, u8 *res);
-int __must_check kstrtos8(const char *s, unsigned int base, s8 *res);
-
-int __must_check kstrtoull_from_user(const char __user *s, size_t count, unsigned int base, unsigned long long *res);
-int __must_check kstrtoll_from_user(const char __user *s, size_t count, unsigned int base, long long *res);
-int __must_check kstrtoul_from_user(const char __user *s, size_t count, unsigned int base, unsigned long *res);
-int __must_check kstrtol_from_user(const char __user *s, size_t count, unsigned int base, long *res);
-int __must_check kstrtouint_from_user(const char __user *s, size_t count, unsigned int base, unsigned int *res);
-int __must_check kstrtoint_from_user(const char __user *s, size_t count, unsigned int base, int *res);
-int __must_check kstrtou16_from_user(const char __user *s, size_t count, unsigned int base, u16 *res);
-int __must_check kstrtos16_from_user(const char __user *s, size_t count, unsigned int base, s16 *res);
-int __must_check kstrtou8_from_user(const char __user *s, size_t count, unsigned int base, u8 *res);
-int __must_check kstrtos8_from_user(const char __user *s, size_t count, unsigned int base, s8 *res);
-
-static inline int __must_check kstrtou64_from_user(const char __user *s, size_t count, unsigned int base, u64 *res)
-{
- return kstrtoull_from_user(s, count, base, res);
-}
-
-static inline int __must_check kstrtos64_from_user(const char __user *s, size_t count, unsigned int base, s64 *res)
-{
- return kstrtoll_from_user(s, count, base, res);
-}
-
-static inline int __must_check kstrtou32_from_user(const char __user *s, size_t count, unsigned int base, u32 *res)
-{
- return kstrtouint_from_user(s, count, base, res);
-}
-
-static inline int __must_check kstrtos32_from_user(const char __user *s, size_t count, unsigned int base, s32 *res)
-{
- return kstrtoint_from_user(s, count, base, res);
-}
-
-/* Obsolete, do not use. Use kstrto<foo> instead */
-
extern unsigned long simple_strtoul(const char *,char **,unsigned int);
extern long simple_strtol(const char *,char **,unsigned int);
extern unsigned long long simple_strtoull(const char *,char **,unsigned int);
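
The kstrto*() declarations move out of kernel.h in favour of the parse_integer() interface pulled in through <linux/parse-integer.h> above. A hedged usage sketch, assuming the semantics documented in that header (return value is the number of characters consumed, or a negative errno):

        unsigned int val;
        int len;

        /* base 0 autodetects: leading "0x" means 16, leading "0" means 8, else 10 */
        len = parse_integer("0x1f", 0, &val);
        if (len < 0)
                return len;             /* -EINVAL or -ERANGE */
        /* here len == 4 and val == 0x1f; trailing characters are left unparsed */
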
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index b63218f68c4b..2da38f093391 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -16,7 +16,7 @@
#include <uapi/linux/kexec.h>
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
#include <linux/list.h>
#include <linux/linkage.h>
#include <linux/compat.h>
@@ -269,6 +269,8 @@ unsigned long paddr_vmcoreinfo_note(void);
vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name)
#define VMCOREINFO_CONFIG(name) \
vmcoreinfo_append_str("CONFIG_%s=y\n", #name)
+#define VMCOREINFO_PHYS_BASE(value) \
+ vmcoreinfo_append_str("PHYS_BASE=%lx\n", (unsigned long)value)
extern struct kimage *kexec_image;
extern struct kimage *kexec_crash_image;
@@ -318,13 +320,24 @@ int crash_shrink_memory(unsigned long new_size);
size_t crash_get_memory_size(void);
void crash_free_reserved_phys_range(unsigned long begin, unsigned long end);
-#else /* !CONFIG_KEXEC */
+int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+ unsigned long buf_len);
+void * __weak arch_kexec_kernel_image_load(struct kimage *image);
+int __weak arch_kimage_file_post_load_cleanup(struct kimage *image);
+int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
+ unsigned long buf_len);
+int __weak arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr,
+ Elf_Shdr *sechdrs, unsigned int relsec);
+int __weak arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+ unsigned int relsec);
+
+#else /* !CONFIG_KEXEC_CORE */
struct pt_regs;
struct task_struct;
static inline void crash_kexec(struct pt_regs *regs) { }
static inline int kexec_should_crash(struct task_struct *p) { return 0; }
#define kexec_in_progress false
-#endif /* CONFIG_KEXEC */
+#endif /* CONFIG_KEXEC_CORE */
#endif /* !defined(__ASSEMBLY__) */
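
VMCOREINFO_PHYS_BASE() exports the kernel's physical load offset into the vmcoreinfo note so crash tools can relocate symbols. A hypothetical sketch of how an architecture might use it from its arch_crash_save_vmcoreinfo() hook (phys_base here stands in for the arch-specific relocation offset):

        void arch_crash_save_vmcoreinfo(void)
        {
                /* record the runtime physical base of the kernel image */
                VMCOREINFO_PHYS_BASE(phys_base);
        }
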
diff --git a/include/linux/kmod.h b/include/linux/kmod.h
index 0555cc66a15b..fcfd2bf14d3f 100644
--- a/include/linux/kmod.h
+++ b/include/linux/kmod.h
@@ -85,8 +85,6 @@ enum umh_disable_depth {
UMH_DISABLED,
};
-extern void usermodehelper_init(void);
-
extern int __usermodehelper_disable(enum umh_disable_depth depth);
extern void __usermodehelper_set_disable_depth(enum umh_disable_depth depth);
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 869b21dcf503..e691b6a23f72 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -11,7 +11,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
const char namefmt[], ...);
#define kthread_create(threadfn, data, namefmt, arg...) \
- kthread_create_on_node(threadfn, data, -1, namefmt, ##arg)
+ kthread_create_on_node(threadfn, data, NUMA_NO_NODE, namefmt, ##arg)
struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index cc4b01972060..c518eb589260 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -77,6 +77,8 @@ int memblock_remove(phys_addr_t base, phys_addr_t size);
int memblock_free(phys_addr_t base, phys_addr_t size);
int memblock_reserve(phys_addr_t base, phys_addr_t size);
void memblock_trim_memory(phys_addr_t align);
+bool memblock_overlaps_region(struct memblock_type *type,
+ phys_addr_t base, phys_addr_t size);
int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size);
int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
@@ -323,7 +325,7 @@ void memblock_enforce_memory_limit(phys_addr_t memory_limit);
int memblock_is_memory(phys_addr_t addr);
int memblock_is_region_memory(phys_addr_t base, phys_addr_t size);
int memblock_is_reserved(phys_addr_t addr);
-int memblock_is_region_reserved(phys_addr_t base, phys_addr_t size);
+bool memblock_is_region_reserved(phys_addr_t base, phys_addr_t size);
extern void __memblock_dump_all(void);
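
memblock_is_region_reserved() is an overlap test rather than an error path, so it now returns bool, and memblock_overlaps_region() becomes visible to callers that hold a memblock_type. A typical check, sketched:

        /* refuse to reuse a range that intersects reserved memory */
        if (memblock_is_region_reserved(base, size))
                return -EBUSY;
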
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 73b02b0a8f60..ad800e62cb7a 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -23,6 +23,11 @@
#include <linux/vm_event_item.h>
#include <linux/hardirq.h>
#include <linux/jump_label.h>
+#include <linux/page_counter.h>
+#include <linux/vmpressure.h>
+#include <linux/eventfd.h>
+#include <linux/mmzone.h>
+#include <linux/writeback.h>
struct mem_cgroup;
struct page;
@@ -67,12 +72,221 @@ enum mem_cgroup_events_index {
MEMCG_NR_EVENTS,
};
+/*
+ * Per memcg event counter is incremented at every pagein/pageout. With THP,
+ * it will be incremented by the number of pages. This counter is used to
+ * trigger some periodic events. This is straightforward and better
+ * than using jiffies etc. to handle periodic memcg events.
+ */
+enum mem_cgroup_events_target {
+ MEM_CGROUP_TARGET_THRESH,
+ MEM_CGROUP_TARGET_SOFTLIMIT,
+ MEM_CGROUP_TARGET_NUMAINFO,
+ MEM_CGROUP_NTARGETS,
+};
+
+/*
+ * Bits in struct cg_proto.flags
+ */
+enum cg_proto_flags {
+ /* Currently active and new sockets should be assigned to cgroups */
+ MEMCG_SOCK_ACTIVE,
+ /* It was ever activated; we must disarm static keys on destruction */
+ MEMCG_SOCK_ACTIVATED,
+};
+
+struct cg_proto {
+ struct page_counter memory_allocated; /* Current allocated memory. */
+ struct percpu_counter sockets_allocated; /* Current number of sockets. */
+ int memory_pressure;
+ long sysctl_mem[3];
+ unsigned long flags;
+ /*
+ * memcg field is used to find which memcg we belong to directly.
+ * Each memcg struct can hold more than one cg_proto, so container_of
+ * won't really cut it.
+ *
+ * The elegant solution would be having an inverse function to
+ * proto_cgroup in struct proto, but that means polluting the structure
+ * for everybody, instead of just for memcg users.
+ */
+ struct mem_cgroup *memcg;
+};
+
#ifdef CONFIG_MEMCG
+struct mem_cgroup_stat_cpu {
+ long count[MEM_CGROUP_STAT_NSTATS];
+ unsigned long events[MEMCG_NR_EVENTS];
+ unsigned long nr_page_events;
+ unsigned long targets[MEM_CGROUP_NTARGETS];
+};
+
+struct mem_cgroup_reclaim_iter {
+ struct mem_cgroup *position;
+ /* scan generation, increased every round-trip */
+ unsigned int generation;
+};
+
+/*
+ * per-zone information in memory controller.
+ */
+struct mem_cgroup_per_zone {
+ struct lruvec lruvec;
+ unsigned long lru_size[NR_LRU_LISTS];
+
+ struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1];
+
+ struct rb_node tree_node; /* RB tree node */
+ unsigned long usage_in_excess;/* Set to the value by which */
+ /* the soft limit is exceeded*/
+ bool on_tree;
+ struct mem_cgroup *memcg; /* Back pointer, we cannot */
+ /* use container_of */
+};
+
+struct mem_cgroup_per_node {
+ struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
+};
+
+struct mem_cgroup_threshold {
+ struct eventfd_ctx *eventfd;
+ unsigned long threshold;
+};
+
+/* For threshold */
+struct mem_cgroup_threshold_ary {
+ /* An array index points to threshold just below or equal to usage. */
+ int current_threshold;
+ /* Size of entries[] */
+ unsigned int size;
+ /* Array of thresholds */
+ struct mem_cgroup_threshold entries[0];
+};
+
+struct mem_cgroup_thresholds {
+ /* Primary thresholds array */
+ struct mem_cgroup_threshold_ary *primary;
+ /*
+ * Spare threshold array.
+ * This is needed to make mem_cgroup_unregister_event() "never fail".
+ * It must be able to store at least primary->size - 1 entries.
+ */
+ struct mem_cgroup_threshold_ary *spare;
+};
+
+/*
+ * The memory controller data structure. The memory controller controls both
+ * page cache and RSS per cgroup. We would eventually like to provide
+ * statistics based on the statistics developed by Rik Van Riel for clock-pro,
+ * to help the administrator determine what knobs to tune.
+ */
+struct mem_cgroup {
+ struct cgroup_subsys_state css;
+
+ /* Accounted resources */
+ struct page_counter memory;
+ struct page_counter memsw;
+ struct page_counter kmem;
+
+ /* Normal memory consumption range */
+ unsigned long low;
+ unsigned long high;
+
+ unsigned long soft_limit;
+
+ /* vmpressure notifications */
+ struct vmpressure vmpressure;
+
+ /* css_online() has been completed */
+ int initialized;
+
+ /*
+ * Should the accounting and control be hierarchical, per subtree?
+ */
+ bool use_hierarchy;
+
+ /* protected by memcg_oom_lock */
+ bool oom_lock;
+ int under_oom;
+
+ int swappiness;
+ /* OOM-Killer disable */
+ int oom_kill_disable;
+
+ /* protect arrays of thresholds */
+ struct mutex thresholds_lock;
+
+ /* thresholds for memory usage. RCU-protected */
+ struct mem_cgroup_thresholds thresholds;
+
+ /* thresholds for mem+swap usage. RCU-protected */
+ struct mem_cgroup_thresholds memsw_thresholds;
+
+ /* For oom notifier event fd */
+ struct list_head oom_notify;
+
+ /*
+ * Should we move charges of a task when a task is moved into this
+ * mem_cgroup ? And what type of charges should we move ?
+ */
+ unsigned long move_charge_at_immigrate;
+ /*
+ * set > 0 if pages under this cgroup are moving to other cgroup.
+ */
+ atomic_t moving_account;
+ /* taken only while moving_account > 0 */
+ spinlock_t move_lock;
+ struct task_struct *move_lock_task;
+ unsigned long move_lock_flags;
+ /*
+ * percpu counter.
+ */
+ struct mem_cgroup_stat_cpu __percpu *stat;
+ spinlock_t pcp_counter_lock;
+
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
+ struct cg_proto tcp_mem;
+#endif
+#if defined(CONFIG_MEMCG_KMEM)
+ /* Index in the kmem_cache->memcg_params.memcg_caches array */
+ int kmemcg_id;
+ bool kmem_acct_activated;
+ bool kmem_acct_active;
+#endif
+
+ int last_scanned_node;
+#if MAX_NUMNODES > 1
+ nodemask_t scan_nodes;
+ atomic_t numainfo_events;
+ atomic_t numainfo_updating;
+#endif
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+ struct list_head cgwb_list;
+ struct wb_domain cgwb_domain;
+#endif
+
+ /* List of events which userspace want to receive */
+ struct list_head event_list;
+ spinlock_t event_list_lock;
+
+ struct mem_cgroup_per_node *nodeinfo[0];
+ /* WARNING: nodeinfo must be the last member here */
+};
extern struct cgroup_subsys_state *mem_cgroup_root_css;
-void mem_cgroup_events(struct mem_cgroup *memcg,
+/**
+ * mem_cgroup_events - count memory events against a cgroup
+ * @memcg: the memory cgroup
+ * @idx: the event index
+ * @nr: the number of events to account for
+ */
+static inline void mem_cgroup_events(struct mem_cgroup *memcg,
enum mem_cgroup_events_index idx,
- unsigned int nr);
+ unsigned int nr)
+{
+ this_cpu_add(memcg->stat->events[idx], nr);
+}
bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
@@ -90,15 +304,29 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
-bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
- struct mem_cgroup *root);
bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
+struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
+struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
-extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
-extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
+static inline
+struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct mem_cgroup, css) : NULL;
+}
-extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
-extern struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css);
+struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
+ struct mem_cgroup *,
+ struct mem_cgroup_reclaim_cookie *);
+void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
+
+static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
+ struct mem_cgroup *root)
+{
+ if (root == memcg)
+ return true;
+ if (!root->use_hierarchy)
+ return false;
+ return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
+}
static inline bool mm_match_cgroup(struct mm_struct *mm,
struct mem_cgroup *memcg)
@@ -114,24 +342,68 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
return match;
}
-extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
-extern struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
+struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
+ino_t page_cgroup_ino(struct page *page);
-struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
- struct mem_cgroup *,
- struct mem_cgroup_reclaim_cookie *);
-void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
+static inline bool mem_cgroup_disabled(void)
+{
+ if (memory_cgrp_subsys.disabled)
+ return true;
+ return false;
+}
/*
* For memory reclaim.
*/
-int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec);
-bool mem_cgroup_lruvec_online(struct lruvec *lruvec);
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
-unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list);
-void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
-extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
- struct task_struct *p);
+
+void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
+ int nr_pages);
+
+static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
+{
+ struct mem_cgroup_per_zone *mz;
+ struct mem_cgroup *memcg;
+
+ if (mem_cgroup_disabled())
+ return true;
+
+ mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+ memcg = mz->memcg;
+
+ return !!(memcg->css.flags & CSS_ONLINE);
+}
+
+static inline
+unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
+{
+ struct mem_cgroup_per_zone *mz;
+
+ mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+ return mz->lru_size[lru];
+}
+
+static inline int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
+{
+ unsigned long inactive_ratio;
+ unsigned long inactive;
+ unsigned long active;
+ unsigned long gb;
+
+ inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
+ active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
+
+ gb = (inactive + active) >> (30 - PAGE_SHIFT);
+ if (gb)
+ inactive_ratio = int_sqrt(10 * gb);
+ else
+ inactive_ratio = 1;
+
+ return inactive * inactive_ratio < active;
+}
+
+void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
+ struct task_struct *p);
static inline void mem_cgroup_oom_enable(void)
{
@@ -156,18 +428,26 @@ bool mem_cgroup_oom_synchronize(bool wait);
extern int do_swap_account;
#endif
-static inline bool mem_cgroup_disabled(void)
-{
- if (memory_cgrp_subsys.disabled)
- return true;
- return false;
-}
-
struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page);
-void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
- enum mem_cgroup_stat_index idx, int val);
void mem_cgroup_end_page_stat(struct mem_cgroup *memcg);
+/**
+ * mem_cgroup_update_page_stat - update page state statistics
+ * @memcg: memcg to account against
+ * @idx: page state item to account
+ * @val: number of pages (positive or negative)
+ *
+ * See mem_cgroup_begin_page_stat() for locking requirements.
+ */
+static inline void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
+ enum mem_cgroup_stat_index idx, int val)
+{
+ VM_BUG_ON(!rcu_read_lock_held());
+
+ if (memcg)
+ this_cpu_add(memcg->stat->count[idx], val);
+}
+
static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg,
enum mem_cgroup_stat_index idx)
{
@@ -184,13 +464,31 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
gfp_t gfp_mask,
unsigned long *total_scanned);
-void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
enum vm_event_item idx)
{
+ struct mem_cgroup *memcg;
+
if (mem_cgroup_disabled())
return;
- __mem_cgroup_count_vm_event(mm, idx);
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+ if (unlikely(!memcg))
+ goto out;
+
+ switch (idx) {
+ case PGFAULT:
+ this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
+ break;
+ case PGMAJFAULT:
+ this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
+ break;
+ default:
+ BUG();
+ }
+out:
+ rcu_read_unlock();
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
void mem_cgroup_split_huge_fixup(struct page *head);
@@ -199,8 +497,6 @@ void mem_cgroup_split_huge_fixup(struct page *head);
#else /* CONFIG_MEMCG */
struct mem_cgroup;
-#define mem_cgroup_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
-
static inline void mem_cgroup_events(struct mem_cgroup *memcg,
enum mem_cgroup_events_index idx,
unsigned int nr)
@@ -258,11 +554,6 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
return &zone->lruvec;
}
-static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
-{
- return NULL;
-}
-
static inline bool mm_match_cgroup(struct mm_struct *mm,
struct mem_cgroup *memcg)
{
@@ -275,12 +566,6 @@ static inline bool task_in_mem_cgroup(struct task_struct *task,
return true;
}
-static inline struct cgroup_subsys_state
- *mem_cgroup_css(struct mem_cgroup *memcg)
-{
- return NULL;
-}
-
static inline struct mem_cgroup *
mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
@@ -428,8 +713,8 @@ static inline void sock_release_memcg(struct sock *sk)
extern struct static_key memcg_kmem_enabled_key;
extern int memcg_nr_cache_ids;
-extern void memcg_get_cache_ids(void);
-extern void memcg_put_cache_ids(void);
+void memcg_get_cache_ids(void);
+void memcg_put_cache_ids(void);
/*
* Helper macro to loop through all memcg-specific caches. Callers must still
@@ -444,7 +729,10 @@ static inline bool memcg_kmem_enabled(void)
return static_key_false(&memcg_kmem_enabled_key);
}
-bool memcg_kmem_is_active(struct mem_cgroup *memcg);
+static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg)
+{
+ return memcg->kmem_acct_active;
+}
/*
* In general, we'll do everything in our power to not incur in any overhead
@@ -463,7 +751,15 @@ void __memcg_kmem_commit_charge(struct page *page,
struct mem_cgroup *memcg, int order);
void __memcg_kmem_uncharge_pages(struct page *page, int order);
-int memcg_cache_id(struct mem_cgroup *memcg);
+/*
+ * Helper for accessing a memcg's index. It will be used as an index in the
+ * child cache array in kmem_cache, and also to derive its name. This function
+ * will return -1 when this is not a kmem-limited memcg.
+ */
+static inline int memcg_cache_id(struct mem_cgroup *memcg)
+{
+ return memcg ? memcg->kmemcg_id : -1;
+}
struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep);
void __memcg_kmem_put_cache(struct kmem_cache *cachep);
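
With struct mem_cgroup now visible in the header, the hot page-stat and event helpers become inlines; they still depend on mem_cgroup_begin_page_stat() to pin the page's memcg and take rcu_read_lock(), which the VM_BUG_ON() above asserts. A sketch of the expected calling pattern (the stat index is only illustrative):

        struct mem_cgroup *memcg;

        memcg = mem_cgroup_begin_page_stat(page);       /* takes rcu_read_lock() */
        mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
        mem_cgroup_end_page_stat(memcg);
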
diff --git a/include/linux/mm.h b/include/linux/mm.h
index dc8b7161b36c..9dd5da678005 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -125,8 +125,10 @@ extern unsigned int kobjsize(const void *objp);
#define VM_MAYSHARE 0x00000080
#define VM_GROWSDOWN 0x00000100 /* general info on the segment */
+#define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */
#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */
#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */
+#define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */
#define VM_LOCKED 0x00002000
#define VM_IO 0x00004000 /* Memory mapped I/O or similar */
@@ -137,6 +139,7 @@ extern unsigned int kobjsize(const void *objp);
#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */
#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */
+#define VM_LOCKONFAULT 0x00080000 /* Lock the pages covered when they are faulted in */
#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */
#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */
#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
@@ -246,7 +249,10 @@ struct vm_fault {
struct vm_operations_struct {
void (*open)(struct vm_area_struct * area);
void (*close)(struct vm_area_struct * area);
+ int (*mremap)(struct vm_area_struct * area);
int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
+ int (*pmd_fault)(struct vm_area_struct *, unsigned long address,
+ pmd_t *, unsigned int flags);
void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf);
/* notification that a previously read-only page is about to become
@@ -305,18 +311,6 @@ struct inode;
#define page_private(page) ((page)->private)
#define set_page_private(page, v) ((page)->private = (v))
-/* It's valid only if the page is free path or free_list */
-static inline void set_freepage_migratetype(struct page *page, int migratetype)
-{
- page->index = migratetype;
-}
-
-/* It's valid only if the page is free path or free_list */
-static inline int get_freepage_migratetype(struct page *page)
-{
- return page->index;
-}
-
/*
* FIXME: take this include out, include page-flags.h in
* files which need it (119 of them)
@@ -357,18 +351,6 @@ static inline int get_page_unless_zero(struct page *page)
return atomic_inc_not_zero(&page->_count);
}
-/*
- * Try to drop a ref unless the page has a refcount of one, return false if
- * that is the case.
- * This is to make sure that the refcount won't become zero after this drop.
- * This can be called when MMU is off so it must not access
- * any of the virtual mappings.
- */
-static inline int put_page_unless_one(struct page *page)
-{
- return atomic_add_unless(&page->_count, -1, 1);
-}
-
extern int page_is_ram(unsigned long pfn);
enum {
@@ -445,46 +427,6 @@ static inline void compound_unlock_irqrestore(struct page *page,
#endif
}
-static inline struct page *compound_head_by_tail(struct page *tail)
-{
- struct page *head = tail->first_page;
-
- /*
- * page->first_page may be a dangling pointer to an old
- * compound page, so recheck that it is still a tail
- * page before returning.
- */
- smp_rmb();
- if (likely(PageTail(tail)))
- return head;
- return tail;
-}
-
-/*
- * Since either compound page could be dismantled asynchronously in THP
- * or we access asynchronously arbitrary positioned struct page, there
- * would be tail flag race. To handle this race, we should call
- * smp_rmb() before checking tail flag. compound_head_by_tail() did it.
- */
-static inline struct page *compound_head(struct page *page)
-{
- if (unlikely(PageTail(page)))
- return compound_head_by_tail(page);
- return page;
-}
-
-/*
- * If we access compound page synchronously such as access to
- * allocated page, there is no need to handle tail flag race, so we can
- * check tail flag directly without any synchronization primitive.
- */
-static inline struct page *compound_head_fast(struct page *page)
-{
- if (unlikely(PageTail(page)))
- return page->first_page;
- return page;
-}
-
/*
* The atomic page->_mapcount, starts from -1: so that transitions
* both from it and to it can be tracked, using atomic_inc_and_test
@@ -1308,6 +1250,11 @@ static inline int vma_growsdown(struct vm_area_struct *vma, unsigned long addr)
return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN);
}
+static inline bool vma_is_anonymous(struct vm_area_struct *vma)
+{
+ return !vma->vm_ops;
+}
+
static inline int stack_guard_page_start(struct vm_area_struct *vma,
unsigned long addr)
{
@@ -1884,7 +1831,7 @@ extern int vma_adjust(struct vm_area_struct *vma, unsigned long start,
extern struct vm_area_struct *vma_merge(struct mm_struct *,
struct vm_area_struct *prev, unsigned long addr, unsigned long end,
unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
- struct mempolicy *);
+ struct mempolicy *, struct vm_userfaultfd_ctx);
extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
extern int split_vma(struct mm_struct *,
struct vm_area_struct *, unsigned long addr, int new_below);
@@ -2122,6 +2069,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma,
#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */
#define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */
#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */
+#define FOLL_MLOCK 0x1000 /* lock present pages */
typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
void *data);
@@ -2234,6 +2182,7 @@ extern int memory_failure(unsigned long pfn, int trapno, int flags);
extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
extern int unpoison_memory(unsigned long pfn);
extern int get_hwpoison_page(struct page *page);
+extern void put_hwpoison_page(struct page *page);
extern int sysctl_memory_failure_early_kill;
extern int sysctl_memory_failure_recovery;
extern void shake_page(struct page *p, int access);
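
vma_merge() grows a struct vm_userfaultfd_ctx parameter so that only VMAs registered with the same userfaultfd context can be merged. Existing callers are expected to forward the candidate VMA's context (or NULL_VM_UFFD_CTX when there is none), roughly:

        struct vm_area_struct *merged;

        merged = vma_merge(mm, prev, addr, end, vm_flags,
                           vma->anon_vma, vma->vm_file, pgoff,
                           vma_policy(vma), vma->vm_userfaultfd_ctx);
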
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 15549578d559..f84bf89c5497 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -235,7 +235,7 @@ struct page_frag_cache {
bool pfmemalloc;
};
-typedef unsigned long __nocast vm_flags_t;
+typedef unsigned long vm_flags_t;
/*
* A region containing a mapping of a non-memory backed file under NOMMU
@@ -256,6 +256,16 @@ struct vm_region {
* this region */
};
+#ifdef CONFIG_USERFAULTFD
+#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) { NULL, })
+struct vm_userfaultfd_ctx {
+ struct userfaultfd_ctx *ctx;
+};
+#else /* CONFIG_USERFAULTFD */
+#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) {})
+struct vm_userfaultfd_ctx {};
+#endif /* CONFIG_USERFAULTFD */
+
/*
* This struct defines a memory VMM memory area. There is one of these
* per VM-area/task. A VM area is any part of the process virtual memory
@@ -322,6 +332,7 @@ struct vm_area_struct {
#ifdef CONFIG_NUMA
struct mempolicy *vm_policy; /* NUMA policy for the VMA */
#endif
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
};
struct core_thread {
@@ -355,6 +366,16 @@ struct mm_rss_stat {
atomic_long_t count[NR_MM_COUNTERS];
};
+#ifdef CONFIG_HUGETLB_PAGE
+
+#ifndef HUGE_MAX_HSTATE
+#define HUGE_MAX_HSTATE 1
+#endif
+struct hugetlb_usage {
+ atomic_long_t count[HUGE_MAX_HSTATE];
+};
+#endif
+
struct kioctx_table;
struct mm_struct {
struct vm_area_struct *mmap; /* list of VMAs */
@@ -475,6 +496,10 @@ struct mm_struct {
/* address of the bounds directory */
void __user *bd_addr;
#endif
+
+#ifdef CONFIG_HUGETLB_PAGE
+ struct hugetlb_usage hugetlb_usage;
+#endif
};
static inline void mm_init_cpumask(struct mm_struct *mm)
@@ -543,6 +568,7 @@ enum tlb_flush_reason {
TLB_REMOTE_SHOOTDOWN,
TLB_LOCAL_SHOOTDOWN,
TLB_LOCAL_MM_SHOOTDOWN,
+ TLB_REMOTE_SEND_IPI,
NR_TLB_FLUSH_REASONS,
};
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 61cd67f4d788..a1a210d59961 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -66,6 +66,16 @@ struct mmu_notifier_ops {
unsigned long end);
/*
+ * clear_young is a lightweight version of clear_flush_young. Like the
+ * latter, it is supposed to test-and-clear the young/accessed bitflag
+ * in the secondary pte, but it may omit flushing the secondary tlb.
+ */
+ int (*clear_young)(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end);
+
+ /*
* test_young is called to check the young/accessed bitflag in
* the secondary pte. This is used to know if the page is
* frequently used without actually clearing the flag or tearing
@@ -203,6 +213,9 @@ extern void __mmu_notifier_release(struct mm_struct *mm);
extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
unsigned long start,
unsigned long end);
+extern int __mmu_notifier_clear_young(struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end);
extern int __mmu_notifier_test_young(struct mm_struct *mm,
unsigned long address);
extern void __mmu_notifier_change_pte(struct mm_struct *mm,
@@ -231,6 +244,15 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
return 0;
}
+static inline int mmu_notifier_clear_young(struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ if (mm_has_notifiers(mm))
+ return __mmu_notifier_clear_young(mm, start, end);
+ return 0;
+}
+
static inline int mmu_notifier_test_young(struct mm_struct *mm,
unsigned long address)
{
@@ -311,6 +333,28 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
__young; \
})
+#define ptep_clear_young_notify(__vma, __address, __ptep) \
+({ \
+ int __young; \
+ struct vm_area_struct *___vma = __vma; \
+ unsigned long ___address = __address; \
+ __young = ptep_test_and_clear_young(___vma, ___address, __ptep);\
+ __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \
+ ___address + PAGE_SIZE); \
+ __young; \
+})
+
+#define pmdp_clear_young_notify(__vma, __address, __pmdp) \
+({ \
+ int __young; \
+ struct vm_area_struct *___vma = __vma; \
+ unsigned long ___address = __address; \
+ __young = pmdp_test_and_clear_young(___vma, ___address, __pmdp);\
+ __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \
+ ___address + PMD_SIZE); \
+ __young; \
+})
+
#define ptep_clear_flush_notify(__vma, __address, __ptep) \
({ \
unsigned long ___addr = __address & PAGE_MASK; \
@@ -427,6 +471,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
#define ptep_clear_flush_young_notify ptep_clear_flush_young
#define pmdp_clear_flush_young_notify pmdp_clear_flush_young
+#define ptep_clear_young_notify ptep_test_and_clear_young
+#define pmdp_clear_young_notify pmdp_test_and_clear_young
#define ptep_clear_flush_notify ptep_clear_flush
#define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush
#define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear
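
The new *_clear_young_notify() helpers clear the accessed bit in both the primary page table and any secondary MMU (via the clear_young callback above) without forcing a TLB flush, which is the trade-off idle-page tracking wants. A minimal rmap-walk style sketch:

        /* note the reference without paying for a TLB shootdown */
        if (pte)
                referenced |= ptep_clear_young_notify(vma, addr, pte);
        else
                referenced |= pmdp_clear_young_notify(vma, addr, pmd);
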
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 754c25966a0a..ac00e2050943 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -690,14 +690,6 @@ struct zonelist {
#endif
};
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-struct node_active_region {
- unsigned long start_pfn;
- unsigned long end_pfn;
- int nid;
-};
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-
#ifndef CONFIG_DISCONTIGMEM
/* The array of struct pages - for discontigmem use pgdat->lmem_map */
extern struct page *mem_map;
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 5791e3229068..78488e099ce7 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -27,9 +27,7 @@ static inline void touch_nmi_watchdog(void)
#if defined(CONFIG_HARDLOCKUP_DETECTOR)
extern void hardlockup_detector_disable(void);
#else
-static inline void hardlockup_detector_disable(void)
-{
-}
+static inline void hardlockup_detector_disable(void) {}
#endif
/*
@@ -86,6 +84,17 @@ extern int proc_watchdog_thresh(struct ctl_table *, int ,
void __user *, size_t *, loff_t *);
extern int proc_watchdog_cpumask(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
+extern int lockup_detector_suspend(void);
+extern void lockup_detector_resume(void);
+#else
+static inline int lockup_detector_suspend(void)
+{
+ return 0;
+}
+
+static inline void lockup_detector_resume(void)
+{
+}
#endif
#ifdef CONFIG_HAVE_ACPI_APEI_NMI
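
lockup_detector_suspend()/lockup_detector_resume() let callers temporarily quiesce the hard/soft lockup watchdogs and restore them afterwards (the Intel perf_event changes elsewhere in this merge appear to be one user); the expected pairing is:

        /* sketch: stop the watchdogs, do the stall-prone work, restart them */
        if (!lockup_detector_suspend()) {
                do_long_stalling_work();        /* hypothetical */
                lockup_detector_resume();
        }
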
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 7deecb7bca5e..03e6257321f0 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -13,6 +13,27 @@ struct mem_cgroup;
struct task_struct;
/*
+ * Details of the page allocation that triggered the oom killer that are used to
+ * determine what should be killed.
+ */
+struct oom_control {
+ /* Used to determine cpuset */
+ struct zonelist *zonelist;
+
+ /* Used to determine mempolicy */
+ nodemask_t *nodemask;
+
+ /* Used to determine cpuset and node locality requirement */
+ const gfp_t gfp_mask;
+
+ /*
+ * order == -1 means the oom kill is required by sysrq; otherwise the order
+ * is used only for display purposes.
+ */
+ const int order;
+};
+
+/*
* Types of limitations to the nodes from which allocations may occur
*/
enum oom_constraint {
@@ -57,21 +78,18 @@ extern unsigned long oom_badness(struct task_struct *p,
extern int oom_kills_count(void);
extern void note_oom_kill(void);
-extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
+extern void oom_kill_process(struct oom_control *oc, struct task_struct *p,
unsigned int points, unsigned long totalpages,
- struct mem_cgroup *memcg, nodemask_t *nodemask,
- const char *message);
+ struct mem_cgroup *memcg, const char *message);
-extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
- int order, const nodemask_t *nodemask,
+extern void check_panic_on_oom(struct oom_control *oc,
+ enum oom_constraint constraint,
struct mem_cgroup *memcg);
-extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
- unsigned long totalpages, const nodemask_t *nodemask,
- bool force_kill);
+extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
+ struct task_struct *task, unsigned long totalpages);
-extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
- int order, nodemask_t *mask, bool force_kill);
+extern bool out_of_memory(struct oom_control *oc);
extern void exit_oom_victim(void);
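
Rather than threading zonelist/nodemask/gfp_mask/order through every OOM function, callers now describe the failing allocation once in a struct oom_control. A sketch of how the page allocator side is expected to build it:

        struct oom_control oc = {
                .zonelist = zonelist,
                .nodemask = nodemask,
                .gfp_mask = gfp_mask,
                .order = order,
        };

        if (!out_of_memory(&oc)) {
                /* nothing could be killed; the caller decides whether to retry or fail */
        }
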
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 41c93844fb1d..4de152cd01b5 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -109,6 +109,10 @@ enum pageflags {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
PG_compound_lock,
#endif
+#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
+ PG_young,
+ PG_idle,
+#endif
__NR_PAGEFLAGS,
/* Filesystems */
@@ -134,49 +138,68 @@ enum pageflags {
#ifndef __GENERATING_BOUNDS_H
+/* Page flags policies wrt compound pages */
+#define PF_ANY(page, enforce) page
+#define PF_HEAD(page, enforce) compound_head(page)
+#define PF_NO_TAIL(page, enforce) ({ \
+ if (enforce) \
+ VM_BUG_ON_PAGE(PageTail(page), page); \
+ else \
+ page = compound_head(page); \
+ page;})
+#define PF_NO_COMPOUND(page, enforce) ({ \
+ if (enforce) \
+ VM_BUG_ON_PAGE(PageCompound(page), page); \
+ page;})
+
/*
* Macros to create function definitions for page flags
*/
-#define TESTPAGEFLAG(uname, lname) \
-static inline int Page##uname(const struct page *page) \
- { return test_bit(PG_##lname, &page->flags); }
+#define TESTPAGEFLAG(uname, lname, policy) \
+static inline int Page##uname(struct page *page) \
+ { return test_bit(PG_##lname, &policy(page, 0)->flags); }
-#define SETPAGEFLAG(uname, lname) \
+#define SETPAGEFLAG(uname, lname, policy) \
static inline void SetPage##uname(struct page *page) \
- { set_bit(PG_##lname, &page->flags); }
+ { set_bit(PG_##lname, &policy(page, 1)->flags); }
-#define CLEARPAGEFLAG(uname, lname) \
+#define CLEARPAGEFLAG(uname, lname, policy) \
static inline void ClearPage##uname(struct page *page) \
- { clear_bit(PG_##lname, &page->flags); }
+ { clear_bit(PG_##lname, &policy(page, 1)->flags); }
-#define __SETPAGEFLAG(uname, lname) \
+#define __SETPAGEFLAG(uname, lname, policy) \
static inline void __SetPage##uname(struct page *page) \
- { __set_bit(PG_##lname, &page->flags); }
+ { __set_bit(PG_##lname, &policy(page, 1)->flags); }
-#define __CLEARPAGEFLAG(uname, lname) \
+#define __CLEARPAGEFLAG(uname, lname, policy) \
static inline void __ClearPage##uname(struct page *page) \
- { __clear_bit(PG_##lname, &page->flags); }
+ { __clear_bit(PG_##lname, &policy(page, 1)->flags); }
-#define TESTSETFLAG(uname, lname) \
+#define TESTSETFLAG(uname, lname, policy) \
static inline int TestSetPage##uname(struct page *page) \
- { return test_and_set_bit(PG_##lname, &page->flags); }
+ { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); }
-#define TESTCLEARFLAG(uname, lname) \
+#define TESTCLEARFLAG(uname, lname, policy) \
static inline int TestClearPage##uname(struct page *page) \
- { return test_and_clear_bit(PG_##lname, &page->flags); }
+ { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
-#define __TESTCLEARFLAG(uname, lname) \
+#define __TESTCLEARFLAG(uname, lname, policy) \
static inline int __TestClearPage##uname(struct page *page) \
- { return __test_and_clear_bit(PG_##lname, &page->flags); }
+ { return __test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
-#define PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \
- SETPAGEFLAG(uname, lname) CLEARPAGEFLAG(uname, lname)
+#define PAGEFLAG(uname, lname, policy) \
+ TESTPAGEFLAG(uname, lname, policy) \
+ SETPAGEFLAG(uname, lname, policy) \
+ CLEARPAGEFLAG(uname, lname, policy)
-#define __PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \
- __SETPAGEFLAG(uname, lname) __CLEARPAGEFLAG(uname, lname)
+#define __PAGEFLAG(uname, lname, policy) \
+ TESTPAGEFLAG(uname, lname, policy) \
+ __SETPAGEFLAG(uname, lname, policy) \
+ __CLEARPAGEFLAG(uname, lname, policy)
-#define TESTSCFLAG(uname, lname) \
- TESTSETFLAG(uname, lname) TESTCLEARFLAG(uname, lname)
+#define TESTSCFLAG(uname, lname, policy) \
+ TESTSETFLAG(uname, lname, policy) \
+ TESTCLEARFLAG(uname, lname, policy)
#define TESTPAGEFLAG_FALSE(uname) \
static inline int Page##uname(const struct page *page) { return 0; }
@@ -205,47 +228,100 @@ static inline int __TestClearPage##uname(struct page *page) { return 0; }
#define TESTSCFLAG_FALSE(uname) \
TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname)
-struct page; /* forward declaration */
-
-TESTPAGEFLAG(Locked, locked)
-PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error)
-PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced)
- __SETPAGEFLAG(Referenced, referenced)
-PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty)
-PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru)
-PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active)
- TESTCLEARFLAG(Active, active)
-__PAGEFLAG(Slab, slab)
-PAGEFLAG(Checked, checked) /* Used by some filesystems */
-PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */
-PAGEFLAG(SavePinned, savepinned); /* Xen */
-PAGEFLAG(Foreign, foreign); /* Xen */
-PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
-PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked)
- __SETPAGEFLAG(SwapBacked, swapbacked)
-
-__PAGEFLAG(SlobFree, slob_free)
+/* Forward declarations */
+struct page;
+static inline int PageCompound(struct page *page);
+static inline int PageTail(struct page *page);
+
+static inline struct page *compound_head_by_tail(struct page *tail)
+{
+ struct page *head = tail->first_page;
+
+ /*
+ * page->first_page may be a dangling pointer to an old
+ * compound page, so recheck that it is still a tail
+ * page before returning.
+ */
+ smp_rmb();
+ if (likely(PageTail(tail)))
+ return head;
+ return tail;
+}
+
+/*
+ * Since either compound page could be dismantled asynchronously in THP
+ * or we access asynchronously arbitrary positioned struct page, there
+ * would be tail flag race. To handle this race, we should call
+ * smp_rmb() before checking tail flag. compound_head_by_tail() did it.
+ */
+static inline struct page *compound_head(struct page *page)
+{
+ if (unlikely(PageTail(page)))
+ return compound_head_by_tail(page);
+ return page;
+}
+
+/*
+ * If we access compound page synchronously such as access to
+ * allocated page, there is no need to handle tail flag race, so we can
+ * check tail flag directly without any synchronization primitive.
+ */
+static inline struct page *compound_head_fast(struct page *page)
+{
+ if (unlikely(PageTail(page)))
+ return page->first_page;
+ return page;
+}
+
+__PAGEFLAG(Locked, locked, PF_NO_TAIL)
+PAGEFLAG(Error, error, PF_NO_COMPOUND) TESTCLEARFLAG(Error, error, PF_NO_COMPOUND)
+PAGEFLAG(Referenced, referenced, PF_HEAD)
+ TESTCLEARFLAG(Referenced, referenced, PF_HEAD)
+ __SETPAGEFLAG(Referenced, referenced, PF_HEAD)
+PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)
+ __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD)
+PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)
+PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
+ TESTCLEARFLAG(Active, active, PF_HEAD)
+__PAGEFLAG(Slab, slab, PF_NO_TAIL)
+__PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL)
+PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */
+
+/* Xen */
+PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND) TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND)
+PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND)
+PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND)
+
+PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
+ __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
+PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
+ __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
+ __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
/*
* Private page markings that may be used by the filesystem that owns the page
* for its own purposes.
* - PG_private and PG_private_2 cause releasepage() and co to be invoked
*/
-PAGEFLAG(Private, private) __SETPAGEFLAG(Private, private)
- __CLEARPAGEFLAG(Private, private)
-PAGEFLAG(Private2, private_2) TESTSCFLAG(Private2, private_2)
-PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1)
+PAGEFLAG(Private, private, PF_ANY) __SETPAGEFLAG(Private, private, PF_ANY)
+ __CLEARPAGEFLAG(Private, private, PF_ANY)
+PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY)
+PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
+ TESTCLEARFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
/*
* Only test-and-set exist for PG_writeback. The unconditional operators are
* risky: they bypass page accounting.
*/
-TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback)
-PAGEFLAG(MappedToDisk, mappedtodisk)
+TESTPAGEFLAG(Writeback, writeback, PF_NO_COMPOUND)
+ TESTSCFLAG(Writeback, writeback, PF_NO_COMPOUND)
+PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_COMPOUND)
/* PG_readahead is only used for reads; PG_reclaim is only for writes */
-PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim)
-PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim)
+PAGEFLAG(Reclaim, reclaim, PF_NO_COMPOUND)
+ TESTCLEARFLAG(Reclaim, reclaim, PF_NO_COMPOUND)
+PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND)
+ TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND)
#ifdef CONFIG_HIGHMEM
/*
@@ -258,37 +334,46 @@ PAGEFLAG_FALSE(HighMem)
#endif
#ifdef CONFIG_SWAP
-PAGEFLAG(SwapCache, swapcache)
+PAGEFLAG(SwapCache, swapcache, PF_NO_COMPOUND)
#else
PAGEFLAG_FALSE(SwapCache)
#endif
-PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable)
- TESTCLEARFLAG(Unevictable, unevictable)
+PAGEFLAG(Unevictable, unevictable, PF_HEAD)
+ __CLEARPAGEFLAG(Unevictable, unevictable, PF_HEAD)
+ TESTCLEARFLAG(Unevictable, unevictable, PF_HEAD)
#ifdef CONFIG_MMU
-PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked)
- TESTSCFLAG(Mlocked, mlocked) __TESTCLEARFLAG(Mlocked, mlocked)
+PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) __CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
+ TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL)
+ __TESTCLEARFLAG(Mlocked, mlocked, PF_NO_TAIL)
#else
PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked)
TESTSCFLAG_FALSE(Mlocked) __TESTCLEARFLAG_FALSE(Mlocked)
#endif
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
-PAGEFLAG(Uncached, uncached)
+PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND)
#else
PAGEFLAG_FALSE(Uncached)
#endif
#ifdef CONFIG_MEMORY_FAILURE
-PAGEFLAG(HWPoison, hwpoison)
-TESTSCFLAG(HWPoison, hwpoison)
+PAGEFLAG(HWPoison, hwpoison, PF_ANY)
+TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
#define __PG_HWPOISON (1UL << PG_hwpoison)
#else
PAGEFLAG_FALSE(HWPoison)
#define __PG_HWPOISON 0
#endif
+#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
+TESTPAGEFLAG(Young, young, PF_ANY)
+SETPAGEFLAG(Young, young, PF_ANY)
+TESTCLEARFLAG(Young, young, PF_ANY)
+PAGEFLAG(Idle, idle, PF_ANY)
+#endif
+
/*
* On an anonymous page mapped into a user virtual memory area,
* page->mapping points to its anon_vma, not to a struct address_space;
@@ -311,6 +396,7 @@ PAGEFLAG_FALSE(HWPoison)
static inline int PageAnon(struct page *page)
{
+ page = compound_head(page);
return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
}
@@ -323,6 +409,7 @@ static inline int PageAnon(struct page *page)
*/
static inline int PageKsm(struct page *page)
{
+ page = compound_head(page);
return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
}
@@ -334,8 +421,9 @@ u64 stable_page_flags(struct page *page);
static inline int PageUptodate(struct page *page)
{
- int ret = test_bit(PG_uptodate, &(page)->flags);
-
+ int ret;
+ page = compound_head(page);
+ ret = test_bit(PG_uptodate, &(page)->flags);
/*
* Must ensure that the data we read out of the page is loaded
* _after_ we've loaded page->flags to check for PageUptodate.
@@ -352,22 +440,24 @@ static inline int PageUptodate(struct page *page)
static inline void __SetPageUptodate(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
smp_wmb();
- __set_bit(PG_uptodate, &(page)->flags);
+ __set_bit(PG_uptodate, &page->flags);
}
static inline void SetPageUptodate(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
/*
* Memory barrier must be issued before setting the PG_uptodate bit,
* so that all previous stores issued in order to bring the page
* uptodate are actually visible before PageUptodate becomes true.
*/
smp_wmb();
- set_bit(PG_uptodate, &(page)->flags);
+ set_bit(PG_uptodate, &page->flags);
}
-CLEARPAGEFLAG(Uptodate, uptodate)
+CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL)
int test_clear_page_writeback(struct page *page);
int __test_set_page_writeback(struct page *page, bool keep_write);
@@ -396,8 +486,8 @@ static inline void set_page_writeback_keepwrite(struct page *page)
* and arch/powerpc/kvm/book3s_64_vio_hv.c which use it to detect huge pages
* and avoid handling those in real mode.
*/
-__PAGEFLAG(Head, head) CLEARPAGEFLAG(Head, head)
-__PAGEFLAG(Tail, tail)
+__PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY)
+__PAGEFLAG(Tail, tail, PF_ANY)
static inline int PageCompound(struct page *page)
{
@@ -421,8 +511,8 @@ static inline void ClearPageCompound(struct page *page)
* because PageCompound is always set for compound pages and not for
* pages on the LRU and/or pagecache.
*/
-TESTPAGEFLAG(Compound, compound)
-__SETPAGEFLAG(Head, compound) __CLEARPAGEFLAG(Head, compound)
+TESTPAGEFLAG(Compound, compound, PF_ANY)
+__SETPAGEFLAG(Head, compound, PF_ANY) __CLEARPAGEFLAG(Head, compound, PF_ANY)
/*
* PG_reclaim is used in combination with PG_compound to mark the
@@ -518,21 +608,9 @@ static inline int PageTransTail(struct page *page)
}
#else
-
-static inline int PageTransHuge(struct page *page)
-{
- return 0;
-}
-
-static inline int PageTransCompound(struct page *page)
-{
- return 0;
-}
-
-static inline int PageTransTail(struct page *page)
-{
- return 0;
-}
+TESTPAGEFLAG_FALSE(TransHuge)
+TESTPAGEFLAG_FALSE(TransCompound)
+TESTPAGEFLAG_FALSE(TransTail)
#endif
/*
@@ -659,6 +737,10 @@ static inline int page_has_private(struct page *page)
return !!(page->flags & PAGE_FLAGS_PRIVATE);
}
+#undef PF_ANY
+#undef PF_HEAD
+#undef PF_NO_TAIL
+#undef PF_NO_COMPOUND
#endif /* !__GENERATING_BOUNDS_H */
#endif /* PAGE_FLAGS_H */
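
The new third argument to the flag macros is a policy describing how each accessor treats compound pages. For instance, PAGEFLAG(Dirty, dirty, PF_HEAD) now generates accessors that are redirected to the head page, roughly:

        static inline int PageDirty(struct page *page)
        {
                /* PF_HEAD: tail pages are transparently redirected to their head */
                return test_bit(PG_dirty, &compound_head(page)->flags);
        }

PF_NO_TAIL instead VM_BUG()s on tail pages for the modifying accessors, and PF_NO_COMPOUND refuses compound pages there entirely.
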
diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 2dc1e1697b45..047d64706f2a 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -65,11 +65,6 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
bool skip_hwpoisoned_pages);
-/*
- * Internal functions. Changes pageblock's migrate type.
- */
-int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages);
-void unset_migratetype_isolate(struct page *page, unsigned migratetype);
struct page *alloc_migrate_target(struct page *page, unsigned long private,
int **resultp);
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index c42981cd99aa..17f118a82854 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -26,6 +26,10 @@ enum page_ext_flags {
PAGE_EXT_DEBUG_POISON, /* Page is poisoned */
PAGE_EXT_DEBUG_GUARD,
PAGE_EXT_OWNER,
+#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
+ PAGE_EXT_YOUNG,
+ PAGE_EXT_IDLE,
+#endif
};
/*
diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h
new file mode 100644
index 000000000000..bf268fa92c5b
--- /dev/null
+++ b/include/linux/page_idle.h
@@ -0,0 +1,110 @@
+#ifndef _LINUX_MM_PAGE_IDLE_H
+#define _LINUX_MM_PAGE_IDLE_H
+
+#include <linux/bitops.h>
+#include <linux/page-flags.h>
+#include <linux/page_ext.h>
+
+#ifdef CONFIG_IDLE_PAGE_TRACKING
+
+#ifdef CONFIG_64BIT
+static inline bool page_is_young(struct page *page)
+{
+ return PageYoung(page);
+}
+
+static inline void set_page_young(struct page *page)
+{
+ SetPageYoung(page);
+}
+
+static inline bool test_and_clear_page_young(struct page *page)
+{
+ return TestClearPageYoung(page);
+}
+
+static inline bool page_is_idle(struct page *page)
+{
+ return PageIdle(page);
+}
+
+static inline void set_page_idle(struct page *page)
+{
+ SetPageIdle(page);
+}
+
+static inline void clear_page_idle(struct page *page)
+{
+ ClearPageIdle(page);
+}
+#else /* !CONFIG_64BIT */
+/*
+ * If there is not enough space to store Idle and Young bits in page flags, use
+ * page ext flags instead.
+ */
+extern struct page_ext_operations page_idle_ops;
+
+static inline bool page_is_young(struct page *page)
+{
+ return test_bit(PAGE_EXT_YOUNG, &lookup_page_ext(page)->flags);
+}
+
+static inline void set_page_young(struct page *page)
+{
+ set_bit(PAGE_EXT_YOUNG, &lookup_page_ext(page)->flags);
+}
+
+static inline bool test_and_clear_page_young(struct page *page)
+{
+ return test_and_clear_bit(PAGE_EXT_YOUNG,
+ &lookup_page_ext(page)->flags);
+}
+
+static inline bool page_is_idle(struct page *page)
+{
+ return test_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags);
+}
+
+static inline void set_page_idle(struct page *page)
+{
+ set_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags);
+}
+
+static inline void clear_page_idle(struct page *page)
+{
+ clear_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags);
+}
+#endif /* CONFIG_64BIT */
+
+#else /* !CONFIG_IDLE_PAGE_TRACKING */
+
+static inline bool page_is_young(struct page *page)
+{
+ return false;
+}
+
+static inline void set_page_young(struct page *page)
+{
+}
+
+static inline bool test_and_clear_page_young(struct page *page)
+{
+ return false;
+}
+
+static inline bool page_is_idle(struct page *page)
+{
+ return false;
+}
+
+static inline void set_page_idle(struct page *page)
+{
+}
+
+static inline void clear_page_idle(struct page *page)
+{
+}
+
+#endif /* CONFIG_IDLE_PAGE_TRACKING */
+
+#endif /* _LINUX_MM_PAGE_IDLE_H */
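
These helpers back the new idle-page-tracking interface: the young bit records references observed while pte accessed bits are harvested (via the clear_young notifiers above), and the idle bit marks pages whose accesses user space wants to observe. A simplified round trip, with the consumer name being hypothetical:

        /* mark the page idle now ... */
        set_page_idle(page);

        /* ... later: an access observed since then sets young or clears idle */
        if (page_is_young(page) || !page_is_idle(page))
                report_page_accessed(page);     /* hypothetical consumer */
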
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index a6c78e00ea96..3e95fb6a77af 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -426,18 +426,9 @@ extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
unsigned int flags);
extern void unlock_page(struct page *page);
-static inline void __set_page_locked(struct page *page)
-{
- __set_bit(PG_locked, &page->flags);
-}
-
-static inline void __clear_page_locked(struct page *page)
-{
- __clear_bit(PG_locked, &page->flags);
-}
-
static inline int trylock_page(struct page *page)
{
+ page = compound_head(page);
return (likely(!test_and_set_bit_lock(PG_locked, &page->flags)));
}
@@ -490,9 +481,9 @@ extern int wait_on_page_bit_killable_timeout(struct page *page,
static inline int wait_on_page_locked_killable(struct page *page)
{
- if (PageLocked(page))
- return wait_on_page_bit_killable(page, PG_locked);
- return 0;
+ if (!PageLocked(page))
+ return 0;
+ return wait_on_page_bit_killable(compound_head(page), PG_locked);
}
extern wait_queue_head_t *page_waitqueue(struct page *page);
@@ -511,7 +502,7 @@ static inline void wake_up_page(struct page *page, int bit)
static inline void wait_on_page_locked(struct page *page)
{
if (PageLocked(page))
- wait_on_page_bit(page, PG_locked);
+ wait_on_page_bit(compound_head(page), PG_locked);
}
/*
@@ -657,17 +648,17 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
/*
* Like add_to_page_cache_locked, but used to add newly allocated pages:
- * the page is new, so we can just run __set_page_locked() against it.
+ * the page is new, so we can just run __SetPageLocked() against it.
*/
static inline int add_to_page_cache(struct page *page,
struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
{
int error;
- __set_page_locked(page);
+ __SetPageLocked(page);
error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
if (unlikely(error))
- __clear_page_locked(page);
+ __ClearPageLocked(page);
return error;
}
diff --git a/include/linux/parse-integer.h b/include/linux/parse-integer.h
new file mode 100644
index 000000000000..ba620cdf3df6
--- /dev/null
+++ b/include/linux/parse-integer.h
@@ -0,0 +1,188 @@
+#ifndef _PARSE_INTEGER_H
+#define _PARSE_INTEGER_H
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+/*
+ * int parse_integer(const char *s, unsigned int base, T *val);
+ *
+ * Convert integer string representation to an integer.
+ * Range of accepted values equals to that of type T.
+ *
+ * Conversion to unsigned integer accepts sign "+".
+ * Conversion to signed integer accepts sign "+" and sign "-".
+ *
+ * Radix 0 means autodetection: leading "0x" implies radix 16,
+ * leading "0" implies radix 8, otherwise radix is 10.
+ * Autodetection hint works after optional sign, but not before.
+ *
+ * Return number of characters parsed or -E.
+ *
+ * "T=char" case is not supported because -f{un,}signed-char can silently
+ * change range of accepted values.
+ */
+#define parse_integer(s, base, val) \
+({ \
+ const char *_s = (s); \
+ unsigned int _base = (base); \
+ typeof(&(val)[0]) _val = (val); \
+ \
+ __builtin_choose_expr( \
+ __builtin_types_compatible_p(typeof(_val), signed char *), \
+ _parse_integer_sc(_s, _base, (void *)_val), \
+ __builtin_choose_expr( \
+ __builtin_types_compatible_p(typeof(_val), unsigned char *), \
+ _parse_integer_uc(_s, _base, (void *)_val), \
+ __builtin_choose_expr( \
+ __builtin_types_compatible_p(typeof(_val), short *), \
+ _parse_integer_s(_s, _base, (void *)_val), \
+ __builtin_choose_expr( \
+ __builtin_types_compatible_p(typeof(_val), unsigned short *), \
+ _parse_integer_us(_s, _base, (void *)_val), \
+ __builtin_choose_expr( \
+ __builtin_types_compatible_p(typeof(_val), int *), \
+ _parse_integer_i(_s, _base, (void *)_val), \
+ __builtin_choose_expr( \
+ __builtin_types_compatible_p(typeof(_val), unsigned int *), \
+ _parse_integer_u(_s, _base, (void *)_val), \
+ __builtin_choose_expr( \
+ __builtin_types_compatible_p(typeof(_val), long *) && sizeof(long) == 4,\
+ _parse_integer_i(_s, _base, (void *)_val), \
+ __builtin_choose_expr( \
+ __builtin_types_compatible_p(typeof(_val), long *) && sizeof(long) == 8,\
+ _parse_integer_ll(_s, _base, (void *)_val), \
+ __builtin_choose_expr( \
+ __builtin_types_compatible_p(typeof(_val), unsigned long *) && sizeof(unsigned long) == 4,\
+ _parse_integer_u(_s, _base, (void *)_val), \
+ __builtin_choose_expr( \
+ __builtin_types_compatible_p(typeof(_val), unsigned long *) && sizeof(unsigned long) == 8,\
+ _parse_integer_ull(_s, _base, (void *)_val), \
+ __builtin_choose_expr( \
+ __builtin_types_compatible_p(typeof(_val), long long *), \
+ _parse_integer_ll(_s, _base, (void *)_val), \
+ __builtin_choose_expr( \
+ __builtin_types_compatible_p(typeof(_val), unsigned long long *),\
+ _parse_integer_ull(_s, _base, (void *)_val), \
+ _parse_integer_link_time_error())))))))))))); \
+})
+/* internal, do not use */
+int _parse_integer_sc(const char *s, unsigned int base, signed char *val);
+int _parse_integer_uc(const char *s, unsigned int base, unsigned char *val);
+int _parse_integer_s(const char *s, unsigned int base, short *val);
+int _parse_integer_us(const char *s, unsigned int base, unsigned short *val);
+int _parse_integer_i(const char *s, unsigned int base, int *val);
+int _parse_integer_u(const char *s, unsigned int base, unsigned int *val);
+int _parse_integer_ll(const char *s, unsigned int base, long long *val);
+int _parse_integer_ull(const char *s, unsigned int base, unsigned long long *val);
+void _parse_integer_link_time_error(void);
+const char *_parse_integer_fixup_radix(const char *s, unsigned int *base);
+#define PARSE_INTEGER_NEWLINE 0x80000000u
+
+/*
+ * Convert integer string representation terminated by \n\0 or \0 to an integer.
+ *
+ * Return 0 on success or -E.
+ *
+ * See parse_integer().
+ */
+static inline int __must_check kstrtoull(const char *s, unsigned int base, unsigned long long *res)
+{
+ return parse_integer(s, base | PARSE_INTEGER_NEWLINE, res);
+}
+
+static inline int __must_check kstrtoll(const char *s, unsigned int base, long long *res)
+{
+ return parse_integer(s, base | PARSE_INTEGER_NEWLINE, res);
+}
+
+static inline int __must_check kstrtoul(const char *s, unsigned int base, unsigned long *res)
+{
+ return parse_integer(s, base | PARSE_INTEGER_NEWLINE, res);
+}
+
+static inline int __must_check kstrtol(const char *s, unsigned int base, long *res)
+{
+ return parse_integer(s, base | PARSE_INTEGER_NEWLINE, res);
+}
+
+static inline int __must_check kstrtouint(const char *s, unsigned int base, unsigned int *res)
+{
+ return parse_integer(s, base | PARSE_INTEGER_NEWLINE, res);
+}
+
+static inline int __must_check kstrtoint(const char *s, unsigned int base, int *res)
+{
+ return parse_integer(s, base | PARSE_INTEGER_NEWLINE, res);
+}
+
+static inline int __must_check kstrtou64(const char *s, unsigned int base, u64 *res)
+{
+ return kstrtoull(s, base, res);
+}
+
+static inline int __must_check kstrtos64(const char *s, unsigned int base, s64 *res)
+{
+ return kstrtoll(s, base, res);
+}
+
+static inline int __must_check kstrtou32(const char *s, unsigned int base, u32 *res)
+{
+ return kstrtouint(s, base, res);
+}
+
+static inline int __must_check kstrtos32(const char *s, unsigned int base, s32 *res)
+{
+ return kstrtoint(s, base, res);
+}
+
+static inline int __must_check kstrtou16(const char *s, unsigned int base, u16 *res)
+{
+ return parse_integer(s, base | PARSE_INTEGER_NEWLINE, res);
+}
+
+static inline int __must_check kstrtos16(const char *s, unsigned int base, s16 *res)
+{
+ return parse_integer(s, base | PARSE_INTEGER_NEWLINE, res);
+}
+
+static inline int __must_check kstrtou8(const char *s, unsigned int base, u8 *res)
+{
+ return parse_integer(s, base | PARSE_INTEGER_NEWLINE, res);
+}
+
+static inline int __must_check kstrtos8(const char *s, unsigned int base, s8 *res)
+{
+ return parse_integer(s, base | PARSE_INTEGER_NEWLINE, res);
+}
+
+int __must_check kstrtoull_from_user(const char __user *s, size_t count, unsigned int base, unsigned long long *res);
+int __must_check kstrtoll_from_user(const char __user *s, size_t count, unsigned int base, long long *res);
+int __must_check kstrtoul_from_user(const char __user *s, size_t count, unsigned int base, unsigned long *res);
+int __must_check kstrtol_from_user(const char __user *s, size_t count, unsigned int base, long *res);
+int __must_check kstrtouint_from_user(const char __user *s, size_t count, unsigned int base, unsigned int *res);
+int __must_check kstrtoint_from_user(const char __user *s, size_t count, unsigned int base, int *res);
+int __must_check kstrtou16_from_user(const char __user *s, size_t count, unsigned int base, u16 *res);
+int __must_check kstrtos16_from_user(const char __user *s, size_t count, unsigned int base, s16 *res);
+int __must_check kstrtou8_from_user(const char __user *s, size_t count, unsigned int base, u8 *res);
+int __must_check kstrtos8_from_user(const char __user *s, size_t count, unsigned int base, s8 *res);
+
+static inline int __must_check kstrtou64_from_user(const char __user *s, size_t count, unsigned int base, u64 *res)
+{
+ return kstrtoull_from_user(s, count, base, res);
+}
+
+static inline int __must_check kstrtos64_from_user(const char __user *s, size_t count, unsigned int base, s64 *res)
+{
+ return kstrtoll_from_user(s, count, base, res);
+}
+
+static inline int __must_check kstrtou32_from_user(const char __user *s, size_t count, unsigned int base, u32 *res)
+{
+ return kstrtouint_from_user(s, count, base, res);
+}
+
+static inline int __must_check kstrtos32_from_user(const char __user *s, size_t count, unsigned int base, s32 *res)
+{
+ return kstrtoint_from_user(s, count, base, res);
+}
+#endif
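A minimal usage sketch of the interface documented above; the wrapper function and buffer contents are illustrative only:

	/* Hypothetical caller exercising both entry points. */
	static int example_parse(const char *cmdline_arg, const char *sysfs_buf)
	{
		unsigned int val;
		unsigned long bytes;
		int len, err;

		/* parse_integer() returns the number of characters consumed. */
		len = parse_integer(cmdline_arg, 10, &val);	/* "123abc" -> 3, val == 123 */
		if (len < 0)
			return len;				/* -E on malformed input */

		/* kstrto*() require the whole string (optionally '\n'-terminated). */
		err = kstrtoul(sysfs_buf, 0, &bytes);		/* base 0 autodetects 0x/0 prefix */
		return err;
	}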
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 7e68e8e4456b..88bee285b93d 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1222,6 +1222,8 @@ int pci_set_vga_state(struct pci_dev *pdev, bool decode,
dma_pool_create(name, &pdev->dev, size, align, allocation)
#define pci_pool_destroy(pool) dma_pool_destroy(pool)
#define pci_pool_alloc(pool, flags, handle) dma_pool_alloc(pool, flags, handle)
+#define pci_pool_zalloc(pool, flags, handle) \
+ dma_pool_zalloc(pool, flags, handle)
#define pci_pool_free(pool, vaddr, addr) dma_pool_free(pool, vaddr, addr)
struct msix_entry {
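An illustrative call sequence for the new zeroing wrapper; the pool name, sizes and device come from the (hypothetical) caller, and the wrapper simply maps to the dma_pool_zalloc() helper named above:

	static int example_setup(struct pci_dev *pdev)
	{
		struct pci_pool *pool;
		dma_addr_t dma;
		void *vaddr;

		pool = pci_pool_create("example", pdev, 64, 64, 0);
		if (!pool)
			return -ENOMEM;

		vaddr = pci_pool_zalloc(pool, GFP_KERNEL, &dma);	/* zeroed on return */
		if (!vaddr) {
			pci_pool_destroy(pool);
			return -ENOMEM;
		}

		/* ... hand @dma to the device, use @vaddr from the CPU ... */

		pci_pool_free(pool, vaddr, dma);
		pci_pool_destroy(pool);
		return 0;
	}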
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 2110a81c5e2a..4a27153574e2 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -19,19 +19,23 @@
* under normal circumstances, used to verify that nobody uses
* non-initialized list entries.
*/
-#define LIST_POISON1 ((void *) 0x00100100 + POISON_POINTER_DELTA)
-#define LIST_POISON2 ((void *) 0x00200200 + POISON_POINTER_DELTA)
+#define LIST_POISON1 ((void *) 0x100 + POISON_POINTER_DELTA)
+#define LIST_POISON2 ((void *) 0x200 + POISON_POINTER_DELTA)
/********** include/linux/timer.h **********/
/*
* Magic number "tsta" to indicate a static timer initializer
* for the object debugging code.
*/
-#define TIMER_ENTRY_STATIC ((void *) 0x74737461)
+#define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA)
/********** mm/debug-pagealloc.c **********/
#define PAGE_POISON 0xaa
+/********** mm/page_alloc.c ************/
+
+#define TAIL_MAPPING ((void *) 0x400 + POISON_POINTER_DELTA)
+
/********** mm/slab.c **********/
/*
* Magic nums for obj red zoning.
@@ -69,10 +73,6 @@
#define ATM_POISON_FREE 0x12
#define ATM_POISON 0xdeadbeef
-/********** net/ **********/
-#define NEIGHBOR_DEAD 0xdeadbeef
-#define NETFILTER_LINK_POISON 0xdead57ac
-
/********** kernel/mutexes **********/
#define MUTEX_DEBUG_INIT 0x11
#define MUTEX_DEBUG_FREE 0x22
@@ -83,7 +83,4 @@
/********** security/ **********/
#define KEY_DESTROY 0xbd
-/********** sound/oss/ **********/
-#define OSS_POISON_FREE 0xAB
-
#endif
diff --git a/include/linux/printk.h b/include/linux/printk.h
index a6298b27ac99..9729565c25ff 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -404,10 +404,10 @@ do { \
static DEFINE_RATELIMIT_STATE(_rs, \
DEFAULT_RATELIMIT_INTERVAL, \
DEFAULT_RATELIMIT_BURST); \
- DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
+ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, pr_fmt(fmt)); \
if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT) && \
__ratelimit(&_rs)) \
- __dynamic_pr_debug(&descriptor, fmt, ##__VA_ARGS__); \
+ __dynamic_pr_debug(&descriptor, pr_fmt(fmt), ##__VA_ARGS__); \
} while (0)
#elif defined(DEBUG)
#define pr_debug_ratelimited(fmt, ...) \
@@ -456,11 +456,17 @@ static inline void print_hex_dump_bytes(const char *prefix_str, int prefix_type,
groupsize, buf, len, ascii) \
dynamic_hex_dump(prefix_str, prefix_type, rowsize, \
groupsize, buf, len, ascii)
-#else
+#elif defined(DEBUG)
#define print_hex_dump_debug(prefix_str, prefix_type, rowsize, \
groupsize, buf, len, ascii) \
print_hex_dump(KERN_DEBUG, prefix_str, prefix_type, rowsize, \
groupsize, buf, len, ascii)
-#endif /* defined(CONFIG_DYNAMIC_DEBUG) */
+#else
+static inline void print_hex_dump_debug(const char *prefix_str, int prefix_type,
+ int rowsize, int groupsize,
+ const void *buf, size_t len, bool ascii)
+{
+}
+#endif
#endif
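A representative call site covered by the new stub; the buffer and length are placeholders supplied by a hypothetical caller:

	static void example_dump(const void *buf, size_t len)
	{
		/* Compiles to nothing unless DEBUG or dynamic debug is in effect. */
		print_hex_dump_debug("example: ", DUMP_PREFIX_OFFSET, 16, 1,
				     buf, len, true);
	}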
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index c89c53a113a8..0860336c6c40 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -85,10 +85,14 @@ enum ttu_flags {
TTU_UNMAP = 1, /* unmap mode */
TTU_MIGRATION = 2, /* migration mode */
TTU_MUNLOCK = 4, /* munlock mode */
+ TTU_FREE = 8, /* free mode */
TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */
TTU_IGNORE_ACCESS = (1 << 9), /* don't age */
TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */
+ TTU_BATCH_FLUSH = (1 << 11), /* Batch TLB flushes where possible
+ * and caller guarantees they will
+ * do a final flush if necessary */
};
#ifdef CONFIG_MMU
@@ -183,7 +187,8 @@ static inline void page_dup_rmap(struct page *page)
* Called from mm/vmscan.c to handle paging out
*/
int page_referenced(struct page *, int is_locked,
- struct mem_cgroup *memcg, unsigned long *vm_flags);
+ struct mem_cgroup *memcg, unsigned long *vm_flags,
+ int *is_pte_dirty);
#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
@@ -260,9 +265,12 @@ int rmap_walk(struct page *page, struct rmap_walk_control *rwc);
static inline int page_referenced(struct page *page, int is_locked,
struct mem_cgroup *memcg,
- unsigned long *vm_flags)
+ unsigned long *vm_flags,
+ int *is_pte_dirty)
{
*vm_flags = 0;
+ if (is_pte_dirty)
+ *is_pte_dirty = 0;
return 0;
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 119823decc46..a4ab9daa387c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1344,6 +1344,25 @@ enum perf_event_task_context {
perf_nr_task_contexts,
};
+/* Track pages that require TLB flushes */
+struct tlbflush_unmap_batch {
+ /*
+ * Each bit set is a CPU that potentially has a TLB entry for one of
+ * the PFNs being flushed. See set_tlb_ubc_flush_pending().
+ */
+ struct cpumask cpumask;
+
+ /* True if any bit in cpumask is set */
+ bool flush_required;
+
+ /*
+ * If true then the PTE was dirty when unmapped. The entry must be
+ * flushed before IO is initiated or a stale TLB entry potentially
+ * allows an update without redirtying the page.
+ */
+ bool writable;
+};
+
struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
void *stack;
@@ -1700,6 +1719,10 @@ struct task_struct {
unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+ struct tlbflush_unmap_batch tlb_ubc;
+#endif
+
struct rcu_head rcu;
/*
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 912a7c482649..adeadbd6d7bf 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -122,6 +122,10 @@ int seq_write(struct seq_file *seq, const void *data, size_t len);
__printf(2, 3) int seq_printf(struct seq_file *, const char *, ...);
__printf(2, 0) int seq_vprintf(struct seq_file *, const char *, va_list args);
+void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
+ int rowsize, int groupsize, const void *buf, size_t len,
+ bool ascii);
+
int seq_path(struct seq_file *, const struct path *, const char *);
int seq_file_path(struct seq_file *, struct file *, const char *);
int seq_dentry(struct seq_file *, struct dentry *, const char *);
@@ -149,6 +153,41 @@ static inline struct user_namespace *seq_user_ns(struct seq_file *seq)
#endif
}
+/**
+ * seq_show_options - display mount options with appropriate escapes.
+ * @m: the seq_file handle
+ * @name: the mount option name
+ * @value: the mount option name's value, can be NULL
+ */
+static inline void seq_show_option(struct seq_file *m, const char *name,
+ const char *value)
+{
+ seq_putc(m, ',');
+ seq_escape(m, name, ",= \t\n\\");
+ if (value) {
+ seq_putc(m, '=');
+ seq_escape(m, value, ", \t\n\\");
+ }
+}
+
+/**
+ * seq_show_option_n - display mount options with appropriate escapes
+ * where @value must be a specific length.
+ * @m: the seq_file handle
+ * @name: the mount option name
+ * @value: the mount option name's value, cannot be NULL
+ * @length: the length of @value to display
+ *
+ * This is a macro because it uses "length" to define the size of the
+ * stack buffer.
+ */
+#define seq_show_option_n(m, name, value, length) { \
+ char val_buf[length + 1]; \
+ strncpy(val_buf, value, length); \
+ val_buf[length] = '\0'; \
+ seq_show_option(m, name, val_buf); \
+}
+
#define SEQ_START_TOKEN ((void *)1)
/*
* Helpers for iteration over list_head-s in seq_files
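As an illustration of the intended call pattern for seq_show_option(), a sketch of a ->show_options hook; the superblock info structure and field are hypothetical:

	static int example_show_options(struct seq_file *m, struct dentry *root)
	{
		struct example_sb_info *sbi = root->d_sb->s_fs_info;

		/* Emits ",owner=<value>" with ',', '=', whitespace and '\' escaped. */
		seq_show_option(m, "owner", sbi->owner_name);
		return 0;
	}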
diff --git a/include/linux/slab.h b/include/linux/slab.h
index a99f0e5243e1..7e37d448ed91 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -290,6 +290,16 @@ void *__kmalloc(size_t size, gfp_t flags);
void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags);
void kmem_cache_free(struct kmem_cache *, void *);
+/*
+ * Bulk allocation and freeing operations. These are accelerated in an
+ * allocator specific way to avoid taking locks repeatedly or building
+ * metadata structures unnecessarily.
+ *
+ * Note that interrupts must be enabled when calling these functions.
+ */
+void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
+bool kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
+
#ifdef CONFIG_NUMA
void *__kmalloc_node(size_t size, gfp_t flags, int node);
void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
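A minimal sketch of the bulk interface; the cache and batch size are illustrative:

	static int example_bulk_use(struct kmem_cache *cache)
	{
		void *objs[16];

		/* Interrupts must be enabled around both calls, per the comment above. */
		if (!kmem_cache_alloc_bulk(cache, GFP_KERNEL, ARRAY_SIZE(objs), objs))
			return -ENOMEM;

		/* ... use the objects ... */

		kmem_cache_free_bulk(cache, ARRAY_SIZE(objs), objs);
		return 0;
	}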
diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h
index da3c593f9845..e6109a6cd8f6 100644
--- a/include/linux/smpboot.h
+++ b/include/linux/smpboot.h
@@ -48,7 +48,16 @@ struct smp_hotplug_thread {
const char *thread_comm;
};
-int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread);
+int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
+ const struct cpumask *cpumask);
+
+static inline int
+smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
+{
+ return smpboot_register_percpu_thread_cpumask(plug_thread,
+ cpu_possible_mask);
+}
+
void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread);
int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
const struct cpumask *);
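A sketch of the two registration paths; the hotplug-thread descriptor and mask are assumed to be set up by the caller:

	static int example_register(struct smp_hotplug_thread *ht,
				    const struct cpumask *mask)
	{
		/* Old behaviour: threads on every possible CPU. */
		if (!mask)
			return smpboot_register_percpu_thread(ht);

		/* New behaviour: restrict the per-CPU threads to @mask. */
		return smpboot_register_percpu_thread_cpumask(ht, mask);
	}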
diff --git a/include/linux/string.h b/include/linux/string.h
index a8d90db9c4b0..d5dfe3e75572 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -118,6 +118,7 @@ extern void kfree_const(const void *x);
extern char *kstrdup(const char *s, gfp_t gfp);
extern const char *kstrdup_const(const char *s, gfp_t gfp);
extern char *kstrndup(const char *s, size_t len, gfp_t gfp);
+extern char *kstrimdup(const char *s, gfp_t gfp);
extern void *kmemdup(const void *src, size_t len, gfp_t gfp);
extern char **argv_split(gfp_t gfp, const char *str, int *argcp);
diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h
index 71f711db4500..dabe643eb5fa 100644
--- a/include/linux/string_helpers.h
+++ b/include/linux/string_helpers.h
@@ -48,24 +48,24 @@ static inline int string_unescape_any_inplace(char *buf)
#define ESCAPE_HEX 0x20
int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
- unsigned int flags, const char *esc);
+ unsigned int flags, const char *only);
static inline int string_escape_mem_any_np(const char *src, size_t isz,
- char *dst, size_t osz, const char *esc)
+ char *dst, size_t osz, const char *only)
{
- return string_escape_mem(src, isz, dst, osz, ESCAPE_ANY_NP, esc);
+ return string_escape_mem(src, isz, dst, osz, ESCAPE_ANY_NP, only);
}
static inline int string_escape_str(const char *src, char *dst, size_t sz,
- unsigned int flags, const char *esc)
+ unsigned int flags, const char *only)
{
- return string_escape_mem(src, strlen(src), dst, sz, flags, esc);
+ return string_escape_mem(src, strlen(src), dst, sz, flags, only);
}
static inline int string_escape_str_any_np(const char *src, char *dst,
- size_t sz, const char *esc)
+ size_t sz, const char *only)
{
- return string_escape_str(src, dst, sz, ESCAPE_ANY_NP, esc);
+ return string_escape_str(src, dst, sz, ESCAPE_ANY_NP, only);
}
#endif
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 31496d201fdc..b0f9d6a7a799 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -154,7 +154,7 @@ enum {
SWP_SCANNING = (1 << 10), /* refcount in scan_swap_map */
};
-#define SWAP_CLUSTER_MAX 32UL
+#define SWAP_CLUSTER_MAX 256UL
#define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
/*
@@ -308,6 +308,7 @@ extern void lru_add_drain_cpu(int cpu);
extern void lru_add_drain_all(void);
extern void rotate_reclaimable_page(struct page *page);
extern void deactivate_file_page(struct page *page);
+extern void deactivate_page(struct page *page);
extern void swap_setup(void);
extern void add_page_to_unevictable_list(struct page *page);
@@ -351,7 +352,15 @@ extern void check_move_unevictable_pages(struct page **, int nr_pages);
extern int kswapd_run(int nid);
extern void kswapd_stop(int nid);
#ifdef CONFIG_MEMCG
-extern int mem_cgroup_swappiness(struct mem_cgroup *mem);
+static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
+{
+ /* root ? */
+ if (mem_cgroup_disabled() || !memcg->css.parent)
+ return vm_swappiness;
+
+ return memcg->swappiness;
+}
+
#else
static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
{
@@ -398,6 +407,9 @@ extern void free_pages_and_swap_cache(struct page **, int);
extern struct page *lookup_swap_cache(swp_entry_t);
extern struct page *read_swap_cache_async(swp_entry_t, gfp_t,
struct vm_area_struct *vma, unsigned long addr);
+extern struct page *__read_swap_cache_async(swp_entry_t, gfp_t,
+ struct vm_area_struct *vma, unsigned long addr,
+ bool *new_page_allocated);
extern struct page *swapin_readahead(swp_entry_t, gfp_t,
struct vm_area_struct *vma, unsigned long addr);
@@ -431,6 +443,7 @@ extern unsigned int count_swap_pages(int, int);
extern sector_t map_swap_page(struct page *, struct block_device **);
extern sector_t swapdev_block(int, pgoff_t);
extern int page_swapcount(struct page *);
+extern int swp_swapcount(swp_entry_t entry);
extern struct swap_info_struct *page_swap_info(struct page *);
extern int reuse_swap_page(struct page *);
extern int try_to_free_swap(struct page *);
@@ -522,6 +535,11 @@ static inline int page_swapcount(struct page *page)
return 0;
}
+static inline int swp_swapcount(swp_entry_t entry)
+{
+ return 0;
+}
+
#define reuse_swap_page(page) (page_mapcount(page) == 1)
static inline int try_to_free_swap(struct page *page)
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index cedf3d3c373f..5c3a5f3e7eec 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -164,6 +164,9 @@ static inline int is_write_migration_entry(swp_entry_t entry)
#endif
#ifdef CONFIG_MEMORY_FAILURE
+
+extern atomic_long_t num_poisoned_pages __read_mostly;
+
/*
* Support for hardware poisoned pages
*/
@@ -177,6 +180,31 @@ static inline int is_hwpoison_entry(swp_entry_t entry)
{
return swp_type(entry) == SWP_HWPOISON;
}
+
+static inline bool test_set_page_hwpoison(struct page *page)
+{
+ return TestSetPageHWPoison(page);
+}
+
+static inline void num_poisoned_pages_inc(void)
+{
+ atomic_long_inc(&num_poisoned_pages);
+}
+
+static inline void num_poisoned_pages_dec(void)
+{
+ atomic_long_dec(&num_poisoned_pages);
+}
+
+static inline void num_poisoned_pages_add(long num)
+{
+ atomic_long_add(num, &num_poisoned_pages);
+}
+
+static inline void num_poisoned_pages_sub(long num)
+{
+ atomic_long_sub(num, &num_poisoned_pages);
+}
#else
static inline swp_entry_t make_hwpoison_entry(struct page *page)
@@ -188,6 +216,15 @@ static inline int is_hwpoison_entry(swp_entry_t swp)
{
return 0;
}
+
+static inline bool test_set_page_hwpoison(struct page *page)
+{
+ return false;
+}
+
+static inline void num_poisoned_pages_inc(void)
+{
+}
#endif
#if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION)
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index b45c45b8c829..890632cbf353 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -810,6 +810,7 @@ asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr);
asmlinkage long sys_eventfd(unsigned int count);
asmlinkage long sys_eventfd2(unsigned int count, int flags);
asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags);
+asmlinkage long sys_userfaultfd(int flags);
asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int);
asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *,
@@ -884,4 +885,6 @@ asmlinkage long sys_execveat(int dfd, const char __user *filename,
const char __user *const __user *argv,
const char __user *const __user *envp, int flags);
+asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags);
+
#endif
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
new file mode 100644
index 000000000000..587480ad41b7
--- /dev/null
+++ b/include/linux/userfaultfd_k.h
@@ -0,0 +1,85 @@
+/*
+ * include/linux/userfaultfd_k.h
+ *
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ */
+
+#ifndef _LINUX_USERFAULTFD_K_H
+#define _LINUX_USERFAULTFD_K_H
+
+#ifdef CONFIG_USERFAULTFD
+
+#include <linux/userfaultfd.h> /* linux/include/uapi/linux/userfaultfd.h */
+
+#include <linux/fcntl.h>
+
+/*
+ * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining
+ * new flags, since they might collide with O_* ones. We want
+ * to reuse O_* flags that couldn't possibly have a meaning
+ * for userfaultfd, in order to leave a free define-space for
+ * shared O_* flags.
+ */
+#define UFFD_CLOEXEC O_CLOEXEC
+#define UFFD_NONBLOCK O_NONBLOCK
+
+#define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
+#define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS)
+
+extern int handle_userfault(struct vm_area_struct *vma, unsigned long address,
+ unsigned int flags, unsigned long reason);
+
+extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
+ unsigned long src_start, unsigned long len);
+extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
+ unsigned long dst_start,
+ unsigned long len);
+
+/* mm helpers */
+static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
+ struct vm_userfaultfd_ctx vm_ctx)
+{
+ return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx;
+}
+
+static inline bool userfaultfd_missing(struct vm_area_struct *vma)
+{
+ return vma->vm_flags & VM_UFFD_MISSING;
+}
+
+static inline bool userfaultfd_armed(struct vm_area_struct *vma)
+{
+ return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP);
+}
+
+#else /* CONFIG_USERFAULTFD */
+
+/* mm helpers */
+static inline int handle_userfault(struct vm_area_struct *vma,
+ unsigned long address,
+ unsigned int flags,
+ unsigned long reason)
+{
+ return VM_FAULT_SIGBUS;
+}
+
+static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
+ struct vm_userfaultfd_ctx vm_ctx)
+{
+ return true;
+}
+
+static inline bool userfaultfd_missing(struct vm_area_struct *vma)
+{
+ return false;
+}
+
+static inline bool userfaultfd_armed(struct vm_area_struct *vma)
+{
+ return false;
+}
+
+#endif /* CONFIG_USERFAULTFD */
+
+#endif /* _LINUX_USERFAULTFD_K_H */
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 9246d32dc973..2b1cef88b827 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -25,6 +25,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
FOR_ALL_ZONES(PGALLOC),
PGFREE, PGACTIVATE, PGDEACTIVATE,
PGFAULT, PGMAJFAULT,
+ PGLAZYFREED,
FOR_ALL_ZONES(PGREFILL),
FOR_ALL_ZONES(PGSTEAL_KSWAPD),
FOR_ALL_ZONES(PGSTEAL_DIRECT),
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 1e1bf9f963a9..d3d077228d4c 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -147,7 +147,8 @@ __remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old)
typedef int wait_bit_action_f(struct wait_bit_key *);
void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
-void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key);
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr,
+ void *key);
void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr);
void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr);
@@ -179,7 +180,7 @@ wait_queue_head_t *bit_waitqueue(void *, int);
#define wake_up_poll(x, m) \
__wake_up(x, TASK_NORMAL, 1, (void *) (m))
#define wake_up_locked_poll(x, m) \
- __wake_up_locked_key((x), TASK_NORMAL, (void *) (m))
+ __wake_up_locked_key((x), TASK_NORMAL, 1, (void *) (m))
#define wake_up_interruptible_poll(x, m) \
__wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m))
#define wake_up_interruptible_sync_poll(x, m) \
diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h
index f47feada5b42..d74a0e907b9e 100644
--- a/include/linux/watchdog.h
+++ b/include/linux/watchdog.h
@@ -140,12 +140,4 @@ extern int watchdog_init_timeout(struct watchdog_device *wdd,
extern int watchdog_register_device(struct watchdog_device *);
extern void watchdog_unregister_device(struct watchdog_device *);
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-void watchdog_nmi_disable_all(void);
-void watchdog_nmi_enable_all(void);
-#else
-static inline void watchdog_nmi_disable_all(void) {}
-static inline void watchdog_nmi_enable_all(void) {}
-#endif
-
#endif /* ifndef _LINUX_WATCHDOG_H */
diff --git a/include/linux/zbud.h b/include/linux/zbud.h
index f9d41a6e361f..e183a0a65ac1 100644
--- a/include/linux/zbud.h
+++ b/include/linux/zbud.h
@@ -9,7 +9,7 @@ struct zbud_ops {
int (*evict)(struct zbud_pool *pool, unsigned long handle);
};
-struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops);
+struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops);
void zbud_destroy_pool(struct zbud_pool *pool);
int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp,
unsigned long *handle);
diff --git a/include/linux/zpool.h b/include/linux/zpool.h
index d30eff3d84d5..42f8ec992452 100644
--- a/include/linux/zpool.h
+++ b/include/linux/zpool.h
@@ -36,8 +36,10 @@ enum zpool_mapmode {
ZPOOL_MM_DEFAULT = ZPOOL_MM_RW
};
+bool zpool_has_pool(char *type);
+
struct zpool *zpool_create_pool(char *type, char *name,
- gfp_t gfp, struct zpool_ops *ops);
+ gfp_t gfp, const struct zpool_ops *ops);
char *zpool_get_type(struct zpool *pool);
@@ -81,7 +83,7 @@ struct zpool_driver {
atomic_t refcount;
struct list_head list;
- void *(*create)(char *name, gfp_t gfp, struct zpool_ops *ops,
+ void *(*create)(char *name, gfp_t gfp, const struct zpool_ops *ops,
struct zpool *zpool);
void (*destroy)(void *pool);
diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
index 1338190b5478..6398dfae53f1 100644
--- a/include/linux/zsmalloc.h
+++ b/include/linux/zsmalloc.h
@@ -34,6 +34,11 @@ enum zs_mapmode {
*/
};
+struct zs_pool_stats {
+ /* How many pages were migrated (freed) */
+ unsigned long pages_compacted;
+};
+
struct zs_pool;
struct zs_pool *zs_create_pool(char *name, gfp_t flags);
@@ -49,4 +54,5 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle);
unsigned long zs_get_total_pages(struct zs_pool *pool);
unsigned long zs_compact(struct zs_pool *pool);
+void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats);
#endif
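A brief sketch of reading the compaction statistics exposed above; the pool pointer is assumed to exist in the caller:

	static void example_report_compaction(struct zs_pool *pool)
	{
		struct zs_pool_stats stats;

		zs_pool_stats(pool, &stats);
		pr_info("zsmalloc: %lu pages compacted\n", stats.pages_compacted);
	}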
diff --git a/include/net/sock.h b/include/net/sock.h
index 43c6abcf06ab..7aa78440559a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1042,42 +1042,9 @@ struct proto {
#endif
};
-/*
- * Bits in struct cg_proto.flags
- */
-enum cg_proto_flags {
- /* Currently active and new sockets should be assigned to cgroups */
- MEMCG_SOCK_ACTIVE,
- /* It was ever activated; we must disarm static keys on destruction */
- MEMCG_SOCK_ACTIVATED,
-};
-
-struct cg_proto {
- struct page_counter memory_allocated; /* Current allocated memory. */
- struct percpu_counter sockets_allocated; /* Current number of sockets. */
- int memory_pressure;
- long sysctl_mem[3];
- unsigned long flags;
- /*
- * memcg field is used to find which memcg we belong directly
- * Each memcg struct can hold more than one cg_proto, so container_of
- * won't really cut.
- *
- * The elegant solution would be having an inverse function to
- * proto_cgroup in struct proto, but that means polluting the structure
- * for everybody, instead of just for memcg users.
- */
- struct mem_cgroup *memcg;
-};
-
int proto_register(struct proto *prot, int alloc_slab);
void proto_unregister(struct proto *prot);
-static inline bool memcg_proto_active(struct cg_proto *cg_proto)
-{
- return test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
-}
-
#ifdef SOCK_REFCNT_DEBUG
static inline void sk_refcnt_debug_inc(struct sock *sk)
{
diff --git a/include/trace/events/tlb.h b/include/trace/events/tlb.h
index 4250f364a6ca..bc8815f45f3b 100644
--- a/include/trace/events/tlb.h
+++ b/include/trace/events/tlb.h
@@ -11,7 +11,8 @@
EM( TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" ) \
EM( TLB_REMOTE_SHOOTDOWN, "remote shootdown" ) \
EM( TLB_LOCAL_SHOOTDOWN, "local shootdown" ) \
- EMe( TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" )
+ EM( TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" ) \
+ EMe( TLB_REMOTE_SEND_IPI, "remote ipi send" )
/*
* First define the enums in TLB_FLUSH_REASON to be exported to userspace
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index ddc3b36f1046..0e821e3c3d45 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -25,6 +25,11 @@
# define MAP_UNINITIALIZED 0x0 /* Don't support this flag */
#endif
+/*
+ * Flags for mlock
+ */
+#define MLOCK_ONFAULT 0x01 /* Lock pages in range after they are faulted in, do not prefault */
+
#define MS_ASYNC 1 /* sync memory asynchronously */
#define MS_INVALIDATE 2 /* invalidate the caches */
#define MS_SYNC 4 /* synchronous memory sync */
@@ -34,6 +39,7 @@
#define MADV_SEQUENTIAL 2 /* expect sequential page references */
#define MADV_WILLNEED 3 /* will need these pages */
#define MADV_DONTNEED 4 /* don't need these pages */
+#define MADV_FREE 5 /* free pages only if memory pressure */
/* common parameters: try to keep these consistent across architectures */
#define MADV_REMOVE 9 /* remove these pages & resources */
diff --git a/include/uapi/asm-generic/mman.h b/include/uapi/asm-generic/mman.h
index e9fe6fd2a074..7162cd4cca73 100644
--- a/include/uapi/asm-generic/mman.h
+++ b/include/uapi/asm-generic/mman.h
@@ -17,5 +17,6 @@
#define MCL_CURRENT 1 /* lock all current mappings */
#define MCL_FUTURE 2 /* lock all future mappings */
+#define MCL_ONFAULT 4 /* lock all pages that are faulted in */
#endif /* __ASM_GENERIC_MMAN_H */
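An illustrative userspace use of on-fault locking; __NR_mlock2 is assumed to come from the updated uapi headers added elsewhere in this series, and the fallback define mirrors the MLOCK_ONFAULT value introduced above:

	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef MLOCK_ONFAULT
	#define MLOCK_ONFAULT 0x01	/* matches the uapi value added above */
	#endif

	/* Lock the range, but let pages be faulted in lazily on first touch. */
	static long lock_on_fault(void *addr, size_t len)
	{
		return syscall(__NR_mlock2, addr, len, MLOCK_ONFAULT);
	}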
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index e016bd9b1a04..14a6013cbdac 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -709,9 +709,11 @@ __SYSCALL(__NR_memfd_create, sys_memfd_create)
__SYSCALL(__NR_bpf, sys_bpf)
#define __NR_execveat 281
__SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)
+#define __NR_mlock2 282
+__SYSCALL(__NR_mlock2, sys_mlock2)
#undef __NR_syscalls
-#define __NR_syscalls 282
+#define __NR_syscalls 283
/*
* All syscalls below here should go away really,
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 2a70847ce79c..ba94b9bb7bee 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -457,3 +457,4 @@ header-y += xfrm.h
header-y += xilinx-v4l2-controls.h
header-y += zorro.h
header-y += zorro_ids.h
+header-y += userfaultfd.h
diff --git a/include/uapi/linux/kernel-page-flags.h b/include/uapi/linux/kernel-page-flags.h
index a6c4962e5d46..5da5f8751ce7 100644
--- a/include/uapi/linux/kernel-page-flags.h
+++ b/include/uapi/linux/kernel-page-flags.h
@@ -33,6 +33,7 @@
#define KPF_THP 22
#define KPF_BALLOON 23
#define KPF_ZERO_PAGE 24
+#define KPF_IDLE 25
#endif /* _UAPILINUX_KERNEL_PAGE_FLAGS_H */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 31891d9535e2..a8d0759a9e40 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -190,4 +190,11 @@ struct prctl_mm_map {
# define PR_FP_MODE_FR (1 << 0) /* 64b FP registers */
# define PR_FP_MODE_FRE (1 << 1) /* 32b compatibility */
+/* Control the ambient capability set */
+#define PR_CAP_AMBIENT 47
+# define PR_CAP_AMBIENT_IS_SET 1
+# define PR_CAP_AMBIENT_RAISE 2
+# define PR_CAP_AMBIENT_LOWER 3
+# define PR_CAP_AMBIENT_CLEAR_ALL 4
+
#endif /* _LINUX_PRCTL_H */
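A userspace sketch of raising an ambient capability; it assumes headers carrying the new PR_CAP_AMBIENT constants and that the capability is already present in the permitted and inheritable sets:

	#include <stdio.h>
	#include <sys/prctl.h>
	#include <linux/capability.h>

	static int raise_ambient_net_bind(void)
	{
		/* Once in the ambient set, the capability survives execve(). */
		if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE,
			  CAP_NET_BIND_SERVICE, 0, 0)) {
			perror("PR_CAP_AMBIENT_RAISE");
			return -1;
		}
		return 0;
	}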
diff --git a/include/uapi/linux/securebits.h b/include/uapi/linux/securebits.h
index 985aac9e6bf8..35ac35cef217 100644
--- a/include/uapi/linux/securebits.h
+++ b/include/uapi/linux/securebits.h
@@ -43,9 +43,18 @@
#define SECBIT_KEEP_CAPS (issecure_mask(SECURE_KEEP_CAPS))
#define SECBIT_KEEP_CAPS_LOCKED (issecure_mask(SECURE_KEEP_CAPS_LOCKED))
+/* When set, a process cannot add new capabilities to its ambient set. */
+#define SECURE_NO_CAP_AMBIENT_RAISE 6
+#define SECURE_NO_CAP_AMBIENT_RAISE_LOCKED 7 /* make bit-6 immutable */
+
+#define SECBIT_NO_CAP_AMBIENT_RAISE (issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE))
+#define SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED \
+ (issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE_LOCKED))
+
#define SECURE_ALL_BITS (issecure_mask(SECURE_NOROOT) | \
issecure_mask(SECURE_NO_SETUID_FIXUP) | \
- issecure_mask(SECURE_KEEP_CAPS))
+ issecure_mask(SECURE_KEEP_CAPS) | \
+ issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE))
#define SECURE_ALL_LOCKS (SECURE_ALL_BITS << 1)
#endif /* _UAPI_LINUX_SECUREBITS_H */
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
new file mode 100644
index 000000000000..df0e09bb7dd5
--- /dev/null
+++ b/include/uapi/linux/userfaultfd.h
@@ -0,0 +1,169 @@
+/*
+ * include/linux/userfaultfd.h
+ *
+ * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ */
+
+#ifndef _LINUX_USERFAULTFD_H
+#define _LINUX_USERFAULTFD_H
+
+#include <linux/types.h>
+
+#include <linux/compiler.h>
+
+#define UFFD_API ((__u64)0xAA)
+/*
+ * After implementing the respective features it will become:
+ * #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \
+ * UFFD_FEATURE_EVENT_FORK)
+ */
+#define UFFD_API_FEATURES (0)
+#define UFFD_API_IOCTLS \
+ ((__u64)1 << _UFFDIO_REGISTER | \
+ (__u64)1 << _UFFDIO_UNREGISTER | \
+ (__u64)1 << _UFFDIO_API)
+#define UFFD_API_RANGE_IOCTLS \
+ ((__u64)1 << _UFFDIO_WAKE | \
+ (__u64)1 << _UFFDIO_COPY | \
+ (__u64)1 << _UFFDIO_ZEROPAGE)
+
+/*
+ * The valid ioctl command number range with this API is from 0x00 to
+ * 0x3F. UFFDIO_API is the fixed number; everything else can be
+ * changed by implementing a different UFFD_API. If sticking to the
+ * same UFFD_API, more ioctls can be added, and userland will learn
+ * which ioctls the running kernel implements through the ioctl command
+ * bitmask written by the UFFDIO_API.
+ */
+#define _UFFDIO_REGISTER (0x00)
+#define _UFFDIO_UNREGISTER (0x01)
+#define _UFFDIO_WAKE (0x02)
+#define _UFFDIO_COPY (0x03)
+#define _UFFDIO_ZEROPAGE (0x04)
+#define _UFFDIO_API (0x3F)
+
+/* userfaultfd ioctl ids */
+#define UFFDIO 0xAA
+#define UFFDIO_API _IOWR(UFFDIO, _UFFDIO_API, \
+ struct uffdio_api)
+#define UFFDIO_REGISTER _IOWR(UFFDIO, _UFFDIO_REGISTER, \
+ struct uffdio_register)
+#define UFFDIO_UNREGISTER _IOR(UFFDIO, _UFFDIO_UNREGISTER, \
+ struct uffdio_range)
+#define UFFDIO_WAKE _IOR(UFFDIO, _UFFDIO_WAKE, \
+ struct uffdio_range)
+#define UFFDIO_COPY _IOWR(UFFDIO, _UFFDIO_COPY, \
+ struct uffdio_copy)
+#define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \
+ struct uffdio_zeropage)
+
+/* read() structure */
+struct uffd_msg {
+ __u8 event;
+
+ __u8 reserved1;
+ __u16 reserved2;
+ __u32 reserved3;
+
+ union {
+ struct {
+ __u64 flags;
+ __u64 address;
+ } pagefault;
+
+ struct {
+ /* unused reserved fields */
+ __u64 reserved1;
+ __u64 reserved2;
+ __u64 reserved3;
+ } reserved;
+ } arg;
+} __packed;
+
+/*
+ * Start at 0x12 and not at 0 to be more strict against bugs.
+ */
+#define UFFD_EVENT_PAGEFAULT 0x12
+#if 0 /* not available yet */
+#define UFFD_EVENT_FORK 0x13
+#endif
+
+/* flags for UFFD_EVENT_PAGEFAULT */
+#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */
+#define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */
+
+struct uffdio_api {
+ /* userland asks for an API number and the features to enable */
+ __u64 api;
+ /*
+ * Kernel answers below with all the available features for
+ * the API; this notifies userland of which events and/or
+ * which flags for each event are enabled in the current
+ * kernel.
+ *
+ * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE
+ * are to be considered implicitly always enabled in all kernels as
+ * long as the uffdio_api.api requested matches UFFD_API.
+ */
+#if 0 /* not available yet */
+#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0)
+#define UFFD_FEATURE_EVENT_FORK (1<<1)
+#endif
+ __u64 features;
+
+ __u64 ioctls;
+};
+
+struct uffdio_range {
+ __u64 start;
+ __u64 len;
+};
+
+struct uffdio_register {
+ struct uffdio_range range;
+#define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0)
+#define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1)
+ __u64 mode;
+
+ /*
+ * kernel answers which ioctl commands are available for the
+ * range, keep at the end as the last 8 bytes aren't read.
+ */
+ __u64 ioctls;
+};
+
+struct uffdio_copy {
+ __u64 dst;
+ __u64 src;
+ __u64 len;
+ /*
+ * There will be a wrprotection flag later that allows mapping
+ * pages wrprotected on the fly. Such a flag will be
+ * available if the wrprotection ioctls are implemented for the
+ * range according to the uffdio_register.ioctls.
+ */
+#define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1<<0)
+ __u64 mode;
+
+ /*
+ * "copy" is written by the ioctl and must be at the end: the
+ * copy_from_user will not read the last 8 bytes.
+ */
+ __s64 copy;
+};
+
+struct uffdio_zeropage {
+ struct uffdio_range range;
+#define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1<<0)
+ __u64 mode;
+
+ /*
+ * "zeropage" is written by the ioctl and must be at the end:
+ * the copy_from_user will not read the last 8 bytes.
+ */
+ __s64 zeropage;
+};
+
+#endif /* _LINUX_USERFAULTFD_H */
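For orientation, a hedged userspace sketch of the handshake and registration described above; __NR_userfaultfd comes from the arch syscall tables updated elsewhere in this series, and error handling is elided:

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <linux/userfaultfd.h>

	static int example_uffd_register(void *addr, unsigned long len)
	{
		struct uffdio_api api = { .api = UFFD_API };
		struct uffdio_register reg = {
			.range = { .start = (unsigned long)addr, .len = len },
			.mode  = UFFDIO_REGISTER_MODE_MISSING,
		};
		int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

		ioctl(uffd, UFFDIO_API, &api);		/* negotiate UFFD_API and features */
		ioctl(uffd, UFFDIO_REGISTER, &reg);	/* reg.ioctls reports WAKE/COPY/ZEROPAGE */

		/* Missing-page faults in the range are now read() as struct uffd_msg. */
		return uffd;
	}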
diff --git a/init/Kconfig b/init/Kconfig
index 26afe06cad3e..88ee0fe59048 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -526,6 +526,7 @@ config RCU_EXPERT
config SRCU
bool
+ default y
help
This option selects the sleepable version of RCU. This version
permits arbitrary sleeping or blocking within RCU read-side critical
@@ -896,6 +897,16 @@ config ARCH_SUPPORTS_NUMA_BALANCING
bool
#
+# For architectures that prefer to flush all TLBs after a number of pages
+# are unmapped instead of sending one IPI per page to flush. The architecture
+# must provide guarantees on what happens if a clean TLB cache entry is
+# written after the unmap. Details are in mm/rmap.c near the check for
+# should_defer_flush. The architecture should also consider if the full flush
+# and the refill costs are offset by the savings of sending fewer IPIs.
+config ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+ bool
+
+#
# For architectures that know their GCC __int128 support is sound
#
config ARCH_SUPPORTS_INT128
@@ -1659,6 +1670,17 @@ config ADVISE_SYSCALLS
applications use these syscalls, you can disable this option to save
space.
+config USERFAULTFD
+ bool "Enable userfaultfd() system call"
+ select ANON_INODES
+ default y
+ depends on MMU
+ help
+ Enable the userfaultfd() system call that allows userland to
+ intercept and handle page faults.
+
+ If unsure, say Y.
+
config PCI_QUIRKS
default y
bool "Enable PCI quirk workarounds" if EXPERT
diff --git a/init/initramfs.c b/init/initramfs.c
index ad1bd7787bbb..b32ad7d97ac9 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -526,14 +526,14 @@ extern unsigned long __initramfs_size;
static void __init free_initrd(void)
{
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
unsigned long crashk_start = (unsigned long)__va(crashk_res.start);
unsigned long crashk_end = (unsigned long)__va(crashk_res.end);
#endif
if (do_retain_initrd)
goto skip;
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
/*
* If the initrd region is overlapped with crashkernel reserved region,
* free only memory that is not part of crashkernel region.
diff --git a/init/main.c b/init/main.c
index 56506553d4d8..9e64d7097f1a 100644
--- a/init/main.c
+++ b/init/main.c
@@ -877,7 +877,6 @@ static void __init do_initcalls(void)
static void __init do_basic_setup(void)
{
cpuset_init_smp();
- usermodehelper_init();
shmem_init();
driver_init();
init_irq_proc();
diff --git a/ipc/msg.c b/ipc/msg.c
index 66c4f567eb73..f675689290ca 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -37,6 +37,7 @@
#include <linux/rwsem.h>
#include <linux/nsproxy.h>
#include <linux/ipc_namespace.h>
+#include <linux/freezer.h>
#include <asm/current.h>
#include <linux/uaccess.h>
@@ -675,7 +676,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
ipc_unlock_object(&msq->q_perm);
rcu_read_unlock();
- schedule();
+ freezable_schedule();
rcu_read_lock();
ipc_lock_object(&msq->q_perm);
@@ -917,7 +918,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl
ipc_unlock_object(&msq->q_perm);
rcu_read_unlock();
- schedule();
+ freezable_schedule();
/* Lockless receive, part 1:
* Disable preemption. We don't hold a reference to the queue
diff --git a/ipc/msgutil.c b/ipc/msgutil.c
index 2b491590ebab..71f448e5e927 100644
--- a/ipc/msgutil.c
+++ b/ipc/msgutil.c
@@ -123,7 +123,7 @@ struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst)
size_t len = src->m_ts;
size_t alen;
- BUG_ON(dst == NULL);
+ WARN_ON(dst == NULL);
if (src->m_ts > dst->m_ts)
return ERR_PTR(-EINVAL);
diff --git a/ipc/shm.c b/ipc/shm.c
index 4aef24d91b63..222131e8e38f 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -159,7 +159,7 @@ static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
* We raced in the idr lookup or with shm_destroy(). Either way, the
* ID is busted.
*/
- BUG_ON(IS_ERR(ipcp));
+ WARN_ON(IS_ERR(ipcp));
return container_of(ipcp, struct shmid_kernel, shm_perm);
}
@@ -393,7 +393,7 @@ static int shm_mmap(struct file *file, struct vm_area_struct *vma)
return ret;
sfd->vm_ops = vma->vm_ops;
#ifdef CONFIG_MMU
- BUG_ON(!sfd->vm_ops->fault);
+ WARN_ON(!sfd->vm_ops->fault);
#endif
vma->vm_ops = &shm_vm_ops;
shm_open(vma);
diff --git a/kernel/Makefile b/kernel/Makefile
index c8b2b4fce988..0edb1d10b800 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -50,7 +50,9 @@ obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_MODULE_SIG) += module_signing.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
+obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
obj-$(CONFIG_KEXEC) += kexec.o
+obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup.o
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f3f5cd5e2c0d..2cf0f79f1fc9 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1342,7 +1342,7 @@ static int cgroup_show_options(struct seq_file *seq,
if (root != &cgrp_dfl_root)
for_each_subsys(ss, ssid)
if (root->subsys_mask & (1 << ssid))
- seq_printf(seq, ",%s", ss->legacy_name);
+ seq_show_option(seq, ss->legacy_name, NULL);
if (root->flags & CGRP_ROOT_NOPREFIX)
seq_puts(seq, ",noprefix");
if (root->flags & CGRP_ROOT_XATTR)
@@ -1350,13 +1350,14 @@ static int cgroup_show_options(struct seq_file *seq,
spin_lock(&release_agent_path_lock);
if (strlen(root->release_agent_path))
- seq_printf(seq, ",release_agent=%s", root->release_agent_path);
+ seq_show_option(seq, "release_agent",
+ root->release_agent_path);
spin_unlock(&release_agent_path_lock);
if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
seq_puts(seq, ",clone_children");
if (strlen(root->name))
- seq_printf(seq, ",name=%s", root->name);
+ seq_show_option(seq, "name", root->name);
return 0;
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e8183895691c..f548f69c4299 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9094,7 +9094,7 @@ static void perf_event_init_cpu(int cpu)
mutex_unlock(&swhash->hlist_mutex);
}
-#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
+#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
static void __perf_event_exit_context(void *__info)
{
struct remove_event re = { .detach_group = true };
diff --git a/kernel/extable.c b/kernel/extable.c
index c98f926277a8..e820ccee9846 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -18,7 +18,6 @@
#include <linux/ftrace.h>
#include <linux/memory.h>
#include <linux/module.h>
-#include <linux/ftrace.h>
#include <linux/mutex.h>
#include <linux/init.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index 03aa2e6de7a4..615a347a9715 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -454,8 +454,10 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
tmp->vm_mm = mm;
if (anon_vma_fork(tmp, mpnt))
goto fail_nomem_anon_vma_fork;
- tmp->vm_flags &= ~VM_LOCKED;
+ tmp->vm_flags &= ~(VM_LOCKED | VM_UFFD_MISSING | VM_UFFD_WP |
+ VM_LOCKONFAULT);
tmp->vm_next = tmp->vm_prev = NULL;
+ tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
file = tmp->vm_file;
if (file) {
struct inode *inode = file_inode(file);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index a785c1015e25..4c5edc357923 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1,156 +1,22 @@
/*
- * kexec.c - kexec system call
+ * kexec.c - kexec_load system call
* Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
*
* This source code is licensed under the GNU General Public License,
* Version 2. See the file COPYING for more details.
*/
-#define pr_fmt(fmt) "kexec: " fmt
-
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
#include <linux/kexec.h>
#include <linux/mutex.h>
#include <linux/list.h>
-#include <linux/highmem.h>
#include <linux/syscalls.h>
-#include <linux/reboot.h>
-#include <linux/ioport.h>
-#include <linux/hardirq.h>
-#include <linux/elf.h>
-#include <linux/elfcore.h>
-#include <linux/utsname.h>
-#include <linux/numa.h>
-#include <linux/suspend.h>
-#include <linux/device.h>
-#include <linux/freezer.h>
-#include <linux/pm.h>
-#include <linux/cpu.h>
-#include <linux/console.h>
#include <linux/vmalloc.h>
-#include <linux/swap.h>
-#include <linux/syscore_ops.h>
-#include <linux/compiler.h>
-#include <linux/hugetlb.h>
-
-#include <asm/page.h>
-#include <asm/uaccess.h>
-#include <asm/io.h>
-#include <asm/sections.h>
-
-#include <crypto/hash.h>
-#include <crypto/sha.h>
-
-/* Per cpu memory for storing cpu states in case of system crash. */
-note_buf_t __percpu *crash_notes;
-
-/* vmcoreinfo stuff */
-static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
-u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
-size_t vmcoreinfo_size;
-size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
-
-/* Flag to indicate we are going to kexec a new kernel */
-bool kexec_in_progress = false;
-
-/*
- * Declare these symbols weak so that if architecture provides a purgatory,
- * these will be overridden.
- */
-char __weak kexec_purgatory[0];
-size_t __weak kexec_purgatory_size = 0;
-
-#ifdef CONFIG_KEXEC_FILE
-static int kexec_calculate_store_digests(struct kimage *image);
-#endif
-
-/* Location of the reserved area for the crash kernel */
-struct resource crashk_res = {
- .name = "Crash kernel",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-struct resource crashk_low_res = {
- .name = "Crash kernel",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-int kexec_should_crash(struct task_struct *p)
-{
- /*
- * If crash_kexec_post_notifiers is enabled, don't run
- * crash_kexec() here yet, which must be run after panic
- * notifiers in panic().
- */
- if (crash_kexec_post_notifiers)
- return 0;
- /*
- * There are 4 panic() calls in do_exit() path, each of which
- * corresponds to each of these 4 conditions.
- */
- if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
- return 1;
- return 0;
-}
-
-/*
- * When kexec transitions to the new kernel there is a one-to-one
- * mapping between physical and virtual addresses. On processors
- * where you can disable the MMU this is trivial, and easy. For
- * others it is still a simple predictable page table to setup.
- *
- * In that environment kexec copies the new kernel to its final
- * resting place. This means I can only support memory whose
- * physical address can fit in an unsigned long. In particular
- * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
- * If the assembly stub has more restrictive requirements
- * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
- * defined more restrictively in <asm/kexec.h>.
- *
- * The code for the transition from the current kernel to the
- * the new kernel is placed in the control_code_buffer, whose size
- * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single
- * page of memory is necessary, but some architectures require more.
- * Because this memory must be identity mapped in the transition from
- * virtual to physical addresses it must live in the range
- * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
- * modifiable.
- *
- * The assembly stub in the control code buffer is passed a linked list
- * of descriptor pages detailing the source pages of the new kernel,
- * and the destination addresses of those source pages. As this data
- * structure is not used in the context of the current OS, it must
- * be self-contained.
- *
- * The code has been made to work with highmem pages and will use a
- * destination page in its final resting place (if it happens
- * to allocate it). The end product of this is that most of the
- * physical address space, and most of RAM can be used.
- *
- * Future directions include:
- * - allocating a page table with the control code buffer identity
- * mapped, to simplify machine_kexec and make kexec_on_panic more
- * reliable.
- */
-
-/*
- * KIMAGE_NO_DEST is an impossible destination address..., for
- * allocating pages whose destination address we do not care about.
- */
-#define KIMAGE_NO_DEST (-1UL)
+#include <linux/slab.h>
-static int kimage_is_destination_range(struct kimage *image,
- unsigned long start, unsigned long end);
-static struct page *kimage_alloc_page(struct kimage *image,
- gfp_t gfp_mask,
- unsigned long dest);
+#include "kexec_internal.h"
static int copy_user_segment_list(struct kimage *image,
unsigned long nr_segments,
@@ -169,125 +35,6 @@ static int copy_user_segment_list(struct kimage *image,
return ret;
}
-static int sanity_check_segment_list(struct kimage *image)
-{
- int result, i;
- unsigned long nr_segments = image->nr_segments;
-
- /*
- * Verify we have good destination addresses. The caller is
- * responsible for making certain we don't attempt to load
- * the new image into invalid or reserved areas of RAM. This
- * just verifies it is an address we can use.
- *
- * Since the kernel does everything in page size chunks ensure
- * the destination addresses are page aligned. Too many
- * special cases crop of when we don't do this. The most
- * insidious is getting overlapping destination addresses
- * simply because addresses are changed to page size
- * granularity.
- */
- result = -EADDRNOTAVAIL;
- for (i = 0; i < nr_segments; i++) {
- unsigned long mstart, mend;
-
- mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz;
- if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
- return result;
- if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
- return result;
- }
-
- /* Verify our destination addresses do not overlap.
- * If we alloed overlapping destination addresses
- * through very weird things can happen with no
- * easy explanation as one segment stops on another.
- */
- result = -EINVAL;
- for (i = 0; i < nr_segments; i++) {
- unsigned long mstart, mend;
- unsigned long j;
-
- mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz;
- for (j = 0; j < i; j++) {
- unsigned long pstart, pend;
- pstart = image->segment[j].mem;
- pend = pstart + image->segment[j].memsz;
- /* Do the segments overlap ? */
- if ((mend > pstart) && (mstart < pend))
- return result;
- }
- }
-
- /* Ensure our buffer sizes are strictly less than
- * our memory sizes. This should always be the case,
- * and it is easier to check up front than to be surprised
- * later on.
- */
- result = -EINVAL;
- for (i = 0; i < nr_segments; i++) {
- if (image->segment[i].bufsz > image->segment[i].memsz)
- return result;
- }
-
- /*
- * Verify we have good destination addresses. Normally
- * the caller is responsible for making certain we don't
- * attempt to load the new image into invalid or reserved
- * areas of RAM. But crash kernels are preloaded into a
- * reserved area of RAM. We must ensure the addresses
- * are within the reserved area, otherwise preloading the
- * kernel could corrupt things.
- */
-
- if (image->type == KEXEC_TYPE_CRASH) {
- result = -EADDRNOTAVAIL;
- for (i = 0; i < nr_segments; i++) {
- unsigned long mstart, mend;
-
- mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz - 1;
- /* Ensure we are within the crash kernel limits */
- if ((mstart < crashk_res.start) ||
- (mend > crashk_res.end))
- return result;
- }
- }
-
- return 0;
-}
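
The overlap check above treats each segment as the half-open interval [mem, mem + memsz). A minimal standalone sketch of the same predicate, with illustrative values that are not taken from this patch:

/* Nonzero iff half-open intervals [a_start, a_end) and [b_start, b_end)
 * overlap: each one must start before the other one ends. */
static int ranges_overlap(unsigned long a_start, unsigned long a_end,
			  unsigned long b_start, unsigned long b_end)
{
	return (a_end > b_start) && (a_start < b_end);
}

/*
 * Example: [0x100000, 0x200000) and [0x1f0000, 0x300000) overlap, while
 * [0x100000, 0x200000) and [0x200000, 0x300000) merely touch and do not,
 * which is why page-aligned, non-overlapping segments can be packed
 * back to back.
 */
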
-
-static struct kimage *do_kimage_alloc_init(void)
-{
- struct kimage *image;
-
- /* Allocate a controlling structure */
- image = kzalloc(sizeof(*image), GFP_KERNEL);
- if (!image)
- return NULL;
-
- image->head = 0;
- image->entry = &image->head;
- image->last_entry = &image->head;
- image->control_page = ~0; /* By default this does not apply */
- image->type = KEXEC_TYPE_DEFAULT;
-
- /* Initialize the list of control pages */
- INIT_LIST_HEAD(&image->control_pages);
-
- /* Initialize the list of destination pages */
- INIT_LIST_HEAD(&image->dest_pages);
-
- /* Initialize the list of unusable pages */
- INIT_LIST_HEAD(&image->unusable_pages);
-
- return image;
-}
-
-static void kimage_free_page_list(struct list_head *list);
-
static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
unsigned long nr_segments,
struct kexec_segment __user *segments,
@@ -354,873 +101,6 @@ out_free_image:
return ret;
}
-#ifdef CONFIG_KEXEC_FILE
-static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
-{
- struct fd f = fdget(fd);
- int ret;
- struct kstat stat;
- loff_t pos;
- ssize_t bytes = 0;
-
- if (!f.file)
- return -EBADF;
-
- ret = vfs_getattr(&f.file->f_path, &stat);
- if (ret)
- goto out;
-
- if (stat.size > INT_MAX) {
- ret = -EFBIG;
- goto out;
- }
-
- /* Don't hand 0 to vmalloc, it whines. */
- if (stat.size == 0) {
- ret = -EINVAL;
- goto out;
- }
-
- *buf = vmalloc(stat.size);
- if (!*buf) {
- ret = -ENOMEM;
- goto out;
- }
-
- pos = 0;
- while (pos < stat.size) {
- bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
- stat.size - pos);
- if (bytes < 0) {
- vfree(*buf);
- ret = bytes;
- goto out;
- }
-
- if (bytes == 0)
- break;
- pos += bytes;
- }
-
- if (pos != stat.size) {
- ret = -EBADF;
- vfree(*buf);
- goto out;
- }
-
- *buf_len = pos;
-out:
- fdput(f);
- return ret;
-}
-
-/* Architectures can provide this probe function */
-int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
- unsigned long buf_len)
-{
- return -ENOEXEC;
-}
-
-void * __weak arch_kexec_kernel_image_load(struct kimage *image)
-{
- return ERR_PTR(-ENOEXEC);
-}
-
-void __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
-{
-}
-
-int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
- unsigned long buf_len)
-{
- return -EKEYREJECTED;
-}
-
-/* Apply relocations of type RELA */
-int __weak
-arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
- unsigned int relsec)
-{
- pr_err("RELA relocation unsupported.\n");
- return -ENOEXEC;
-}
-
-/* Apply relocations of type REL */
-int __weak
-arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
- unsigned int relsec)
-{
- pr_err("REL relocation unsupported.\n");
- return -ENOEXEC;
-}
-
-/*
- * Free up memory used by kernel, initrd, and command line. This is temporary
- * memory allocation which is not needed any more after these buffers have
- * been loaded into separate segments and have been copied elsewhere.
- */
-static void kimage_file_post_load_cleanup(struct kimage *image)
-{
- struct purgatory_info *pi = &image->purgatory_info;
-
- vfree(image->kernel_buf);
- image->kernel_buf = NULL;
-
- vfree(image->initrd_buf);
- image->initrd_buf = NULL;
-
- kfree(image->cmdline_buf);
- image->cmdline_buf = NULL;
-
- vfree(pi->purgatory_buf);
- pi->purgatory_buf = NULL;
-
- vfree(pi->sechdrs);
- pi->sechdrs = NULL;
-
- /* See if architecture has anything to cleanup post load */
- arch_kimage_file_post_load_cleanup(image);
-
- /*
- * The above call should have called into the bootloader to free up
- * any data stored in kimage->image_loader_data. It should
- * be OK now to free it up.
- */
- kfree(image->image_loader_data);
- image->image_loader_data = NULL;
-}
-
-/*
- * In file mode the list of segments is prepared by the kernel. Copy the
- * relevant data from user space, do error checking, and prepare the segment list.
- */
-static int
-kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
- const char __user *cmdline_ptr,
- unsigned long cmdline_len, unsigned flags)
-{
- int ret = 0;
- void *ldata;
-
- ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
- &image->kernel_buf_len);
- if (ret)
- return ret;
-
- /* Call arch image probe handlers */
- ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
- image->kernel_buf_len);
-
- if (ret)
- goto out;
-
-#ifdef CONFIG_KEXEC_VERIFY_SIG
- ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
- image->kernel_buf_len);
- if (ret) {
- pr_debug("kernel signature verification failed.\n");
- goto out;
- }
- pr_debug("kernel signature verification successful.\n");
-#endif
- /* It is possible that no initramfs is being loaded */
- if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
- ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
- &image->initrd_buf_len);
- if (ret)
- goto out;
- }
-
- if (cmdline_len) {
- image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
- if (!image->cmdline_buf) {
- ret = -ENOMEM;
- goto out;
- }
-
- ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
- cmdline_len);
- if (ret) {
- ret = -EFAULT;
- goto out;
- }
-
- image->cmdline_buf_len = cmdline_len;
-
- /* the command line should be a string whose last byte is NUL */
- if (image->cmdline_buf[cmdline_len - 1] != '\0') {
- ret = -EINVAL;
- goto out;
- }
- }
-
- /* Call arch image load handlers */
- ldata = arch_kexec_kernel_image_load(image);
-
- if (IS_ERR(ldata)) {
- ret = PTR_ERR(ldata);
- goto out;
- }
-
- image->image_loader_data = ldata;
-out:
- /* In case of error, free up all allocated memory in this function */
- if (ret)
- kimage_file_post_load_cleanup(image);
- return ret;
-}
-
-static int
-kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
- int initrd_fd, const char __user *cmdline_ptr,
- unsigned long cmdline_len, unsigned long flags)
-{
- int ret;
- struct kimage *image;
- bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH;
-
- image = do_kimage_alloc_init();
- if (!image)
- return -ENOMEM;
-
- image->file_mode = 1;
-
- if (kexec_on_panic) {
- /* Enable special crash kernel control page alloc policy. */
- image->control_page = crashk_res.start;
- image->type = KEXEC_TYPE_CRASH;
- }
-
- ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
- cmdline_ptr, cmdline_len, flags);
- if (ret)
- goto out_free_image;
-
- ret = sanity_check_segment_list(image);
- if (ret)
- goto out_free_post_load_bufs;
-
- ret = -ENOMEM;
- image->control_code_page = kimage_alloc_control_pages(image,
- get_order(KEXEC_CONTROL_PAGE_SIZE));
- if (!image->control_code_page) {
- pr_err("Could not allocate control_code_buffer\n");
- goto out_free_post_load_bufs;
- }
-
- if (!kexec_on_panic) {
- image->swap_page = kimage_alloc_control_pages(image, 0);
- if (!image->swap_page) {
- pr_err("Could not allocate swap buffer\n");
- goto out_free_control_pages;
- }
- }
-
- *rimage = image;
- return 0;
-out_free_control_pages:
- kimage_free_page_list(&image->control_pages);
-out_free_post_load_bufs:
- kimage_file_post_load_cleanup(image);
-out_free_image:
- kfree(image);
- return ret;
-}
-#else /* CONFIG_KEXEC_FILE */
-static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
-#endif /* CONFIG_KEXEC_FILE */
-
-static int kimage_is_destination_range(struct kimage *image,
- unsigned long start,
- unsigned long end)
-{
- unsigned long i;
-
- for (i = 0; i < image->nr_segments; i++) {
- unsigned long mstart, mend;
-
- mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz;
- if ((end > mstart) && (start < mend))
- return 1;
- }
-
- return 0;
-}
-
-static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
-{
- struct page *pages;
-
- pages = alloc_pages(gfp_mask, order);
- if (pages) {
- unsigned int count, i;
- pages->mapping = NULL;
- set_page_private(pages, order);
- count = 1 << order;
- for (i = 0; i < count; i++)
- SetPageReserved(pages + i);
- }
-
- return pages;
-}
-
-static void kimage_free_pages(struct page *page)
-{
- unsigned int order, count, i;
-
- order = page_private(page);
- count = 1 << order;
- for (i = 0; i < count; i++)
- ClearPageReserved(page + i);
- __free_pages(page, order);
-}
-
-static void kimage_free_page_list(struct list_head *list)
-{
- struct list_head *pos, *next;
-
- list_for_each_safe(pos, next, list) {
- struct page *page;
-
- page = list_entry(pos, struct page, lru);
- list_del(&page->lru);
- kimage_free_pages(page);
- }
-}
-
-static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
- unsigned int order)
-{
- /* Control pages are special, they are the intermediaries
- * that are needed while we copy the rest of the pages
- * to their final resting place. As such they must
- * not conflict with either the destination addresses
- * or memory the kernel is already using.
- *
- * The only case where we really need more than one of
- * these is for architectures where we cannot disable
- * the MMU and must instead generate an identity mapped
- * page table for all of the memory.
- *
- * At worst this runs in O(N) of the image size.
- */
- struct list_head extra_pages;
- struct page *pages;
- unsigned int count;
-
- count = 1 << order;
- INIT_LIST_HEAD(&extra_pages);
-
- /* Loop while I can allocate a page and the page allocated
- * is a destination page.
- */
- do {
- unsigned long pfn, epfn, addr, eaddr;
-
- pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
- if (!pages)
- break;
- pfn = page_to_pfn(pages);
- epfn = pfn + count;
- addr = pfn << PAGE_SHIFT;
- eaddr = epfn << PAGE_SHIFT;
- if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
- kimage_is_destination_range(image, addr, eaddr)) {
- list_add(&pages->lru, &extra_pages);
- pages = NULL;
- }
- } while (!pages);
-
- if (pages) {
- /* Remember the allocated page... */
- list_add(&pages->lru, &image->control_pages);
-
- /* Because the page is already in its destination
- * location we will never allocate another page at
- * that address. Therefore kimage_alloc_pages
- * will not return it (again) and we don't need
- * to give it an entry in image->segment[].
- */
- }
- /* Deal with the destination pages I have inadvertently allocated.
- *
- * Ideally I would convert multi-page allocations into single
- * page allocations, and add everything to image->dest_pages.
- *
- * For now it is simpler to just free the pages.
- */
- kimage_free_page_list(&extra_pages);
-
- return pages;
-}
-
-static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
- unsigned int order)
-{
- /* Control pages are special, they are the intermediaries
- * that are needed while we copy the rest of the pages
- * to their final resting place. As such they must
- * not conflict with either the destination addresses
- * or memory the kernel is already using.
- *
- * Control pages are also the only pages we must allocate
- * when loading a crash kernel. All of the other pages
- * are specified by the segments and we just memcpy
- * into them directly.
- *
- * The only case where we really need more than one of
- * these is for architectures where we cannot disable
- * the MMU and must instead generate an identity mapped
- * page table for all of the memory.
- *
- * Given the low demand this implements a very simple
- * allocator that finds the first hole of the appropriate
- * size in the reserved memory region, and allocates all
- * of the memory up to and including the hole.
- */
- unsigned long hole_start, hole_end, size;
- struct page *pages;
-
- pages = NULL;
- size = (1 << order) << PAGE_SHIFT;
- hole_start = (image->control_page + (size - 1)) & ~(size - 1);
- hole_end = hole_start + size - 1;
- while (hole_end <= crashk_res.end) {
- unsigned long i;
-
- if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
- break;
- /* See if I overlap any of the segments */
- for (i = 0; i < image->nr_segments; i++) {
- unsigned long mstart, mend;
-
- mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz - 1;
- if ((hole_end >= mstart) && (hole_start <= mend)) {
- /* Advance the hole to the end of the segment */
- hole_start = (mend + (size - 1)) & ~(size - 1);
- hole_end = hole_start + size - 1;
- break;
- }
- }
- /* If I don't overlap any segments I have found my hole! */
- if (i == image->nr_segments) {
- pages = pfn_to_page(hole_start >> PAGE_SHIFT);
- break;
- }
- }
- if (pages)
- image->control_page = hole_end;
-
- return pages;
-}
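
The hole search above relies on the usual power-of-two round-up idiom; size is (1 << order) << PAGE_SHIFT and therefore always a power of two. A minimal sketch of that idiom, with illustrative values that are not from the patch:

/* Round x up to the next multiple of size; size must be a power of two. */
static unsigned long round_up_pow2(unsigned long x, unsigned long size)
{
	return (x + (size - 1)) & ~(size - 1);
}

/*
 * e.g. round_up_pow2(0x12345, 0x1000) == 0x13000, so a hole that has to
 * skip past a segment ending at 0x12345 restarts at the next size-aligned
 * boundary, exactly as hole_start is advanced above.
 */
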
-
-
-struct page *kimage_alloc_control_pages(struct kimage *image,
- unsigned int order)
-{
- struct page *pages = NULL;
-
- switch (image->type) {
- case KEXEC_TYPE_DEFAULT:
- pages = kimage_alloc_normal_control_pages(image, order);
- break;
- case KEXEC_TYPE_CRASH:
- pages = kimage_alloc_crash_control_pages(image, order);
- break;
- }
-
- return pages;
-}
-
-static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
-{
- if (*image->entry != 0)
- image->entry++;
-
- if (image->entry == image->last_entry) {
- kimage_entry_t *ind_page;
- struct page *page;
-
- page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
- if (!page)
- return -ENOMEM;
-
- ind_page = page_address(page);
- *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
- image->entry = ind_page;
- image->last_entry = ind_page +
- ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
- }
- *image->entry = entry;
- image->entry++;
- *image->entry = 0;
-
- return 0;
-}
-
-static int kimage_set_destination(struct kimage *image,
- unsigned long destination)
-{
- int result;
-
- destination &= PAGE_MASK;
- result = kimage_add_entry(image, destination | IND_DESTINATION);
-
- return result;
-}
-
-
-static int kimage_add_page(struct kimage *image, unsigned long page)
-{
- int result;
-
- page &= PAGE_MASK;
- result = kimage_add_entry(image, page | IND_SOURCE);
-
- return result;
-}
-
-
-static void kimage_free_extra_pages(struct kimage *image)
-{
- /* Walk through and free any extra destination pages I may have */
- kimage_free_page_list(&image->dest_pages);
-
- /* Walk through and free any unusable pages I have cached */
- kimage_free_page_list(&image->unusable_pages);
-
-}
-static void kimage_terminate(struct kimage *image)
-{
- if (*image->entry != 0)
- image->entry++;
-
- *image->entry = IND_DONE;
-}
-
-#define for_each_kimage_entry(image, ptr, entry) \
- for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
- ptr = (entry & IND_INDIRECTION) ? \
- phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
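
The macro walks the flat list of kimage_entry_t values described in the header comment of this file: each entry is a physical address with IND_* flag bits in its low bits. An illustrative, made-up list looks like this:

/*
 * entry[0] = 0x10000000 | IND_DESTINATION   start copying to 0x10000000
 * entry[1] = 0x20000000 | IND_SOURCE        source page for 0x10000000
 * entry[2] = 0x20001000 | IND_SOURCE        source page for 0x10001000
 * entry[3] = 0x30000000 | IND_INDIRECTION   continue in the page at 0x30000000
 *   ...entries stored in that indirection page...
 * last     = IND_DONE                       end of the list
 *
 * for_each_kimage_entry() follows IND_INDIRECTION links transparently and
 * stops at IND_DONE, so callers such as kimage_free() below see one flat
 * stream of entries.
 */
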
-
-static void kimage_free_entry(kimage_entry_t entry)
-{
- struct page *page;
-
- page = pfn_to_page(entry >> PAGE_SHIFT);
- kimage_free_pages(page);
-}
-
-static void kimage_free(struct kimage *image)
-{
- kimage_entry_t *ptr, entry;
- kimage_entry_t ind = 0;
-
- if (!image)
- return;
-
- kimage_free_extra_pages(image);
- for_each_kimage_entry(image, ptr, entry) {
- if (entry & IND_INDIRECTION) {
- /* Free the previous indirection page */
- if (ind & IND_INDIRECTION)
- kimage_free_entry(ind);
- /* Save this indirection page until we are
- * done with it.
- */
- ind = entry;
- } else if (entry & IND_SOURCE)
- kimage_free_entry(entry);
- }
- /* Free the final indirection page */
- if (ind & IND_INDIRECTION)
- kimage_free_entry(ind);
-
- /* Handle any machine specific cleanup */
- machine_kexec_cleanup(image);
-
- /* Free the kexec control pages... */
- kimage_free_page_list(&image->control_pages);
-
- /*
- * Free up any temporary buffers allocated. This path might be hit if
- * an error occurred much later, after buffer allocation.
- */
- if (image->file_mode)
- kimage_file_post_load_cleanup(image);
-
- kfree(image);
-}
-
-static kimage_entry_t *kimage_dst_used(struct kimage *image,
- unsigned long page)
-{
- kimage_entry_t *ptr, entry;
- unsigned long destination = 0;
-
- for_each_kimage_entry(image, ptr, entry) {
- if (entry & IND_DESTINATION)
- destination = entry & PAGE_MASK;
- else if (entry & IND_SOURCE) {
- if (page == destination)
- return ptr;
- destination += PAGE_SIZE;
- }
- }
-
- return NULL;
-}
-
-static struct page *kimage_alloc_page(struct kimage *image,
- gfp_t gfp_mask,
- unsigned long destination)
-{
- /*
- * Here we implement safeguards to ensure that a source page
- * is not copied to its destination page while the data on
- * the destination page is still needed.
- *
- * To do this we maintain the invariant that a source page is
- * either its own destination page, or it is not a
- * destination page at all.
- *
- * That is slightly stronger than required, but the proof
- * that no problems will occur is trivial, and the
- * implementation is simple to verify.
- *
- * When allocating all pages normally this algorithm will run
- * in O(N) time, but in the worst case it will run in O(N^2)
- * time. If the runtime is a problem the data structures can
- * be fixed.
- */
- struct page *page;
- unsigned long addr;
-
- /*
- * Walk through the list of destination pages, and see if I
- * have a match.
- */
- list_for_each_entry(page, &image->dest_pages, lru) {
- addr = page_to_pfn(page) << PAGE_SHIFT;
- if (addr == destination) {
- list_del(&page->lru);
- return page;
- }
- }
- page = NULL;
- while (1) {
- kimage_entry_t *old;
-
- /* Allocate a page, if we run out of memory give up */
- page = kimage_alloc_pages(gfp_mask, 0);
- if (!page)
- return NULL;
- /* If the page cannot be used file it away */
- if (page_to_pfn(page) >
- (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
- list_add(&page->lru, &image->unusable_pages);
- continue;
- }
- addr = page_to_pfn(page) << PAGE_SHIFT;
-
- /* If it is the destination page we want, use it */
- if (addr == destination)
- break;
-
- /* If the page is not a destination page use it */
- if (!kimage_is_destination_range(image, addr,
- addr + PAGE_SIZE))
- break;
-
- /*
- * I know that the page is someone's destination page.
- * See if there is already a source page for this
- * destination page, and if so swap the source pages.
- */
- old = kimage_dst_used(image, addr);
- if (old) {
- /* If so move it */
- unsigned long old_addr;
- struct page *old_page;
-
- old_addr = *old & PAGE_MASK;
- old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
- copy_highpage(page, old_page);
- *old = addr | (*old & ~PAGE_MASK);
-
- /* The old page I have found cannot be a
- * destination page, so return it if its
- * gfp_flags honor the ones passed in.
- */
- if (!(gfp_mask & __GFP_HIGHMEM) &&
- PageHighMem(old_page)) {
- kimage_free_pages(old_page);
- continue;
- }
- addr = old_addr;
- page = old_page;
- break;
- } else {
- /* Place the page on the destination list; I
- * will use it later.
- */
- list_add(&page->lru, &image->dest_pages);
- }
- }
-
- return page;
-}
-
-static int kimage_load_normal_segment(struct kimage *image,
- struct kexec_segment *segment)
-{
- unsigned long maddr;
- size_t ubytes, mbytes;
- int result;
- unsigned char __user *buf = NULL;
- unsigned char *kbuf = NULL;
-
- result = 0;
- if (image->file_mode)
- kbuf = segment->kbuf;
- else
- buf = segment->buf;
- ubytes = segment->bufsz;
- mbytes = segment->memsz;
- maddr = segment->mem;
-
- result = kimage_set_destination(image, maddr);
- if (result < 0)
- goto out;
-
- while (mbytes) {
- struct page *page;
- char *ptr;
- size_t uchunk, mchunk;
-
- page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
- if (!page) {
- result = -ENOMEM;
- goto out;
- }
- result = kimage_add_page(image, page_to_pfn(page)
- << PAGE_SHIFT);
- if (result < 0)
- goto out;
-
- ptr = kmap(page);
- /* Start with a clear page */
- clear_page(ptr);
- ptr += maddr & ~PAGE_MASK;
- mchunk = min_t(size_t, mbytes,
- PAGE_SIZE - (maddr & ~PAGE_MASK));
- uchunk = min(ubytes, mchunk);
-
- /* For file based kexec, source pages are in kernel memory */
- if (image->file_mode)
- memcpy(ptr, kbuf, uchunk);
- else
- result = copy_from_user(ptr, buf, uchunk);
- kunmap(page);
- if (result) {
- result = -EFAULT;
- goto out;
- }
- ubytes -= uchunk;
- maddr += mchunk;
- if (image->file_mode)
- kbuf += mchunk;
- else
- buf += mchunk;
- mbytes -= mchunk;
- }
-out:
- return result;
-}
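
The uchunk/mchunk split above copies at most up to the next page boundary of maddr per iteration, and never more user data than remains in the source buffer. A worked example with illustrative numbers:

/*
 * With maddr = 0x100f00, PAGE_SIZE = 0x1000, mbytes = 0x5000 and
 * ubytes = 0x80:
 *
 *   mchunk = min(0x5000, 0x1000 - 0xf00) = 0x100   (up to the page boundary)
 *   uchunk = min(0x80, 0x100)            = 0x80    (user data still available)
 *
 * so 0x80 bytes are copied from the user buffer, the remaining 0x80 bytes
 * of that page window stay zero (the page was cleared first), and the next
 * iteration starts page aligned at maddr = 0x101000.
 */
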
-
-static int kimage_load_crash_segment(struct kimage *image,
- struct kexec_segment *segment)
-{
- /* For crash dump kernels we simply copy the data from
- * user space to its destination.
- * We do things a page at a time for the sake of kmap.
- */
- unsigned long maddr;
- size_t ubytes, mbytes;
- int result;
- unsigned char __user *buf = NULL;
- unsigned char *kbuf = NULL;
-
- result = 0;
- if (image->file_mode)
- kbuf = segment->kbuf;
- else
- buf = segment->buf;
- ubytes = segment->bufsz;
- mbytes = segment->memsz;
- maddr = segment->mem;
- while (mbytes) {
- struct page *page;
- char *ptr;
- size_t uchunk, mchunk;
-
- page = pfn_to_page(maddr >> PAGE_SHIFT);
- if (!page) {
- result = -ENOMEM;
- goto out;
- }
- ptr = kmap(page);
- ptr += maddr & ~PAGE_MASK;
- mchunk = min_t(size_t, mbytes,
- PAGE_SIZE - (maddr & ~PAGE_MASK));
- uchunk = min(ubytes, mchunk);
- if (mchunk > uchunk) {
- /* Zero the trailing part of the page */
- memset(ptr + uchunk, 0, mchunk - uchunk);
- }
-
- /* For file based kexec, source pages are in kernel memory */
- if (image->file_mode)
- memcpy(ptr, kbuf, uchunk);
- else
- result = copy_from_user(ptr, buf, uchunk);
- kexec_flush_icache_page(page);
- kunmap(page);
- if (result) {
- result = -EFAULT;
- goto out;
- }
- ubytes -= uchunk;
- maddr += mchunk;
- if (image->file_mode)
- kbuf += mchunk;
- else
- buf += mchunk;
- mbytes -= mchunk;
- }
-out:
- return result;
-}
-
-static int kimage_load_segment(struct kimage *image,
- struct kexec_segment *segment)
-{
- int result = -ENOMEM;
-
- switch (image->type) {
- case KEXEC_TYPE_DEFAULT:
- result = kimage_load_normal_segment(image, segment);
- break;
- case KEXEC_TYPE_CRASH:
- result = kimage_load_crash_segment(image, segment);
- break;
- }
-
- return result;
-}
-
/*
* Exec Kernel system call: for obvious reasons only root may call it.
*
@@ -1241,11 +121,6 @@ static int kimage_load_segment(struct kimage *image,
* kexec does not sync, or unmount filesystems so if you need
* that to happen you need to do that yourself.
*/
-struct kimage *kexec_image;
-struct kimage *kexec_crash_image;
-int kexec_load_disabled;
-
-static DEFINE_MUTEX(kexec_mutex);
SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
struct kexec_segment __user *, segments, unsigned long, flags)
@@ -1340,18 +215,6 @@ out:
return result;
}
-/*
- * Add and remove page tables for crashkernel memory
- *
- * Provide an empty default implementation here -- architecture
- * code may override this
- */
-void __weak crash_map_reserved_pages(void)
-{}
-
-void __weak crash_unmap_reserved_pages(void)
-{}
-
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
compat_ulong_t, nr_segments,
@@ -1390,1391 +253,3 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
return sys_kexec_load(entry, nr_segments, ksegments, flags);
}
#endif
-
-#ifdef CONFIG_KEXEC_FILE
-SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
- unsigned long, cmdline_len, const char __user *, cmdline_ptr,
- unsigned long, flags)
-{
- int ret = 0, i;
- struct kimage **dest_image, *image;
-
- /* We only trust the superuser with rebooting the system. */
- if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
- return -EPERM;
-
- /* Make sure we have a legal set of flags */
- if (flags != (flags & KEXEC_FILE_FLAGS))
- return -EINVAL;
-
- image = NULL;
-
- if (!mutex_trylock(&kexec_mutex))
- return -EBUSY;
-
- dest_image = &kexec_image;
- if (flags & KEXEC_FILE_ON_CRASH)
- dest_image = &kexec_crash_image;
-
- if (flags & KEXEC_FILE_UNLOAD)
- goto exchange;
-
- /*
- * In the crash case, the new kernel gets loaded into the reserved region.
- * This is the same memory where the old crash kernel might be loaded. Free any
- * current crash dump kernel before we corrupt it.
- */
- if (flags & KEXEC_FILE_ON_CRASH)
- kimage_free(xchg(&kexec_crash_image, NULL));
-
- ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
- cmdline_len, flags);
- if (ret)
- goto out;
-
- ret = machine_kexec_prepare(image);
- if (ret)
- goto out;
-
- ret = kexec_calculate_store_digests(image);
- if (ret)
- goto out;
-
- for (i = 0; i < image->nr_segments; i++) {
- struct kexec_segment *ksegment;
-
- ksegment = &image->segment[i];
- pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
- i, ksegment->buf, ksegment->bufsz, ksegment->mem,
- ksegment->memsz);
-
- ret = kimage_load_segment(image, &image->segment[i]);
- if (ret)
- goto out;
- }
-
- kimage_terminate(image);
-
- /*
- * Free up any temporary buffers allocated which are not needed
- * after the image has been loaded
- */
- kimage_file_post_load_cleanup(image);
-exchange:
- image = xchg(dest_image, image);
-out:
- mutex_unlock(&kexec_mutex);
- kimage_free(image);
- return ret;
-}
-
-#endif /* CONFIG_KEXEC_FILE */
-
-void crash_kexec(struct pt_regs *regs)
-{
- /* Take the kexec_mutex here to prevent sys_kexec_load
- * running on one cpu from replacing the crash kernel
- * we are using after a panic on a different cpu.
- *
- * If the crash kernel was not located in a fixed area
- * of memory the xchg(&kexec_crash_image) would be
- * sufficient. But since I reuse the memory...
- */
- if (mutex_trylock(&kexec_mutex)) {
- if (kexec_crash_image) {
- struct pt_regs fixed_regs;
-
- crash_setup_regs(&fixed_regs, regs);
- crash_save_vmcoreinfo();
- machine_crash_shutdown(&fixed_regs);
- machine_kexec(kexec_crash_image);
- }
- mutex_unlock(&kexec_mutex);
- }
-}
-
-size_t crash_get_memory_size(void)
-{
- size_t size = 0;
- mutex_lock(&kexec_mutex);
- if (crashk_res.end != crashk_res.start)
- size = resource_size(&crashk_res);
- mutex_unlock(&kexec_mutex);
- return size;
-}
-
-void __weak crash_free_reserved_phys_range(unsigned long begin,
- unsigned long end)
-{
- unsigned long addr;
-
- for (addr = begin; addr < end; addr += PAGE_SIZE)
- free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
-}
-
-int crash_shrink_memory(unsigned long new_size)
-{
- int ret = 0;
- unsigned long start, end;
- unsigned long old_size;
- struct resource *ram_res;
-
- mutex_lock(&kexec_mutex);
-
- if (kexec_crash_image) {
- ret = -ENOENT;
- goto unlock;
- }
- start = crashk_res.start;
- end = crashk_res.end;
- old_size = (end == 0) ? 0 : end - start + 1;
- if (new_size >= old_size) {
- ret = (new_size == old_size) ? 0 : -EINVAL;
- goto unlock;
- }
-
- ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
- if (!ram_res) {
- ret = -ENOMEM;
- goto unlock;
- }
-
- start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
- end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
-
- crash_map_reserved_pages();
- crash_free_reserved_phys_range(end, crashk_res.end);
-
- if ((start == end) && (crashk_res.parent != NULL))
- release_resource(&crashk_res);
-
- ram_res->start = end;
- ram_res->end = crashk_res.end;
- ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
- ram_res->name = "System RAM";
-
- crashk_res.end = end - 1;
-
- insert_resource(&iomem_resource, ram_res);
- crash_unmap_reserved_pages();
-
-unlock:
- mutex_unlock(&kexec_mutex);
- return ret;
-}
-
-static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
- size_t data_len)
-{
- struct elf_note note;
-
- note.n_namesz = strlen(name) + 1;
- note.n_descsz = data_len;
- note.n_type = type;
- memcpy(buf, &note, sizeof(note));
- buf += (sizeof(note) + 3)/4;
- memcpy(buf, name, note.n_namesz);
- buf += (note.n_namesz + 3)/4;
- memcpy(buf, data, note.n_descsz);
- buf += (note.n_descsz + 3)/4;
-
- return buf;
-}
-
-static void final_note(u32 *buf)
-{
- struct elf_note note;
-
- note.n_namesz = 0;
- note.n_descsz = 0;
- note.n_type = 0;
- memcpy(buf, &note, sizeof(note));
-}
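
append_elf_note() packs a standard ELF note: the fixed header, then the name, then the descriptor, advancing the buffer in 4-byte units ((len + 3)/4 words), and final_note() terminates the sequence with an all-zero header. Schematically, with illustrative sizes:

/*
 *   +--------------------+  n_namesz = strlen(name) + 1
 *   | struct elf_note    |  n_descsz = data_len
 *   +--------------------+  n_type   = e.g. NT_PRSTATUS
 *   | name ("CORE\0")    |  padded to a multiple of 4 bytes
 *   +--------------------+
 *   | descriptor (data)  |  padded to a multiple of 4 bytes
 *   +--------------------+
 *
 * An all-zero header written by final_note() is what note readers treat
 * as the end-of-notes marker.
 */
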
-
-void crash_save_cpu(struct pt_regs *regs, int cpu)
-{
- struct elf_prstatus prstatus;
- u32 *buf;
-
- if ((cpu < 0) || (cpu >= nr_cpu_ids))
- return;
-
- /* Using ELF notes here is opportunistic.
- * I need a well defined structure format
- * for the data I pass, and I need tags
- * on the data to indicate what information I have
- * squirrelled away. ELF notes happen to provide
- * all of that, so there is no need to invent something new.
- */
- buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
- if (!buf)
- return;
- memset(&prstatus, 0, sizeof(prstatus));
- prstatus.pr_pid = current->pid;
- elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
- buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
- &prstatus, sizeof(prstatus));
- final_note(buf);
-}
-
-static int __init crash_notes_memory_init(void)
-{
- /* Allocate memory for saving cpu registers. */
- crash_notes = alloc_percpu(note_buf_t);
- if (!crash_notes) {
- pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
- return -ENOMEM;
- }
- return 0;
-}
-subsys_initcall(crash_notes_memory_init);
-
-
-/*
 - * Parsing of the "crashkernel" command line.
 - *
 - * This code is intended to be called from architecture-specific code.
- */
-
-
-/*
- * This function parses command lines in the format
- *
- * crashkernel=ramsize-range:size[,...][@offset]
- *
- * The function returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_mem(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- char *cur = cmdline, *tmp;
-
- /* for each entry of the comma-separated list */
- do {
- unsigned long long start, end = ULLONG_MAX, size;
-
- /* get the start of the range */
- start = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("crashkernel: Memory value expected\n");
- return -EINVAL;
- }
- cur = tmp;
- if (*cur != '-') {
- pr_warn("crashkernel: '-' expected\n");
- return -EINVAL;
- }
- cur++;
-
- /* if no ':' is here, then we read the end */
- if (*cur != ':') {
- end = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("crashkernel: Memory value expected\n");
- return -EINVAL;
- }
- cur = tmp;
- if (end <= start) {
- pr_warn("crashkernel: end <= start\n");
- return -EINVAL;
- }
- }
-
- if (*cur != ':') {
- pr_warn("crashkernel: ':' expected\n");
- return -EINVAL;
- }
- cur++;
-
- size = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("Memory value expected\n");
- return -EINVAL;
- }
- cur = tmp;
- if (size >= system_ram) {
- pr_warn("crashkernel: invalid size\n");
- return -EINVAL;
- }
-
- /* match ? */
- if (system_ram >= start && system_ram < end) {
- *crash_size = size;
- break;
- }
- } while (*cur++ == ',');
-
- if (*crash_size > 0) {
- while (*cur && *cur != ' ' && *cur != '@')
- cur++;
- if (*cur == '@') {
- cur++;
- *crash_base = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("Memory value expected after '@'\n");
- return -EINVAL;
- }
- }
- }
-
- return 0;
-}
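
A concrete, illustrative extended-syntax command line and how the loop above resolves it:

/*
 *   crashkernel=512M-2G:64M,2G-:128M@16M
 *
 * With 1G of system RAM the 512M-2G entry matches, so 64M is reserved;
 * with 8G of RAM the open-ended 2G- entry matches and 128M is reserved.
 * The trailing @16M, parsed after the list, requests a fixed base address.
 * The simple form (crashkernel=128M@16M) and the suffix form
 * (crashkernel=256M,high) are handled by parse_crashkernel_simple() and
 * parse_crashkernel_suffix() below.
 */
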
-
-/*
- * This function parses "simple" (old) crashkernel command lines like
- *
- * crashkernel=size[@offset]
- *
- * It returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_simple(char *cmdline,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- char *cur = cmdline;
-
- *crash_size = memparse(cmdline, &cur);
- if (cmdline == cur) {
- pr_warn("crashkernel: memory value expected\n");
- return -EINVAL;
- }
-
- if (*cur == '@')
- *crash_base = memparse(cur+1, &cur);
- else if (*cur != ' ' && *cur != '\0') {
- pr_warn("crashkernel: unrecognized char\n");
- return -EINVAL;
- }
-
- return 0;
-}
-
-#define SUFFIX_HIGH 0
-#define SUFFIX_LOW 1
-#define SUFFIX_NULL 2
-static __initdata char *suffix_tbl[] = {
- [SUFFIX_HIGH] = ",high",
- [SUFFIX_LOW] = ",low",
- [SUFFIX_NULL] = NULL,
-};
-
-/*
- * This function parses "suffix" crashkernel command lines like
- *
- * crashkernel=size,[high|low]
- *
- * It returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_suffix(char *cmdline,
- unsigned long long *crash_size,
- const char *suffix)
-{
- char *cur = cmdline;
-
- *crash_size = memparse(cmdline, &cur);
- if (cmdline == cur) {
- pr_warn("crashkernel: memory value expected\n");
- return -EINVAL;
- }
-
- /* check with suffix */
- if (strncmp(cur, suffix, strlen(suffix))) {
- pr_warn("crashkernel: unrecognized char\n");
- return -EINVAL;
- }
- cur += strlen(suffix);
- if (*cur != ' ' && *cur != '\0') {
- pr_warn("crashkernel: unrecognized char\n");
- return -EINVAL;
- }
-
- return 0;
-}
-
-static __init char *get_last_crashkernel(char *cmdline,
- const char *name,
- const char *suffix)
-{
- char *p = cmdline, *ck_cmdline = NULL;
-
- /* find crashkernel and use the last one if there are more */
- p = strstr(p, name);
- while (p) {
- char *end_p = strchr(p, ' ');
- char *q;
-
- if (!end_p)
- end_p = p + strlen(p);
-
- if (!suffix) {
- int i;
-
- /* skip the one with any known suffix */
- for (i = 0; suffix_tbl[i]; i++) {
- q = end_p - strlen(suffix_tbl[i]);
- if (!strncmp(q, suffix_tbl[i],
- strlen(suffix_tbl[i])))
- goto next;
- }
- ck_cmdline = p;
- } else {
- q = end_p - strlen(suffix);
- if (!strncmp(q, suffix, strlen(suffix)))
- ck_cmdline = p;
- }
-next:
- p = strstr(p+1, name);
- }
-
- if (!ck_cmdline)
- return NULL;
-
- return ck_cmdline;
-}
-
-static int __init __parse_crashkernel(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base,
- const char *name,
- const char *suffix)
-{
- char *first_colon, *first_space;
- char *ck_cmdline;
-
- BUG_ON(!crash_size || !crash_base);
- *crash_size = 0;
- *crash_base = 0;
-
- ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
-
- if (!ck_cmdline)
- return -EINVAL;
-
- ck_cmdline += strlen(name);
-
- if (suffix)
- return parse_crashkernel_suffix(ck_cmdline, crash_size,
- suffix);
- /*
- * if the command line contains a ':', then that's the extended
- * syntax -- if not, it must be the classic syntax
- */
- first_colon = strchr(ck_cmdline, ':');
- first_space = strchr(ck_cmdline, ' ');
- if (first_colon && (!first_space || first_colon < first_space))
- return parse_crashkernel_mem(ck_cmdline, system_ram,
- crash_size, crash_base);
-
- return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
-}
-
-/*
- * This function is the entry point for command line parsing and should be
- * called from the arch-specific code.
- */
-int __init parse_crashkernel(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=", NULL);
-}
-
-int __init parse_crashkernel_high(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
-}
-
-int __init parse_crashkernel_low(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=", suffix_tbl[SUFFIX_LOW]);
-}
-
-static void update_vmcoreinfo_note(void)
-{
- u32 *buf = vmcoreinfo_note;
-
- if (!vmcoreinfo_size)
- return;
- buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
- vmcoreinfo_size);
- final_note(buf);
-}
-
-void crash_save_vmcoreinfo(void)
-{
- vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
- update_vmcoreinfo_note();
-}
-
-void vmcoreinfo_append_str(const char *fmt, ...)
-{
- va_list args;
- char buf[0x50];
- size_t r;
-
- va_start(args, fmt);
- r = vscnprintf(buf, sizeof(buf), fmt, args);
- va_end(args);
-
- r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
-
- memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
-
- vmcoreinfo_size += r;
-}
-
-/*
- * provide an empty default implementation here -- architecture
- * code may override this
- */
-void __weak arch_crash_save_vmcoreinfo(void)
-{}
-
-unsigned long __weak paddr_vmcoreinfo_note(void)
-{
- return __pa((unsigned long)(char *)&vmcoreinfo_note);
-}
-
-static int __init crash_save_vmcoreinfo_init(void)
-{
- VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
- VMCOREINFO_PAGESIZE(PAGE_SIZE);
-
- VMCOREINFO_SYMBOL(init_uts_ns);
- VMCOREINFO_SYMBOL(node_online_map);
-#ifdef CONFIG_MMU
- VMCOREINFO_SYMBOL(swapper_pg_dir);
-#endif
- VMCOREINFO_SYMBOL(_stext);
- VMCOREINFO_SYMBOL(vmap_area_list);
-
-#ifndef CONFIG_NEED_MULTIPLE_NODES
- VMCOREINFO_SYMBOL(mem_map);
- VMCOREINFO_SYMBOL(contig_page_data);
-#endif
-#ifdef CONFIG_SPARSEMEM
- VMCOREINFO_SYMBOL(mem_section);
- VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
- VMCOREINFO_STRUCT_SIZE(mem_section);
- VMCOREINFO_OFFSET(mem_section, section_mem_map);
-#endif
- VMCOREINFO_STRUCT_SIZE(page);
- VMCOREINFO_STRUCT_SIZE(pglist_data);
- VMCOREINFO_STRUCT_SIZE(zone);
- VMCOREINFO_STRUCT_SIZE(free_area);
- VMCOREINFO_STRUCT_SIZE(list_head);
- VMCOREINFO_SIZE(nodemask_t);
- VMCOREINFO_OFFSET(page, flags);
- VMCOREINFO_OFFSET(page, _count);
- VMCOREINFO_OFFSET(page, mapping);
- VMCOREINFO_OFFSET(page, lru);
- VMCOREINFO_OFFSET(page, _mapcount);
- VMCOREINFO_OFFSET(page, private);
- VMCOREINFO_OFFSET(pglist_data, node_zones);
- VMCOREINFO_OFFSET(pglist_data, nr_zones);
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
- VMCOREINFO_OFFSET(pglist_data, node_mem_map);
-#endif
- VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
- VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
- VMCOREINFO_OFFSET(pglist_data, node_id);
- VMCOREINFO_OFFSET(zone, free_area);
- VMCOREINFO_OFFSET(zone, vm_stat);
- VMCOREINFO_OFFSET(zone, spanned_pages);
- VMCOREINFO_OFFSET(free_area, free_list);
- VMCOREINFO_OFFSET(list_head, next);
- VMCOREINFO_OFFSET(list_head, prev);
- VMCOREINFO_OFFSET(vmap_area, va_start);
- VMCOREINFO_OFFSET(vmap_area, list);
- VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
- log_buf_kexec_setup();
- VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
- VMCOREINFO_NUMBER(NR_FREE_PAGES);
- VMCOREINFO_NUMBER(PG_lru);
- VMCOREINFO_NUMBER(PG_private);
- VMCOREINFO_NUMBER(PG_swapcache);
- VMCOREINFO_NUMBER(PG_slab);
-#ifdef CONFIG_MEMORY_FAILURE
- VMCOREINFO_NUMBER(PG_hwpoison);
-#endif
- VMCOREINFO_NUMBER(PG_head_mask);
- VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
-#ifdef CONFIG_HUGETLBFS
- VMCOREINFO_SYMBOL(free_huge_page);
-#endif
-
- arch_crash_save_vmcoreinfo();
- update_vmcoreinfo_note();
-
- return 0;
-}
-
-subsys_initcall(crash_save_vmcoreinfo_init);
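
The VMCOREINFO note built above is a plain-text key=value blob wrapped in an ELF note; dump tools such as makedumpfile read it from /proc/vmcore to interpret kernel data structures. An illustrative excerpt of what vmcoreinfo_data ends up containing (addresses and values made up; CRASHTIME is only appended by crash_save_vmcoreinfo() at crash time):

/*
 * OSRELEASE=4.2.0-rc7
 * PAGESIZE=4096
 * SYMBOL(init_uts_ns)=ffffffff81a44560
 * OFFSET(page.flags)=0
 * LENGTH(zone.free_area)=11
 * NUMBER(PG_lru)=5
 * CRASHTIME=1440403913
 */
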
-
-#ifdef CONFIG_KEXEC_FILE
-static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
- struct kexec_buf *kbuf)
-{
- struct kimage *image = kbuf->image;
- unsigned long temp_start, temp_end;
-
- temp_end = min(end, kbuf->buf_max);
- temp_start = temp_end - kbuf->memsz;
-
- do {
- /* align down start */
- temp_start = temp_start & (~(kbuf->buf_align - 1));
-
- if (temp_start < start || temp_start < kbuf->buf_min)
- return 0;
-
- temp_end = temp_start + kbuf->memsz - 1;
-
- /*
- * Make sure this does not conflict with any of the existing
- * segments
- */
- if (kimage_is_destination_range(image, temp_start, temp_end)) {
- temp_start = temp_start - PAGE_SIZE;
- continue;
- }
-
- /* We found a suitable memory range */
- break;
- } while (1);
-
- /* If we are here, we found a suitable memory range */
- kbuf->mem = temp_start;
-
- /* Success, stop navigating through remaining System RAM ranges */
- return 1;
-}
-
-static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
- struct kexec_buf *kbuf)
-{
- struct kimage *image = kbuf->image;
- unsigned long temp_start, temp_end;
-
- temp_start = max(start, kbuf->buf_min);
-
- do {
- temp_start = ALIGN(temp_start, kbuf->buf_align);
- temp_end = temp_start + kbuf->memsz - 1;
-
- if (temp_end > end || temp_end > kbuf->buf_max)
- return 0;
- /*
- * Make sure this does not conflict with any of the existing
- * segments
- */
- if (kimage_is_destination_range(image, temp_start, temp_end)) {
- temp_start = temp_start + PAGE_SIZE;
- continue;
- }
-
- /* We found a suitable memory range */
- break;
- } while (1);
-
- /* If we are here, we found a suitable memory range */
- kbuf->mem = temp_start;
-
- /* Success, stop navigating through remaining System RAM ranges */
- return 1;
-}
-
-static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
-{
- struct kexec_buf *kbuf = (struct kexec_buf *)arg;
- unsigned long sz = end - start + 1;
-
- /* Returning 0 will move us on to the next memory range */
- if (sz < kbuf->memsz)
- return 0;
-
- if (end < kbuf->buf_min || start > kbuf->buf_max)
- return 0;
-
- /*
- * Allocate memory top down within the RAM range if requested;
- * otherwise allocate bottom up.
- */
- if (kbuf->top_down)
- return locate_mem_hole_top_down(start, end, kbuf);
- return locate_mem_hole_bottom_up(start, end, kbuf);
-}
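
The callback contract with walk_system_ram_res()/walk_iomem_res() is that returning 0 moves on to the next resource range while a nonzero return stops the walk; kexec_add_buffer() below treats a return of 1 as "hole found". A stripped-down sketch of such a callback (it ignores buf_min/buf_max and the destination-range check the real code performs):

static int find_first_fit(u64 start, u64 end, void *arg)
{
	struct kexec_buf *kbuf = arg;
	u64 base = ALIGN(start, kbuf->buf_align);

	/* Too small once aligned?  Keep walking the ranges. */
	if (base + kbuf->memsz - 1 > end)
		return 0;

	kbuf->mem = base;
	return 1;	/* stop the walk, a range was found */
}
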
-
-/*
- * Helper function for placing a buffer in a kexec segment. This assumes
- * that kexec_mutex is held.
- */
-int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
- unsigned long memsz, unsigned long buf_align,
- unsigned long buf_min, unsigned long buf_max,
- bool top_down, unsigned long *load_addr)
-{
-
- struct kexec_segment *ksegment;
- struct kexec_buf buf, *kbuf;
- int ret;
-
- /* Currently, adding a segment this way is allowed only in file mode */
- if (!image->file_mode)
- return -EINVAL;
-
- if (image->nr_segments >= KEXEC_SEGMENT_MAX)
- return -EINVAL;
-
- /*
- * Make sure we are not trying to add a buffer after allocating
- * control pages. All segments need to be placed before
- * any control pages are allocated, as the control page allocation
- * logic goes through the list of segments to make sure there are
- * no destination overlaps.
- */
- if (!list_empty(&image->control_pages)) {
- WARN_ON(1);
- return -EINVAL;
- }
-
- memset(&buf, 0, sizeof(struct kexec_buf));
- kbuf = &buf;
- kbuf->image = image;
- kbuf->buffer = buffer;
- kbuf->bufsz = bufsz;
-
- kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
- kbuf->buf_align = max(buf_align, PAGE_SIZE);
- kbuf->buf_min = buf_min;
- kbuf->buf_max = buf_max;
- kbuf->top_down = top_down;
-
- /* Walk the RAM ranges and allocate a suitable range for the buffer */
- if (image->type == KEXEC_TYPE_CRASH)
- ret = walk_iomem_res("Crash kernel",
- IORESOURCE_MEM | IORESOURCE_BUSY,
- crashk_res.start, crashk_res.end, kbuf,
- locate_mem_hole_callback);
- else
- ret = walk_system_ram_res(0, -1, kbuf,
- locate_mem_hole_callback);
- if (ret != 1) {
- /* A suitable memory range could not be found for buffer */
- return -EADDRNOTAVAIL;
- }
-
- /* Found a suitable memory range */
- ksegment = &image->segment[image->nr_segments];
- ksegment->kbuf = kbuf->buffer;
- ksegment->bufsz = kbuf->bufsz;
- ksegment->mem = kbuf->mem;
- ksegment->memsz = kbuf->memsz;
- image->nr_segments++;
- *load_addr = ksegment->mem;
- return 0;
-}
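
A hypothetical caller sketch, roughly how an architecture's file-mode loader might place the initrd it read into image->initrd_buf. The variable names and the unconstrained 0..ULONG_MAX window are illustrative only, not taken from any in-tree loader:

unsigned long initrd_load_addr;
int ret;

ret = kexec_add_buffer(image, image->initrd_buf, image->initrd_buf_len,
		       image->initrd_buf_len, PAGE_SIZE,
		       0, ULONG_MAX, /* top_down */ false,
		       &initrd_load_addr);
if (ret)
	return ret;
/* initrd_load_addr now holds the start of the segment chosen above */
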
-
-/* Calculate and store the digest of segments */
-static int kexec_calculate_store_digests(struct kimage *image)
-{
- struct crypto_shash *tfm;
- struct shash_desc *desc;
- int ret = 0, i, j, zero_buf_sz, sha_region_sz;
- size_t desc_size, nullsz;
- char *digest;
- void *zero_buf;
- struct kexec_sha_region *sha_regions;
- struct purgatory_info *pi = &image->purgatory_info;
-
- zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
- zero_buf_sz = PAGE_SIZE;
-
- tfm = crypto_alloc_shash("sha256", 0, 0);
- if (IS_ERR(tfm)) {
- ret = PTR_ERR(tfm);
- goto out;
- }
-
- desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
- desc = kzalloc(desc_size, GFP_KERNEL);
- if (!desc) {
- ret = -ENOMEM;
- goto out_free_tfm;
- }
-
- sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
- sha_regions = vzalloc(sha_region_sz);
- if (!sha_regions)
- goto out_free_desc;
-
- desc->tfm = tfm;
- desc->flags = 0;
-
- ret = crypto_shash_init(desc);
- if (ret < 0)
- goto out_free_sha_regions;
-
- digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
- if (!digest) {
- ret = -ENOMEM;
- goto out_free_sha_regions;
- }
-
- for (j = i = 0; i < image->nr_segments; i++) {
- struct kexec_segment *ksegment;
-
- ksegment = &image->segment[i];
- /*
- * Skip purgatory as it will be modified once we put digest
- * info in purgatory.
- */
- if (ksegment->kbuf == pi->purgatory_buf)
- continue;
-
- ret = crypto_shash_update(desc, ksegment->kbuf,
- ksegment->bufsz);
- if (ret)
- break;
-
- /*
- * Assume the rest of the buffer is filled with zeroes and
- * update the digest accordingly.
- */
- nullsz = ksegment->memsz - ksegment->bufsz;
- while (nullsz) {
- unsigned long bytes = nullsz;
-
- if (bytes > zero_buf_sz)
- bytes = zero_buf_sz;
- ret = crypto_shash_update(desc, zero_buf, bytes);
- if (ret)
- break;
- nullsz -= bytes;
- }
-
- if (ret)
- break;
-
- sha_regions[j].start = ksegment->mem;
- sha_regions[j].len = ksegment->memsz;
- j++;
- }
-
- if (!ret) {
- ret = crypto_shash_final(desc, digest);
- if (ret)
- goto out_free_digest;
- ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
- sha_regions, sha_region_sz, 0);
- if (ret)
- goto out_free_digest;
-
- ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
- digest, SHA256_DIGEST_SIZE, 0);
- if (ret)
- goto out_free_digest;
- }
-
-out_free_digest:
- kfree(digest);
-out_free_sha_regions:
- vfree(sha_regions);
-out_free_desc:
- kfree(desc);
-out_free_tfm:
- kfree(tfm);
-out:
- return ret;
-}
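
For context, the two symbols set above ("sha_regions" and "sha256_digest") are consumed inside purgatory, which re-hashes the loaded segments and refuses to continue on a mismatch. A rough sketch of that consumer, assuming purgatory-side sha256_init/sha256_update/sha256_final helpers and the exported arrays; the real code lives in the architecture's purgatory, not in this file:

static int verify_sha256_digest(void)
{
	struct kexec_sha_region *ptr, *end;
	struct sha256_state ss;
	u8 digest[SHA256_DIGEST_SIZE];

	sha256_init(&ss);
	end = sha_regions + ARRAY_SIZE(sha_regions);
	for (ptr = sha_regions; ptr < end; ptr++)
		sha256_update(&ss, (const u8 *)ptr->start, ptr->len);
	sha256_final(&ss, digest);

	/* Nonzero means the loaded image was modified after hashing. */
	return memcmp(digest, sha256_digest, sizeof(digest));
}
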
-
-/* Actually load purgatory. Lot of code taken from kexec-tools */
-static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
- unsigned long max, int top_down)
-{
- struct purgatory_info *pi = &image->purgatory_info;
- unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
- unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
- unsigned char *buf_addr, *src;
- int i, ret = 0, entry_sidx = -1;
- const Elf_Shdr *sechdrs_c;
- Elf_Shdr *sechdrs = NULL;
- void *purgatory_buf = NULL;
-
- /*
- * sechdrs_c points to the section headers in purgatory and is read
- * only. No modifications allowed.
- */
- sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
-
- /*
- * We cannot modify sechdrs_c[] and its fields. It is read only.
- * Copy it over to a local copy where one can store some temporary
- * data and free it at the end. We need to modify ->sh_addr and
- * ->sh_offset fields to keep track of permanent and temporary
- * locations of sections.
- */
- sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
- if (!sechdrs)
- return -ENOMEM;
-
- memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
-
- /*
- * We seem to have multiple copies of sections. The first copy is the one
- * embedded in the kernel in a read-only section. Some of these sections
- * will be copied to a temporary buffer and relocated. And these
- * sections will finally be copied to their final destination at
- * segment load time.
- *
- * Use ->sh_offset to reflect section address in memory. It will
- * point to original read only copy if section is not allocatable.
- * Otherwise it will point to temporary copy which will be relocated.
- *
- * Use ->sh_addr to contain final address of the section where it
- * will go during execution time.
- */
- for (i = 0; i < pi->ehdr->e_shnum; i++) {
- if (sechdrs[i].sh_type == SHT_NOBITS)
- continue;
-
- sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
- sechdrs[i].sh_offset;
- }
-
- /*
- * Identify entry point section and make entry relative to section
- * start.
- */
- entry = pi->ehdr->e_entry;
- for (i = 0; i < pi->ehdr->e_shnum; i++) {
- if (!(sechdrs[i].sh_flags & SHF_ALLOC))
- continue;
-
- if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
- continue;
-
- /* Make entry section relative */
- if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
- ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
- pi->ehdr->e_entry)) {
- entry_sidx = i;
- entry -= sechdrs[i].sh_addr;
- break;
- }
- }
-
- /* Determine how much memory is needed to load relocatable object. */
- buf_align = 1;
- bss_align = 1;
- buf_sz = 0;
- bss_sz = 0;
-
- for (i = 0; i < pi->ehdr->e_shnum; i++) {
- if (!(sechdrs[i].sh_flags & SHF_ALLOC))
- continue;
-
- align = sechdrs[i].sh_addralign;
- if (sechdrs[i].sh_type != SHT_NOBITS) {
- if (buf_align < align)
- buf_align = align;
- buf_sz = ALIGN(buf_sz, align);
- buf_sz += sechdrs[i].sh_size;
- } else {
- /* bss section */
- if (bss_align < align)
- bss_align = align;
- bss_sz = ALIGN(bss_sz, align);
- bss_sz += sechdrs[i].sh_size;
- }
- }
-
- /* Determine the bss padding required to align bss properly */
- bss_pad = 0;
- if (buf_sz & (bss_align - 1))
- bss_pad = bss_align - (buf_sz & (bss_align - 1));
-
- memsz = buf_sz + bss_pad + bss_sz;
-
- /* Allocate buffer for purgatory */
- purgatory_buf = vzalloc(buf_sz);
- if (!purgatory_buf) {
- ret = -ENOMEM;
- goto out;
- }
-
- if (buf_align < bss_align)
- buf_align = bss_align;
-
- /* Add buffer to segment list */
- ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
- buf_align, min, max, top_down,
- &pi->purgatory_load_addr);
- if (ret)
- goto out;
-
- /* Load SHF_ALLOC sections */
- buf_addr = purgatory_buf;
- load_addr = curr_load_addr = pi->purgatory_load_addr;
- bss_addr = load_addr + buf_sz + bss_pad;
-
- for (i = 0; i < pi->ehdr->e_shnum; i++) {
- if (!(sechdrs[i].sh_flags & SHF_ALLOC))
- continue;
-
- align = sechdrs[i].sh_addralign;
- if (sechdrs[i].sh_type != SHT_NOBITS) {
- curr_load_addr = ALIGN(curr_load_addr, align);
- offset = curr_load_addr - load_addr;
- /* We already modified ->sh_offset to keep the src addr */
- src = (char *) sechdrs[i].sh_offset;
- memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
-
- /* Store load address and source address of section */
- sechdrs[i].sh_addr = curr_load_addr;
-
- /*
- * This section got copied to temporary buffer. Update
- * ->sh_offset accordingly.
- */
- sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
-
- /* Advance to the next address */
- curr_load_addr += sechdrs[i].sh_size;
- } else {
- bss_addr = ALIGN(bss_addr, align);
- sechdrs[i].sh_addr = bss_addr;
- bss_addr += sechdrs[i].sh_size;
- }
- }
-
- /* Update entry point based on load address of text section */
- if (entry_sidx >= 0)
- entry += sechdrs[entry_sidx].sh_addr;
-
- /* Make kernel jump to purgatory after shutdown */
- image->start = entry;
-
- /* Used later to get/set symbol values */
- pi->sechdrs = sechdrs;
-
- /*
- * Used later to identify which section is purgatory so it can be
- * skipped during checksumming.
- */
- pi->purgatory_buf = purgatory_buf;
- return ret;
-out:
- vfree(sechdrs);
- vfree(purgatory_buf);
- return ret;
-}
-
-static int kexec_apply_relocations(struct kimage *image)
-{
- int i, ret;
- struct purgatory_info *pi = &image->purgatory_info;
- Elf_Shdr *sechdrs = pi->sechdrs;
-
- /* Apply relocations */
- for (i = 0; i < pi->ehdr->e_shnum; i++) {
- Elf_Shdr *section, *symtab;
-
- if (sechdrs[i].sh_type != SHT_RELA &&
- sechdrs[i].sh_type != SHT_REL)
- continue;
-
- /*
- * For a section of type SHT_RELA/SHT_REL,
- * ->sh_link contains the section header index of the associated
- * symbol table, and ->sh_info contains the section header
- * index of the section to which the relocations apply.
- */
- if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
- sechdrs[i].sh_link >= pi->ehdr->e_shnum)
- return -ENOEXEC;
-
- section = &sechdrs[sechdrs[i].sh_info];
- symtab = &sechdrs[sechdrs[i].sh_link];
-
- if (!(section->sh_flags & SHF_ALLOC))
- continue;
-
- /*
- * symtab->sh_link contains the section header index of the associated
- * string table.
- */
- if (symtab->sh_link >= pi->ehdr->e_shnum)
- /* Invalid section number? */
- continue;
-
- /*
- * The respective architecture needs to provide support for applying
- * relocations of type SHT_RELA/SHT_REL.
- */
- if (sechdrs[i].sh_type == SHT_RELA)
- ret = arch_kexec_apply_relocations_add(pi->ehdr,
- sechdrs, i);
- else if (sechdrs[i].sh_type == SHT_REL)
- ret = arch_kexec_apply_relocations(pi->ehdr,
- sechdrs, i);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-/* Load relocatable purgatory object and relocate it appropriately */
-int kexec_load_purgatory(struct kimage *image, unsigned long min,
- unsigned long max, int top_down,
- unsigned long *load_addr)
-{
- struct purgatory_info *pi = &image->purgatory_info;
- int ret;
-
- if (kexec_purgatory_size <= 0)
- return -EINVAL;
-
- if (kexec_purgatory_size < sizeof(Elf_Ehdr))
- return -ENOEXEC;
-
- pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
-
- if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
- || pi->ehdr->e_type != ET_REL
- || !elf_check_arch(pi->ehdr)
- || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
- return -ENOEXEC;
-
- if (pi->ehdr->e_shoff >= kexec_purgatory_size
- || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
- kexec_purgatory_size - pi->ehdr->e_shoff))
- return -ENOEXEC;
-
- ret = __kexec_load_purgatory(image, min, max, top_down);
- if (ret)
- return ret;
-
- ret = kexec_apply_relocations(image);
- if (ret)
- goto out;
-
- *load_addr = pi->purgatory_load_addr;
- return 0;
-out:
- vfree(pi->sechdrs);
- vfree(pi->purgatory_buf);
- return ret;
-}
-
-static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
- const char *name)
-{
- Elf_Sym *syms;
- Elf_Shdr *sechdrs;
- Elf_Ehdr *ehdr;
- int i, k;
- const char *strtab;
-
- if (!pi->sechdrs || !pi->ehdr)
- return NULL;
-
- sechdrs = pi->sechdrs;
- ehdr = pi->ehdr;
-
- for (i = 0; i < ehdr->e_shnum; i++) {
- if (sechdrs[i].sh_type != SHT_SYMTAB)
- continue;
-
- if (sechdrs[i].sh_link >= ehdr->e_shnum)
- /* Invalid strtab section number */
- continue;
- strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
- syms = (Elf_Sym *)sechdrs[i].sh_offset;
-
- /* Go through symbols for a match */
- for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
- if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
- continue;
-
- if (strcmp(strtab + syms[k].st_name, name) != 0)
- continue;
-
- if (syms[k].st_shndx == SHN_UNDEF ||
- syms[k].st_shndx >= ehdr->e_shnum) {
- pr_debug("Symbol: %s has bad section index %d.\n",
- name, syms[k].st_shndx);
- return NULL;
- }
-
- /* Found the symbol we are looking for */
- return &syms[k];
- }
- }
-
- return NULL;
-}
-
-void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
-{
- struct purgatory_info *pi = &image->purgatory_info;
- Elf_Sym *sym;
- Elf_Shdr *sechdr;
-
- sym = kexec_purgatory_find_symbol(pi, name);
- if (!sym)
- return ERR_PTR(-EINVAL);
-
- sechdr = &pi->sechdrs[sym->st_shndx];
-
- /*
- * Returns the address where symbol will finally be loaded after
- * kexec_load_segment()
- */
- return (void *)(sechdr->sh_addr + sym->st_value);
-}
-
-/*
- * Get or set the value of a symbol. If "get_value" is true, the symbol value is
- * returned in buf; otherwise the symbol value is set based on the value in buf.
- */
-int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
- void *buf, unsigned int size, bool get_value)
-{
- Elf_Sym *sym;
- Elf_Shdr *sechdrs;
- struct purgatory_info *pi = &image->purgatory_info;
- char *sym_buf;
-
- sym = kexec_purgatory_find_symbol(pi, name);
- if (!sym)
- return -EINVAL;
-
- if (sym->st_size != size) {
- pr_err("symbol %s size mismatch: expected %lu actual %u\n",
- name, (unsigned long)sym->st_size, size);
- return -EINVAL;
- }
-
- sechdrs = pi->sechdrs;
-
- if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
- pr_err("symbol %s is in a bss section. Cannot %s\n", name,
- get_value ? "get" : "set");
- return -EINVAL;
- }
-
- sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset +
- sym->st_value;
-
- if (get_value)
- memcpy((void *)buf, sym_buf, size);
- else
- memcpy((void *)sym_buf, buf, size);
-
- return 0;
-}
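
A hypothetical usage sketch of the setter path, roughly how an architecture loader might hand a value to purgatory before it runs. The symbol name "boot_params_addr" and the value are illustrative only; the final 0 selects "set" rather than "get":

unsigned long params_addr = 0x100000;	/* illustrative value */
int ret;

ret = kexec_purgatory_get_set_symbol(image, "boot_params_addr",
				     &params_addr, sizeof(params_addr),
				     0 /* set, not get */);
if (ret)
	pr_err("could not set boot_params_addr in purgatory\n");
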
-#endif /* CONFIG_KEXEC_FILE */
-
-/*
- * Move into place and start executing a preloaded standalone
- * executable. If nothing was preloaded return an error.
- */
-int kernel_kexec(void)
-{
- int error = 0;
-
- if (!mutex_trylock(&kexec_mutex))
- return -EBUSY;
- if (!kexec_image) {
- error = -EINVAL;
- goto Unlock;
- }
-
-#ifdef CONFIG_KEXEC_JUMP
- if (kexec_image->preserve_context) {
- lock_system_sleep();
- pm_prepare_console();
- error = freeze_processes();
- if (error) {
- error = -EBUSY;
- goto Restore_console;
- }
- suspend_console();
- error = dpm_suspend_start(PMSG_FREEZE);
- if (error)
- goto Resume_console;
- /* At this point, dpm_suspend_start() has been called,
- * but *not* dpm_suspend_end(). We *must* call
- * dpm_suspend_end() now. Otherwise, drivers for
- * some devices (e.g. interrupt controllers) become
- * desynchronized with the actual state of the
- * hardware at resume time, and evil weirdness ensues.
- */
- error = dpm_suspend_end(PMSG_FREEZE);
- if (error)
- goto Resume_devices;
- error = disable_nonboot_cpus();
- if (error)
- goto Enable_cpus;
- local_irq_disable();
- error = syscore_suspend();
- if (error)
- goto Enable_irqs;
- } else
-#endif
- {
- kexec_in_progress = true;
- kernel_restart_prepare(NULL);
- migrate_to_reboot_cpu();
-
- /*
- * migrate_to_reboot_cpu() disables CPU hotplug assuming that
- * no further code needs to use CPU hotplug (which is true in
- * the reboot case). However, the kexec path depends on using
- * CPU hotplug again; so re-enable it here.
- */
- cpu_hotplug_enable();
- pr_emerg("Starting new kernel\n");
- machine_shutdown();
- }
-
- machine_kexec(kexec_image);
-
-#ifdef CONFIG_KEXEC_JUMP
- if (kexec_image->preserve_context) {
- syscore_resume();
- Enable_irqs:
- local_irq_enable();
- Enable_cpus:
- enable_nonboot_cpus();
- dpm_resume_start(PMSG_RESTORE);
- Resume_devices:
- dpm_resume_end(PMSG_RESTORE);
- Resume_console:
- resume_console();
- thaw_processes();
- Restore_console:
- pm_restore_console();
- unlock_system_sleep();
- }
-#endif
-
- Unlock:
- mutex_unlock(&kexec_mutex);
- return error;
-}
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
new file mode 100644
index 000000000000..201b45327804
--- /dev/null
+++ b/kernel/kexec_core.c
@@ -0,0 +1,1534 @@
+/*
+ * kexec.c - kexec system call core code.
+ * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#define pr_fmt(fmt) "kexec: " fmt
+
+#include <linux/capability.h>
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/kexec.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <linux/syscalls.h>
+#include <linux/reboot.h>
+#include <linux/ioport.h>
+#include <linux/hardirq.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/utsname.h>
+#include <linux/numa.h>
+#include <linux/suspend.h>
+#include <linux/device.h>
+#include <linux/freezer.h>
+#include <linux/pm.h>
+#include <linux/cpu.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/console.h>
+#include <linux/vmalloc.h>
+#include <linux/swap.h>
+#include <linux/syscore_ops.h>
+#include <linux/compiler.h>
+#include <linux/hugetlb.h>
+
+#include <asm/page.h>
+#include <asm/sections.h>
+
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include "kexec_internal.h"
+
+DEFINE_MUTEX(kexec_mutex);
+
+/* Per cpu memory for storing cpu states in case of system crash. */
+note_buf_t __percpu *crash_notes;
+
+/* vmcoreinfo stuff */
+static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
+u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
+size_t vmcoreinfo_size;
+size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
+
+/* Flag to indicate we are going to kexec a new kernel */
+bool kexec_in_progress = false;
+
+
+/* Location of the reserved area for the crash kernel */
+struct resource crashk_res = {
+ .name = "Crash kernel",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+struct resource crashk_low_res = {
+ .name = "Crash kernel",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+int kexec_should_crash(struct task_struct *p)
+{
+ /*
+ * If crash_kexec_post_notifiers is enabled, don't run
+ * crash_kexec() here yet, which must be run after panic
+ * notifiers in panic().
+ */
+ if (crash_kexec_post_notifiers)
+ return 0;
+ /*
+ * There are 4 panic() calls in do_exit() path, each of which
+ * corresponds to each of these 4 conditions.
+ */
+ if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
+ return 1;
+ return 0;
+}
+
+/*
+ * When kexec transitions to the new kernel there is a one-to-one
+ * mapping between physical and virtual addresses. On processors
+ * where you can disable the MMU this is trivial, and easy. For
+ * others it is still a simple predictable page table to setup.
+ *
+ * In that environment kexec copies the new kernel to its final
+ * resting place. This means I can only support memory whose
+ * physical address can fit in an unsigned long. In particular
+ * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
+ * If the assembly stub has more restrictive requirements
+ * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
+ * defined more restrictively in <asm/kexec.h>.
+ *
+ * The code for the transition from the current kernel to the
+ * new kernel is placed in the control_code_buffer, whose size
+ * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single
+ * page of memory is necessary, but some architectures require more.
+ * Because this memory must be identity mapped in the transition from
+ * virtual to physical addresses it must live in the range
+ * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
+ * modifiable.
+ *
+ * The assembly stub in the control code buffer is passed a linked list
+ * of descriptor pages detailing the source pages of the new kernel,
+ * and the destination addresses of those source pages. As this data
+ * structure is not used in the context of the current OS, it must
+ * be self-contained.
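+ *
+ * (Illustrative summary, in terms of the IND_* flags used later in this
+ * file: each list entry is a kimage_entry_t, a physical address with flag
+ * bits in its low bits -- IND_DESTINATION sets where the following source
+ * pages land, IND_SOURCE names a source page, IND_INDIRECTION chains to
+ * the next page of entries, and IND_DONE terminates the list.)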
+ *
+ * The code has been made to work with highmem pages and will use a
+ * destination page in its final resting place (if it happens
+ * to allocate it). The end product of this is that most of the
+ * physical address space, and most of RAM can be used.
+ *
+ * Future directions include:
+ * - allocating a page table with the control code buffer identity
+ * mapped, to simplify machine_kexec and make kexec_on_panic more
+ * reliable.
+ */
+
+/*
+ * KIMAGE_NO_DEST is an impossible destination address..., for
+ * allocating pages whose destination address we do not care about.
+ */
+#define KIMAGE_NO_DEST (-1UL)
+
+static struct page *kimage_alloc_page(struct kimage *image,
+ gfp_t gfp_mask,
+ unsigned long dest);
+
+int sanity_check_segment_list(struct kimage *image)
+{
+ int result, i;
+ unsigned long nr_segments = image->nr_segments;
+
+ /*
+ * Verify we have good destination addresses. The caller is
+ * responsible for making certain we don't attempt to load
+ * the new image into invalid or reserved areas of RAM. This
+ * just verifies it is an address we can use.
+ *
+ * Since the kernel does everything in page size chunks ensure
+ * the destination addresses are page aligned. Too many
+ * special cases crop up when we don't do this. The most
+ * insidious is getting overlapping destination addresses
+ * simply because addresses are changed to page size
+ * granularity.
+ */
+ result = -EADDRNOTAVAIL;
+ for (i = 0; i < nr_segments; i++) {
+ unsigned long mstart, mend;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz;
+ if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
+ return result;
+ if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
+ return result;
+ }
+
+ /* Verify our destination addresses do not overlap.
+ * If we allowed overlapping destination addresses
+ * through, very weird things can happen with no
+ * easy explanation as one segment stomps on another.
+ */
+ result = -EINVAL;
+ for (i = 0; i < nr_segments; i++) {
+ unsigned long mstart, mend;
+ unsigned long j;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz;
+ for (j = 0; j < i; j++) {
+ unsigned long pstart, pend;
+
+ pstart = image->segment[j].mem;
+ pend = pstart + image->segment[j].memsz;
+ /* Do the segments overlap ? */
+ if ((mend > pstart) && (mstart < pend))
+ return result;
+ }
+ }
+
+ /* Ensure our buffer sizes are strictly less than
+ * our memory sizes. This should always be the case,
+ * and it is easier to check up front than to be surprised
+ * later on.
+ */
+ result = -EINVAL;
+ for (i = 0; i < nr_segments; i++) {
+ if (image->segment[i].bufsz > image->segment[i].memsz)
+ return result;
+ }
+
+ /*
+ * Verify we have good destination addresses. Normally
+ * the caller is responsible for making certain we don't
+ * attempt to load the new image into invalid or reserved
+ * areas of RAM. But crash kernels are preloaded into a
+ * reserved area of RAM. We must ensure the addresses
+ * are in the reserved area otherwise preloading the
+ * kernel could corrupt things.
+ */
+
+ if (image->type == KEXEC_TYPE_CRASH) {
+ result = -EADDRNOTAVAIL;
+ for (i = 0; i < nr_segments; i++) {
+ unsigned long mstart, mend;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz - 1;
+ /* Ensure we are within the crash kernel limits */
+ if ((mstart < crashk_res.start) ||
+ (mend > crashk_res.end))
+ return result;
+ }
+ }
+
+ return 0;
+}
+
+struct kimage *do_kimage_alloc_init(void)
+{
+ struct kimage *image;
+
+ /* Allocate a controlling structure */
+ image = kzalloc(sizeof(*image), GFP_KERNEL);
+ if (!image)
+ return NULL;
+
+ image->head = 0;
+ image->entry = &image->head;
+ image->last_entry = &image->head;
+ image->control_page = ~0; /* By default this does not apply */
+ image->type = KEXEC_TYPE_DEFAULT;
+
+ /* Initialize the list of control pages */
+ INIT_LIST_HEAD(&image->control_pages);
+
+ /* Initialize the list of destination pages */
+ INIT_LIST_HEAD(&image->dest_pages);
+
+ /* Initialize the list of unusable pages */
+ INIT_LIST_HEAD(&image->unusable_pages);
+
+ return image;
+}
+
+int kimage_is_destination_range(struct kimage *image,
+ unsigned long start,
+ unsigned long end)
+{
+ unsigned long i;
+
+ for (i = 0; i < image->nr_segments; i++) {
+ unsigned long mstart, mend;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz;
+ if ((end > mstart) && (start < mend))
+ return 1;
+ }
+
+ return 0;
+}
+
+static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
+{
+ struct page *pages;
+
+ pages = alloc_pages(gfp_mask, order);
+ if (pages) {
+ unsigned int count, i;
+
+ pages->mapping = NULL;
+ set_page_private(pages, order);
+ count = 1 << order;
+ for (i = 0; i < count; i++)
+ SetPageReserved(pages + i);
+ }
+
+ return pages;
+}
+
+static void kimage_free_pages(struct page *page)
+{
+ unsigned int order, count, i;
+
+ order = page_private(page);
+ count = 1 << order;
+ for (i = 0; i < count; i++)
+ ClearPageReserved(page + i);
+ __free_pages(page, order);
+}
+
+void kimage_free_page_list(struct list_head *list)
+{
+ struct list_head *pos, *next;
+
+ list_for_each_safe(pos, next, list) {
+ struct page *page;
+
+ page = list_entry(pos, struct page, lru);
+ list_del(&page->lru);
+ kimage_free_pages(page);
+ }
+}
+
+static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
+ unsigned int order)
+{
+ /* Control pages are special, they are the intermediaries
+ * that are needed while we copy the rest of the pages
+ * to their final resting place. As such they must
+ * not conflict with either the destination addresses
+ * or memory the kernel is already using.
+ *
+ * The only case where we really need more than one of
+ * these are for architectures where we cannot disable
+ * the MMU and must instead generate an identity mapped
+ * page table for all of the memory.
+ *
+ * At worst this runs in O(N) of the image size.
+ */
+ struct list_head extra_pages;
+ struct page *pages;
+ unsigned int count;
+
+ count = 1 << order;
+ INIT_LIST_HEAD(&extra_pages);
+
+ /* Loop while I can allocate a page and the page allocated
+ * is a destination page.
+ */
+ do {
+ unsigned long pfn, epfn, addr, eaddr;
+
+ pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
+ if (!pages)
+ break;
+ pfn = page_to_pfn(pages);
+ epfn = pfn + count;
+ addr = pfn << PAGE_SHIFT;
+ eaddr = epfn << PAGE_SHIFT;
+ if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
+ kimage_is_destination_range(image, addr, eaddr)) {
+ list_add(&pages->lru, &extra_pages);
+ pages = NULL;
+ }
+ } while (!pages);
+
+ if (pages) {
+ /* Remember the allocated page... */
+ list_add(&pages->lru, &image->control_pages);
+
+ /* Because the page is already in its destination
+ * location we will never allocate another page at
+ * that address. Therefore kimage_alloc_pages
+ * will not return it (again) and we don't need
+ * to give it an entry in image->segment[].
+ */
+ }
+ /* Deal with the destination pages I have inadvertently allocated.
+ *
+ * Ideally I would convert multi-page allocations into single
+ * page allocations, and add everything to image->dest_pages.
+ *
+ * For now it is simpler to just free the pages.
+ */
+ kimage_free_page_list(&extra_pages);
+
+ return pages;
+}
+
+static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
+ unsigned int order)
+{
+ /* Control pages are special, they are the intermediaries
+ * that are needed while we copy the rest of the pages
+ * to their final resting place. As such they must
+ * not conflict with either the destination addresses
+ * or memory the kernel is already using.
+ *
+ * Control pages are also the only pages we must allocate
+ * when loading a crash kernel. All of the other pages
+ * are specified by the segments and we just memcpy
+ * into them directly.
+ *
+ * The only case where we really need more than one of
+ * these are for architectures where we cannot disable
+ * the MMU and must instead generate an identity mapped
+ * page table for all of the memory.
+ *
+ * Given the low demand this implements a very simple
+ * allocator that finds the first hole of the appropriate
+ * size in the reserved memory region, and allocates all
+ * of the memory up to and including the hole.
+ */
+ unsigned long hole_start, hole_end, size;
+ struct page *pages;
+
+ pages = NULL;
+ size = (1 << order) << PAGE_SHIFT;
+ hole_start = (image->control_page + (size - 1)) & ~(size - 1);
+ hole_end = hole_start + size - 1;
+ while (hole_end <= crashk_res.end) {
+ unsigned long i;
+
+ if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
+ break;
+ /* See if I overlap any of the segments */
+ for (i = 0; i < image->nr_segments; i++) {
+ unsigned long mstart, mend;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz - 1;
+ if ((hole_end >= mstart) && (hole_start <= mend)) {
+ /* Advance the hole to the end of the segment */
+ hole_start = (mend + (size - 1)) & ~(size - 1);
+ hole_end = hole_start + size - 1;
+ break;
+ }
+ }
+ /* If I don't overlap any segments I have found my hole! */
+ if (i == image->nr_segments) {
+ pages = pfn_to_page(hole_start >> PAGE_SHIFT);
+ image->control_page = hole_end;
+ break;
+ }
+ }
+
+ return pages;
+}
+
+
+struct page *kimage_alloc_control_pages(struct kimage *image,
+ unsigned int order)
+{
+ struct page *pages = NULL;
+
+ switch (image->type) {
+ case KEXEC_TYPE_DEFAULT:
+ pages = kimage_alloc_normal_control_pages(image, order);
+ break;
+ case KEXEC_TYPE_CRASH:
+ pages = kimage_alloc_crash_control_pages(image, order);
+ break;
+ }
+
+ return pages;
+}
+
+static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
+{
+ if (*image->entry != 0)
+ image->entry++;
+
+ if (image->entry == image->last_entry) {
+ kimage_entry_t *ind_page;
+ struct page *page;
+
+ page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
+ if (!page)
+ return -ENOMEM;
+
+ ind_page = page_address(page);
+ *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+ image->entry = ind_page;
+ image->last_entry = ind_page +
+ ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+ }
+ *image->entry = entry;
+ image->entry++;
+ *image->entry = 0;
+
+ return 0;
+}
+
+static int kimage_set_destination(struct kimage *image,
+ unsigned long destination)
+{
+ int result;
+
+ destination &= PAGE_MASK;
+ result = kimage_add_entry(image, destination | IND_DESTINATION);
+
+ return result;
+}
+
+
+static int kimage_add_page(struct kimage *image, unsigned long page)
+{
+ int result;
+
+ page &= PAGE_MASK;
+ result = kimage_add_entry(image, page | IND_SOURCE);
+
+ return result;
+}
+
+
+static void kimage_free_extra_pages(struct kimage *image)
+{
+ /* Walk through and free any extra destination pages I may have */
+ kimage_free_page_list(&image->dest_pages);
+
+ /* Walk through and free any unusable pages I have cached */
+ kimage_free_page_list(&image->unusable_pages);
+
+}
+void kimage_terminate(struct kimage *image)
+{
+ if (*image->entry != 0)
+ image->entry++;
+
+ *image->entry = IND_DONE;
+}
+
+#define for_each_kimage_entry(image, ptr, entry) \
+ for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
+ ptr = (entry & IND_INDIRECTION) ? \
+ phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
+
+static void kimage_free_entry(kimage_entry_t entry)
+{
+ struct page *page;
+
+ page = pfn_to_page(entry >> PAGE_SHIFT);
+ kimage_free_pages(page);
+}
+
+void kimage_free(struct kimage *image)
+{
+ kimage_entry_t *ptr, entry;
+ kimage_entry_t ind = 0;
+
+ if (!image)
+ return;
+
+ kimage_free_extra_pages(image);
+ for_each_kimage_entry(image, ptr, entry) {
+ if (entry & IND_INDIRECTION) {
+ /* Free the previous indirection page */
+ if (ind & IND_INDIRECTION)
+ kimage_free_entry(ind);
+ /* Save this indirection page until we are
+ * done with it.
+ */
+ ind = entry;
+ } else if (entry & IND_SOURCE)
+ kimage_free_entry(entry);
+ }
+ /* Free the final indirection page */
+ if (ind & IND_INDIRECTION)
+ kimage_free_entry(ind);
+
+ /* Handle any machine specific cleanup */
+ machine_kexec_cleanup(image);
+
+ /* Free the kexec control pages... */
+ kimage_free_page_list(&image->control_pages);
+
+ /*
+ * Free up any temporary buffers allocated. This might hit if
+ * error occurred much later after buffer allocation.
+ */
+ if (image->file_mode)
+ kimage_file_post_load_cleanup(image);
+
+ kfree(image);
+}
+
+static kimage_entry_t *kimage_dst_used(struct kimage *image,
+ unsigned long page)
+{
+ kimage_entry_t *ptr, entry;
+ unsigned long destination = 0;
+
+ for_each_kimage_entry(image, ptr, entry) {
+ if (entry & IND_DESTINATION)
+ destination = entry & PAGE_MASK;
+ else if (entry & IND_SOURCE) {
+ if (page == destination)
+ return ptr;
+ destination += PAGE_SIZE;
+ }
+ }
+
+ return NULL;
+}
+
+static struct page *kimage_alloc_page(struct kimage *image,
+ gfp_t gfp_mask,
+ unsigned long destination)
+{
+ /*
+ * Here we implement safeguards to ensure that a source page
+ * is not copied to its destination page before the data on
+ * the destination page is no longer useful.
+ *
+ * To do this we maintain the invariant that a source page is
+ * either its own destination page, or it is not a
+ * destination page at all.
+ *
+ * That is slightly stronger than required, but the proof
+ * that no problems will occur is trivial, and the
+ * implementation is simple to verify.
+ *
+ * When allocating all pages normally this algorithm will run
+ * in O(N) time, but in the worst case it will run in O(N^2)
+ * time. If the runtime is a problem the data structures can
+ * be fixed.
+ */
+ struct page *page;
+ unsigned long addr;
+
+ /*
+ * Walk through the list of destination pages, and see if I
+ * have a match.
+ */
+ list_for_each_entry(page, &image->dest_pages, lru) {
+ addr = page_to_pfn(page) << PAGE_SHIFT;
+ if (addr == destination) {
+ list_del(&page->lru);
+ return page;
+ }
+ }
+ page = NULL;
+ while (1) {
+ kimage_entry_t *old;
+
+ /* Allocate a page, if we run out of memory give up */
+ page = kimage_alloc_pages(gfp_mask, 0);
+ if (!page)
+ return NULL;
+ /* If the page cannot be used, file it away */
+ if (page_to_pfn(page) >
+ (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+ list_add(&page->lru, &image->unusable_pages);
+ continue;
+ }
+ addr = page_to_pfn(page) << PAGE_SHIFT;
+
+ /* If it is the destination page we want, use it */
+ if (addr == destination)
+ break;
+
+ /* If the page is not a destination page use it */
+ if (!kimage_is_destination_range(image, addr,
+ addr + PAGE_SIZE))
+ break;
+
+ /*
+ * I know that the page is someone's destination page.
+ * See if there is already a source page for this
+ * destination page. And if so swap the source pages.
+ */
+ old = kimage_dst_used(image, addr);
+ if (old) {
+ /* If so move it */
+ unsigned long old_addr;
+ struct page *old_page;
+
+ old_addr = *old & PAGE_MASK;
+ old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+ copy_highpage(page, old_page);
+ *old = addr | (*old & ~PAGE_MASK);
+
+ /* The old page I have found cannot be a
+ * destination page, so return it if its
+ * gfp_flags honor the ones passed in.
+ */
+ if (!(gfp_mask & __GFP_HIGHMEM) &&
+ PageHighMem(old_page)) {
+ kimage_free_pages(old_page);
+ continue;
+ }
+ addr = old_addr;
+ page = old_page;
+ break;
+ }
+ /* Place the page on the destination list, to be used later */
+ list_add(&page->lru, &image->dest_pages);
+ }
+
+ return page;
+}
+
+static int kimage_load_normal_segment(struct kimage *image,
+ struct kexec_segment *segment)
+{
+ unsigned long maddr;
+ size_t ubytes, mbytes;
+ int result;
+ unsigned char __user *buf = NULL;
+ unsigned char *kbuf = NULL;
+
+ result = 0;
+ if (image->file_mode)
+ kbuf = segment->kbuf;
+ else
+ buf = segment->buf;
+ ubytes = segment->bufsz;
+ mbytes = segment->memsz;
+ maddr = segment->mem;
+
+ result = kimage_set_destination(image, maddr);
+ if (result < 0)
+ goto out;
+
+ while (mbytes) {
+ struct page *page;
+ char *ptr;
+ size_t uchunk, mchunk;
+
+ page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
+ if (!page) {
+ result = -ENOMEM;
+ goto out;
+ }
+ result = kimage_add_page(image, page_to_pfn(page)
+ << PAGE_SHIFT);
+ if (result < 0)
+ goto out;
+
+ ptr = kmap(page);
+ /* Start with a clear page */
+ clear_page(ptr);
+ ptr += maddr & ~PAGE_MASK;
+ mchunk = min_t(size_t, mbytes,
+ PAGE_SIZE - (maddr & ~PAGE_MASK));
+ uchunk = min(ubytes, mchunk);
+
+ /* For file based kexec, source pages are in kernel memory */
+ if (image->file_mode)
+ memcpy(ptr, kbuf, uchunk);
+ else
+ result = copy_from_user(ptr, buf, uchunk);
+ kunmap(page);
+ if (result) {
+ result = -EFAULT;
+ goto out;
+ }
+ ubytes -= uchunk;
+ maddr += mchunk;
+ if (image->file_mode)
+ kbuf += mchunk;
+ else
+ buf += mchunk;
+ mbytes -= mchunk;
+ }
+out:
+ return result;
+}
+
+static int kimage_load_crash_segment(struct kimage *image,
+ struct kexec_segment *segment)
+{
+ /* For crash dump kernels we simply copy the data from
+ * user space to its destination.
+ * We do things a page at a time for the sake of kmap.
+ */
+ unsigned long maddr;
+ size_t ubytes, mbytes;
+ int result;
+ unsigned char __user *buf = NULL;
+ unsigned char *kbuf = NULL;
+
+ result = 0;
+ if (image->file_mode)
+ kbuf = segment->kbuf;
+ else
+ buf = segment->buf;
+ ubytes = segment->bufsz;
+ mbytes = segment->memsz;
+ maddr = segment->mem;
+ while (mbytes) {
+ struct page *page;
+ char *ptr;
+ size_t uchunk, mchunk;
+
+ page = pfn_to_page(maddr >> PAGE_SHIFT);
+ if (!page) {
+ result = -ENOMEM;
+ goto out;
+ }
+ ptr = kmap(page);
+ ptr += maddr & ~PAGE_MASK;
+ mchunk = min_t(size_t, mbytes,
+ PAGE_SIZE - (maddr & ~PAGE_MASK));
+ uchunk = min(ubytes, mchunk);
+ if (mchunk > uchunk) {
+ /* Zero the trailing part of the page */
+ memset(ptr + uchunk, 0, mchunk - uchunk);
+ }
+
+ /* For file based kexec, source pages are in kernel memory */
+ if (image->file_mode)
+ memcpy(ptr, kbuf, uchunk);
+ else
+ result = copy_from_user(ptr, buf, uchunk);
+ kexec_flush_icache_page(page);
+ kunmap(page);
+ if (result) {
+ result = -EFAULT;
+ goto out;
+ }
+ ubytes -= uchunk;
+ maddr += mchunk;
+ if (image->file_mode)
+ kbuf += mchunk;
+ else
+ buf += mchunk;
+ mbytes -= mchunk;
+ }
+out:
+ return result;
+}
+
+int kimage_load_segment(struct kimage *image,
+ struct kexec_segment *segment)
+{
+ int result = -ENOMEM;
+
+ switch (image->type) {
+ case KEXEC_TYPE_DEFAULT:
+ result = kimage_load_normal_segment(image, segment);
+ break;
+ case KEXEC_TYPE_CRASH:
+ result = kimage_load_crash_segment(image, segment);
+ break;
+ }
+
+ return result;
+}
+
+struct kimage *kexec_image;
+struct kimage *kexec_crash_image;
+int kexec_load_disabled;
+
+void crash_kexec(struct pt_regs *regs)
+{
+ /* Take the kexec_mutex here to prevent sys_kexec_load
+ * running on one cpu from replacing the crash kernel
+ * we are using after a panic on a different cpu.
+ *
+ * If the crash kernel was not located in a fixed area
+ * of memory the xchg(&kexec_crash_image) would be
+ * sufficient. But since I reuse the memory...
+ */
+ if (mutex_trylock(&kexec_mutex)) {
+ if (kexec_crash_image) {
+ struct pt_regs fixed_regs;
+
+ crash_setup_regs(&fixed_regs, regs);
+ crash_save_vmcoreinfo();
+ machine_crash_shutdown(&fixed_regs);
+ machine_kexec(kexec_crash_image);
+ }
+ mutex_unlock(&kexec_mutex);
+ }
+}
+
+size_t crash_get_memory_size(void)
+{
+ size_t size = 0;
+
+ mutex_lock(&kexec_mutex);
+ if (crashk_res.end != crashk_res.start)
+ size = resource_size(&crashk_res);
+ mutex_unlock(&kexec_mutex);
+ return size;
+}
+
+void __weak crash_free_reserved_phys_range(unsigned long begin,
+ unsigned long end)
+{
+ unsigned long addr;
+
+ for (addr = begin; addr < end; addr += PAGE_SIZE)
+ free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
+}
+
+int crash_shrink_memory(unsigned long new_size)
+{
+ int ret = 0;
+ unsigned long start, end;
+ unsigned long old_size;
+ struct resource *ram_res;
+
+ mutex_lock(&kexec_mutex);
+
+ if (kexec_crash_image) {
+ ret = -ENOENT;
+ goto unlock;
+ }
+ start = crashk_res.start;
+ end = crashk_res.end;
+ old_size = (end == 0) ? 0 : end - start + 1;
+ if (new_size >= old_size) {
+ ret = (new_size == old_size) ? 0 : -EINVAL;
+ goto unlock;
+ }
+
+ ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
+ if (!ram_res) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
+ end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
+
+ crash_map_reserved_pages();
+ crash_free_reserved_phys_range(end, crashk_res.end);
+
+ if ((start == end) && (crashk_res.parent != NULL))
+ release_resource(&crashk_res);
+
+ ram_res->start = end;
+ ram_res->end = crashk_res.end;
+ ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+ ram_res->name = "System RAM";
+
+ crashk_res.end = end - 1;
+
+ insert_resource(&iomem_resource, ram_res);
+ crash_unmap_reserved_pages();
+
+unlock:
+ mutex_unlock(&kexec_mutex);
+ return ret;
+}
+
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+ size_t data_len)
+{
+ struct elf_note note;
+
+ note.n_namesz = strlen(name) + 1;
+ note.n_descsz = data_len;
+ note.n_type = type;
+ memcpy(buf, &note, sizeof(note));
+ buf += (sizeof(note) + 3)/4;
+ memcpy(buf, name, note.n_namesz);
+ buf += (note.n_namesz + 3)/4;
+ memcpy(buf, data, note.n_descsz);
+ buf += (note.n_descsz + 3)/4;
+
+ return buf;
+}
+
+static void final_note(u32 *buf)
+{
+ struct elf_note note;
+
+ note.n_namesz = 0;
+ note.n_descsz = 0;
+ note.n_type = 0;
+ memcpy(buf, &note, sizeof(note));
+}
+
+void crash_save_cpu(struct pt_regs *regs, int cpu)
+{
+ struct elf_prstatus prstatus;
+ u32 *buf;
+
+ if ((cpu < 0) || (cpu >= nr_cpu_ids))
+ return;
+
+ /* Using ELF notes here is opportunistic.
+ * I need a well defined structure format
+ * for the data I pass, and I need tags
+ * on the data to indicate what information I have
+ * squirrelled away. ELF notes happen to provide
+ * all of that, so there is no need to invent something new.
+ */
+ buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
+ if (!buf)
+ return;
+ memset(&prstatus, 0, sizeof(prstatus));
+ prstatus.pr_pid = current->pid;
+ elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
+ buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
+ &prstatus, sizeof(prstatus));
+ final_note(buf);
+}
+
+static int __init crash_notes_memory_init(void)
+{
+ /* Allocate memory for saving cpu registers. */
+ size_t size, align;
+
+ /*
+ * crash_notes could be allocated across 2 vmalloc pages when percpu
+ * is vmalloc based. vmalloc doesn't guarantee that 2 contiguous vmalloc
+ * pages are also on 2 contiguous physical pages. In that case the
+ * 2nd part of crash_notes in the 2nd page could be lost, since only the
+ * starting address and size of crash_notes are exported through sysfs.
+ * Here round up the size of crash_notes to the nearest power of two
+ * and pass it to __alloc_percpu as the align value. This makes sure
+ * crash_notes is allocated inside one physical page.
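+ *
+ * Illustrative example: if sizeof(note_buf_t) were, say, 424 bytes, the
+ * align value would round up to 512, so the per-cpu buffer can never
+ * straddle a page boundary.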
+ */
+ size = sizeof(note_buf_t);
+ align = min(roundup_pow_of_two(sizeof(note_buf_t)), PAGE_SIZE);
+
+ /*
+ * Break the build if size is bigger than PAGE_SIZE, since crash_notes
+ * would then definitely span 2 pages.
+ */
+ BUILD_BUG_ON(size > PAGE_SIZE);
+
+ crash_notes = __alloc_percpu(size, align);
+ if (!crash_notes) {
+ pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
+ return -ENOMEM;
+ }
+ return 0;
+}
+subsys_initcall(crash_notes_memory_init);
+
+
+/*
+ * parsing the "crashkernel" commandline
+ *
+ * this code is intended to be called from architecture specific code
+ */
+
+
+/*
+ * This function parses command lines in the format
+ *
+ * crashkernel=ramsize-range:size[,...][@offset]
+ *
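+ * For example (an illustrative command line, not part of this patch):
+ *
+ *	crashkernel=512M-2G:64M,2G-:128M@16M
+ *
+ * reserves 64M when system RAM is between 512M and 2G, 128M when it is
+ * 2G or more, and requests the reservation at physical offset 16M.
+ *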
+ * The function returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_mem(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ char *cur = cmdline, *tmp;
+
+ /* for each entry of the comma-separated list */
+ do {
+ unsigned long long start, end = ULLONG_MAX, size;
+
+ /* get the start of the range */
+ start = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("crashkernel: Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (*cur != '-') {
+ pr_warn("crashkernel: '-' expected\n");
+ return -EINVAL;
+ }
+ cur++;
+
+ /* if no ':' is here, then we read the end */
+ if (*cur != ':') {
+ end = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("crashkernel: Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (end <= start) {
+ pr_warn("crashkernel: end <= start\n");
+ return -EINVAL;
+ }
+ }
+
+ if (*cur != ':') {
+ pr_warn("crashkernel: ':' expected\n");
+ return -EINVAL;
+ }
+ cur++;
+
+ size = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (size >= system_ram) {
+ pr_warn("crashkernel: invalid size\n");
+ return -EINVAL;
+ }
+
+ /* match ? */
+ if (system_ram >= start && system_ram < end) {
+ *crash_size = size;
+ break;
+ }
+ } while (*cur++ == ',');
+
+ if (*crash_size > 0) {
+ while (*cur && *cur != ' ' && *cur != '@')
+ cur++;
+ if (*cur == '@') {
+ cur++;
+ *crash_base = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("Memory value expected after '@'\n");
+ return -EINVAL;
+ }
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * This function parses "simple" (old) crashkernel command lines like
+ *
+ * crashkernel=size[@offset]
+ *
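+ * e.g. (illustrative) crashkernel=128M@16M reserves 128M and requests
+ * that it be placed at physical offset 16M.
+ *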
+ * It returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_simple(char *cmdline,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ char *cur = cmdline;
+
+ *crash_size = memparse(cmdline, &cur);
+ if (cmdline == cur) {
+ pr_warn("crashkernel: memory value expected\n");
+ return -EINVAL;
+ }
+
+ if (*cur == '@')
+ *crash_base = memparse(cur+1, &cur);
+ else if (*cur != ' ' && *cur != '\0') {
+ pr_warn("crashkernel: unrecognized char\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+#define SUFFIX_HIGH 0
+#define SUFFIX_LOW 1
+#define SUFFIX_NULL 2
+static __initdata char *suffix_tbl[] = {
+ [SUFFIX_HIGH] = ",high",
+ [SUFFIX_LOW] = ",low",
+ [SUFFIX_NULL] = NULL,
+};
+
+/*
+ * This function parses "suffix" crashkernel command lines like
+ *
+ * crashkernel=size,[high|low]
+ *
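+ * e.g. (illustrative) crashkernel=256M,high reserves 256M and, on
+ * architectures that support it, allows the reservation to be placed
+ * above 4G, with the ",low" variant sizing the companion low-memory
+ * reservation.
+ *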
+ * It returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_suffix(char *cmdline,
+ unsigned long long *crash_size,
+ const char *suffix)
+{
+ char *cur = cmdline;
+
+ *crash_size = memparse(cmdline, &cur);
+ if (cmdline == cur) {
+ pr_warn("crashkernel: memory value expected\n");
+ return -EINVAL;
+ }
+
+ /* check with suffix */
+ if (strncmp(cur, suffix, strlen(suffix))) {
+ pr_warn("crashkernel: unrecognized char\n");
+ return -EINVAL;
+ }
+ cur += strlen(suffix);
+ if (*cur != ' ' && *cur != '\0') {
+ pr_warn("crashkernel: unrecognized char\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static __init char *get_last_crashkernel(char *cmdline,
+ const char *name,
+ const char *suffix)
+{
+ char *p = cmdline, *ck_cmdline = NULL;
+
+ /* find crashkernel and use the last one if there are more */
+ p = strstr(p, name);
+ while (p) {
+ char *end_p = strchr(p, ' ');
+ char *q;
+
+ if (!end_p)
+ end_p = p + strlen(p);
+
+ if (!suffix) {
+ int i;
+
+ /* skip the one with any known suffix */
+ for (i = 0; suffix_tbl[i]; i++) {
+ q = end_p - strlen(suffix_tbl[i]);
+ if (!strncmp(q, suffix_tbl[i],
+ strlen(suffix_tbl[i])))
+ goto next;
+ }
+ ck_cmdline = p;
+ } else {
+ q = end_p - strlen(suffix);
+ if (!strncmp(q, suffix, strlen(suffix)))
+ ck_cmdline = p;
+ }
+next:
+ p = strstr(p+1, name);
+ }
+
+ if (!ck_cmdline)
+ return NULL;
+
+ return ck_cmdline;
+}
+
+static int __init __parse_crashkernel(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base,
+ const char *name,
+ const char *suffix)
+{
+ char *first_colon, *first_space;
+ char *ck_cmdline;
+
+ BUG_ON(!crash_size || !crash_base);
+ *crash_size = 0;
+ *crash_base = 0;
+
+ ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
+
+ if (!ck_cmdline)
+ return -EINVAL;
+
+ ck_cmdline += strlen(name);
+
+ if (suffix)
+ return parse_crashkernel_suffix(ck_cmdline, crash_size,
+ suffix);
+ /*
+ * if the commandline contains a ':', then that's the extended
+ * syntax -- if not, it must be the classic syntax
+ */
+ first_colon = strchr(ck_cmdline, ':');
+ first_space = strchr(ck_cmdline, ' ');
+ if (first_colon && (!first_space || first_colon < first_space))
+ return parse_crashkernel_mem(ck_cmdline, system_ram,
+ crash_size, crash_base);
+
+ return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
+}
+
+/*
+ * This function is the entry point for command line parsing and should be
+ * called from the arch-specific code.
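+ *
+ * An architecture typically calls it roughly like this (illustrative):
+ *
+ *	parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
+ *			  &crash_size, &crash_base);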
+ */
+int __init parse_crashkernel(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+ "crashkernel=", NULL);
+}
+
+int __init parse_crashkernel_high(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+ "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
+}
+
+int __init parse_crashkernel_low(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+ "crashkernel=", suffix_tbl[SUFFIX_LOW]);
+}
+
+static void update_vmcoreinfo_note(void)
+{
+ u32 *buf = vmcoreinfo_note;
+
+ if (!vmcoreinfo_size)
+ return;
+ buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
+ vmcoreinfo_size);
+ final_note(buf);
+}
+
+void crash_save_vmcoreinfo(void)
+{
+ vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
+ update_vmcoreinfo_note();
+}
+
+void vmcoreinfo_append_str(const char *fmt, ...)
+{
+ va_list args;
+ char buf[0x50];
+ size_t r;
+
+ va_start(args, fmt);
+ r = vscnprintf(buf, sizeof(buf), fmt, args);
+ va_end(args);
+
+ r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
+
+ memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
+
+ vmcoreinfo_size += r;
+}
+
+/*
+ * provide an empty default implementation here -- architecture
+ * code may override this
+ */
+void __weak arch_crash_save_vmcoreinfo(void)
+{}
+
+unsigned long __weak paddr_vmcoreinfo_note(void)
+{
+ return __pa((unsigned long)(char *)&vmcoreinfo_note);
+}
+
+static int __init crash_save_vmcoreinfo_init(void)
+{
+ VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
+ VMCOREINFO_PAGESIZE(PAGE_SIZE);
+
+ VMCOREINFO_SYMBOL(init_uts_ns);
+ VMCOREINFO_SYMBOL(node_online_map);
+#ifdef CONFIG_MMU
+ VMCOREINFO_SYMBOL(swapper_pg_dir);
+#endif
+ VMCOREINFO_SYMBOL(_stext);
+ VMCOREINFO_SYMBOL(vmap_area_list);
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+ VMCOREINFO_SYMBOL(mem_map);
+ VMCOREINFO_SYMBOL(contig_page_data);
+#endif
+#ifdef CONFIG_SPARSEMEM
+ VMCOREINFO_SYMBOL(mem_section);
+ VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
+ VMCOREINFO_STRUCT_SIZE(mem_section);
+ VMCOREINFO_OFFSET(mem_section, section_mem_map);
+#endif
+ VMCOREINFO_STRUCT_SIZE(page);
+ VMCOREINFO_STRUCT_SIZE(pglist_data);
+ VMCOREINFO_STRUCT_SIZE(zone);
+ VMCOREINFO_STRUCT_SIZE(free_area);
+ VMCOREINFO_STRUCT_SIZE(list_head);
+ VMCOREINFO_SIZE(nodemask_t);
+ VMCOREINFO_OFFSET(page, flags);
+ VMCOREINFO_OFFSET(page, _count);
+ VMCOREINFO_OFFSET(page, mapping);
+ VMCOREINFO_OFFSET(page, lru);
+ VMCOREINFO_OFFSET(page, _mapcount);
+ VMCOREINFO_OFFSET(page, private);
+ VMCOREINFO_OFFSET(pglist_data, node_zones);
+ VMCOREINFO_OFFSET(pglist_data, nr_zones);
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+ VMCOREINFO_OFFSET(pglist_data, node_mem_map);
+#endif
+ VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
+ VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
+ VMCOREINFO_OFFSET(pglist_data, node_id);
+ VMCOREINFO_OFFSET(zone, free_area);
+ VMCOREINFO_OFFSET(zone, vm_stat);
+ VMCOREINFO_OFFSET(zone, spanned_pages);
+ VMCOREINFO_OFFSET(free_area, free_list);
+ VMCOREINFO_OFFSET(list_head, next);
+ VMCOREINFO_OFFSET(list_head, prev);
+ VMCOREINFO_OFFSET(vmap_area, va_start);
+ VMCOREINFO_OFFSET(vmap_area, list);
+ VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
+ log_buf_kexec_setup();
+ VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
+ VMCOREINFO_NUMBER(NR_FREE_PAGES);
+ VMCOREINFO_NUMBER(PG_lru);
+ VMCOREINFO_NUMBER(PG_private);
+ VMCOREINFO_NUMBER(PG_swapcache);
+ VMCOREINFO_NUMBER(PG_slab);
+#ifdef CONFIG_MEMORY_FAILURE
+ VMCOREINFO_NUMBER(PG_hwpoison);
+#endif
+ VMCOREINFO_NUMBER(PG_head_mask);
+ VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
+#ifdef CONFIG_X86
+ VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
+#endif
+#ifdef CONFIG_HUGETLBFS
+ VMCOREINFO_SYMBOL(free_huge_page);
+#endif
+
+ arch_crash_save_vmcoreinfo();
+ update_vmcoreinfo_note();
+
+ return 0;
+}
+
+subsys_initcall(crash_save_vmcoreinfo_init);
+
+/*
+ * Move into place and start executing a preloaded standalone
+ * executable. If nothing was preloaded return an error.
+ */
+int kernel_kexec(void)
+{
+ int error = 0;
+
+ if (!mutex_trylock(&kexec_mutex))
+ return -EBUSY;
+ if (!kexec_image) {
+ error = -EINVAL;
+ goto Unlock;
+ }
+
+#ifdef CONFIG_KEXEC_JUMP
+ if (kexec_image->preserve_context) {
+ lock_system_sleep();
+ pm_prepare_console();
+ error = freeze_processes();
+ if (error) {
+ error = -EBUSY;
+ goto Restore_console;
+ }
+ suspend_console();
+ error = dpm_suspend_start(PMSG_FREEZE);
+ if (error)
+ goto Resume_console;
+ /* At this point, dpm_suspend_start() has been called,
+ * but *not* dpm_suspend_end(). We *must* call
+ * dpm_suspend_end() now. Otherwise, drivers for
+ * some devices (e.g. interrupt controllers) become
+ * desynchronized with the actual state of the
+ * hardware at resume time, and evil weirdness ensues.
+ */
+ error = dpm_suspend_end(PMSG_FREEZE);
+ if (error)
+ goto Resume_devices;
+ error = disable_nonboot_cpus();
+ if (error)
+ goto Enable_cpus;
+ local_irq_disable();
+ error = syscore_suspend();
+ if (error)
+ goto Enable_irqs;
+ } else
+#endif
+ {
+ kexec_in_progress = true;
+ kernel_restart_prepare(NULL);
+ migrate_to_reboot_cpu();
+
+ /*
+ * migrate_to_reboot_cpu() disables CPU hotplug assuming that
+ * no further code needs to use CPU hotplug (which is true in
+ * the reboot case). However, the kexec path depends on using
+ * CPU hotplug again; so re-enable it here.
+ */
+ cpu_hotplug_enable();
+ pr_emerg("Starting new kernel\n");
+ machine_shutdown();
+ }
+
+ machine_kexec(kexec_image);
+
+#ifdef CONFIG_KEXEC_JUMP
+ if (kexec_image->preserve_context) {
+ syscore_resume();
+ Enable_irqs:
+ local_irq_enable();
+ Enable_cpus:
+ enable_nonboot_cpus();
+ dpm_resume_start(PMSG_RESTORE);
+ Resume_devices:
+ dpm_resume_end(PMSG_RESTORE);
+ Resume_console:
+ resume_console();
+ thaw_processes();
+ Restore_console:
+ pm_restore_console();
+ unlock_system_sleep();
+ }
+#endif
+
+ Unlock:
+ mutex_unlock(&kexec_mutex);
+ return error;
+}
+
+/*
+ * Add and remove page tables for crashkernel memory
+ *
+ * Provide an empty default implementation here -- architecture
+ * code may override this
+ */
+void __weak crash_map_reserved_pages(void)
+{}
+
+void __weak crash_unmap_reserved_pages(void)
+{}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
new file mode 100644
index 000000000000..6a9a3f2a0e8e
--- /dev/null
+++ b/kernel/kexec_file.c
@@ -0,0 +1,1045 @@
+/*
+ * kexec: kexec_file_load system call
+ *
+ * Copyright (C) 2014 Red Hat Inc.
+ * Authors:
+ * Vivek Goyal <vgoyal@redhat.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <linux/capability.h>
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/kexec.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include <linux/syscalls.h>
+#include <linux/vmalloc.h>
+#include "kexec_internal.h"
+
+/*
+ * Declare these symbols weak so that if architecture provides a purgatory,
+ * these will be overridden.
+ */
+char __weak kexec_purgatory[0];
+size_t __weak kexec_purgatory_size = 0;
+
+static int kexec_calculate_store_digests(struct kimage *image);
+
+static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
+{
+ struct fd f = fdget(fd);
+ int ret;
+ struct kstat stat;
+ loff_t pos;
+ ssize_t bytes = 0;
+
+ if (!f.file)
+ return -EBADF;
+
+ ret = vfs_getattr(&f.file->f_path, &stat);
+ if (ret)
+ goto out;
+
+ if (stat.size > INT_MAX) {
+ ret = -EFBIG;
+ goto out;
+ }
+
+ /* Don't hand 0 to vmalloc, it whines. */
+ if (stat.size == 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ *buf = vmalloc(stat.size);
+ if (!*buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ pos = 0;
+ while (pos < stat.size) {
+ bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
+ stat.size - pos);
+ if (bytes < 0) {
+ vfree(*buf);
+ ret = bytes;
+ goto out;
+ }
+
+ if (bytes == 0)
+ break;
+ pos += bytes;
+ }
+
+ if (pos != stat.size) {
+ ret = -EBADF;
+ vfree(*buf);
+ goto out;
+ }
+
+ *buf_len = pos;
+out:
+ fdput(f);
+ return ret;
+}
+
+/* Architectures can provide this probe function */
+int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ return -ENOEXEC;
+}
+
+void * __weak arch_kexec_kernel_image_load(struct kimage *image)
+{
+ return ERR_PTR(-ENOEXEC);
+}
+
+int __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
+{
+ return -EINVAL;
+}
+
+int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ return -EKEYREJECTED;
+}
+
+/* Apply relocations of type RELA */
+int __weak
+arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+ unsigned int relsec)
+{
+ pr_err("RELA relocation unsupported.\n");
+ return -ENOEXEC;
+}
+
+/* Apply relocations of type REL */
+int __weak
+arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+ unsigned int relsec)
+{
+ pr_err("REL relocation unsupported.\n");
+ return -ENOEXEC;
+}
+
+/*
+ * Free up the memory used by the kernel, initrd, and command line. These are
+ * temporary allocations which are no longer needed after the buffers have
+ * been loaded into separate segments and copied elsewhere.
+ */
+void kimage_file_post_load_cleanup(struct kimage *image)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+
+ vfree(image->kernel_buf);
+ image->kernel_buf = NULL;
+
+ vfree(image->initrd_buf);
+ image->initrd_buf = NULL;
+
+ kfree(image->cmdline_buf);
+ image->cmdline_buf = NULL;
+
+ vfree(pi->purgatory_buf);
+ pi->purgatory_buf = NULL;
+
+ vfree(pi->sechdrs);
+ pi->sechdrs = NULL;
+
+ /* See if architecture has anything to cleanup post load */
+ arch_kimage_file_post_load_cleanup(image);
+
+ /*
+ * The above call should have called into the boot loader to free up
+ * any data stored in kimage->image_loader_data. It should
+ * be OK now to free it up.
+ */
+ kfree(image->image_loader_data);
+ image->image_loader_data = NULL;
+}
+
+/*
+ * In file mode the list of segments is prepared by the kernel. Copy the
+ * relevant data from user space, do error checking, and prepare the segment list.
+ */
+static int
+kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
+ const char __user *cmdline_ptr,
+ unsigned long cmdline_len, unsigned flags)
+{
+ int ret = 0;
+ void *ldata;
+
+ ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
+ &image->kernel_buf_len);
+ if (ret)
+ return ret;
+
+ /* Call arch image probe handlers */
+ ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
+ image->kernel_buf_len);
+
+ if (ret)
+ goto out;
+
+#ifdef CONFIG_KEXEC_VERIFY_SIG
+ ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
+ image->kernel_buf_len);
+ if (ret) {
+ pr_debug("kernel signature verification failed.\n");
+ goto out;
+ }
+ pr_debug("kernel signature verification successful.\n");
+#endif
+ /* It is possible that no initramfs is being loaded */
+ if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
+ ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
+ &image->initrd_buf_len);
+ if (ret)
+ goto out;
+ }
+
+ if (cmdline_len) {
+ image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
+ if (!image->cmdline_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
+ cmdline_len);
+ if (ret) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ image->cmdline_buf_len = cmdline_len;
+
+ /* command line should be a string with last byte null */
+ if (image->cmdline_buf[cmdline_len - 1] != '\0') {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ /* Call arch image load handlers */
+ ldata = arch_kexec_kernel_image_load(image);
+
+ if (IS_ERR(ldata)) {
+ ret = PTR_ERR(ldata);
+ goto out;
+ }
+
+ image->image_loader_data = ldata;
+out:
+ /* In case of error, free up all allocated memory in this function */
+ if (ret)
+ kimage_file_post_load_cleanup(image);
+ return ret;
+}
+
+static int
+kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
+ int initrd_fd, const char __user *cmdline_ptr,
+ unsigned long cmdline_len, unsigned long flags)
+{
+ int ret;
+ struct kimage *image;
+ bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH;
+
+ image = do_kimage_alloc_init();
+ if (!image)
+ return -ENOMEM;
+
+ image->file_mode = 1;
+
+ if (kexec_on_panic) {
+ /* Enable special crash kernel control page alloc policy. */
+ image->control_page = crashk_res.start;
+ image->type = KEXEC_TYPE_CRASH;
+ }
+
+ ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
+ cmdline_ptr, cmdline_len, flags);
+ if (ret)
+ goto out_free_image;
+
+ ret = sanity_check_segment_list(image);
+ if (ret)
+ goto out_free_post_load_bufs;
+
+ ret = -ENOMEM;
+ image->control_code_page = kimage_alloc_control_pages(image,
+ get_order(KEXEC_CONTROL_PAGE_SIZE));
+ if (!image->control_code_page) {
+ pr_err("Could not allocate control_code_buffer\n");
+ goto out_free_post_load_bufs;
+ }
+
+ if (!kexec_on_panic) {
+ image->swap_page = kimage_alloc_control_pages(image, 0);
+ if (!image->swap_page) {
+ pr_err("Could not allocate swap buffer\n");
+ goto out_free_control_pages;
+ }
+ }
+
+ *rimage = image;
+ return 0;
+out_free_control_pages:
+ kimage_free_page_list(&image->control_pages);
+out_free_post_load_bufs:
+ kimage_file_post_load_cleanup(image);
+out_free_image:
+ kfree(image);
+ return ret;
+}
+
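+/*
+ * kexec_file_load(2) syscall entry point.  From user space it is invoked
+ * roughly as follows (illustrative only; "kernel_fd", "initrd_fd" and
+ * "cmdline" are hypothetical caller variables):
+ *
+ *	syscall(__NR_kexec_file_load, kernel_fd, initrd_fd,
+ *		strlen(cmdline) + 1, cmdline, 0);
+ */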
+SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
+ unsigned long, cmdline_len, const char __user *, cmdline_ptr,
+ unsigned long, flags)
+{
+ int ret = 0, i;
+ struct kimage **dest_image, *image;
+
+ /* We only trust the superuser with rebooting the system. */
+ if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+ return -EPERM;
+
+ /* Make sure we have a legal set of flags */
+ if (flags != (flags & KEXEC_FILE_FLAGS))
+ return -EINVAL;
+
+ image = NULL;
+
+ if (!mutex_trylock(&kexec_mutex))
+ return -EBUSY;
+
+ dest_image = &kexec_image;
+ if (flags & KEXEC_FILE_ON_CRASH)
+ dest_image = &kexec_crash_image;
+
+ if (flags & KEXEC_FILE_UNLOAD)
+ goto exchange;
+
+ /*
+ * In the crash case the new kernel gets loaded into the reserved region.
+ * This is the same memory where the old crash kernel might be loaded, so
+ * free any current crash dump kernel before we corrupt it.
+ */
+ if (flags & KEXEC_FILE_ON_CRASH)
+ kimage_free(xchg(&kexec_crash_image, NULL));
+
+ ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
+ cmdline_len, flags);
+ if (ret)
+ goto out;
+
+ ret = machine_kexec_prepare(image);
+ if (ret)
+ goto out;
+
+ ret = kexec_calculate_store_digests(image);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < image->nr_segments; i++) {
+ struct kexec_segment *ksegment;
+
+ ksegment = &image->segment[i];
+ pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
+ i, ksegment->buf, ksegment->bufsz, ksegment->mem,
+ ksegment->memsz);
+
+ ret = kimage_load_segment(image, &image->segment[i]);
+ if (ret)
+ goto out;
+ }
+
+ kimage_terminate(image);
+
+ /*
+ * Free up any temporary buffers allocated which are not needed
+ * after image has been loaded
+ */
+ kimage_file_post_load_cleanup(image);
+exchange:
+ image = xchg(dest_image, image);
+out:
+ mutex_unlock(&kexec_mutex);
+ kimage_free(image);
+ return ret;
+}
+
+static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
+ struct kexec_buf *kbuf)
+{
+ struct kimage *image = kbuf->image;
+ unsigned long temp_start, temp_end;
+
+ temp_end = min(end, kbuf->buf_max);
+ temp_start = temp_end - kbuf->memsz;
+
+ do {
+ /* align down start */
+ temp_start = temp_start & (~(kbuf->buf_align - 1));
+
+ if (temp_start < start || temp_start < kbuf->buf_min)
+ return 0;
+
+ temp_end = temp_start + kbuf->memsz - 1;
+
+ /*
+ * Make sure this does not conflict with any of the existing
+ * segments
+ */
+ if (kimage_is_destination_range(image, temp_start, temp_end)) {
+ temp_start = temp_start - PAGE_SIZE;
+ continue;
+ }
+
+ /* We found a suitable memory range */
+ break;
+ } while (1);
+
+ /* If we are here, we found a suitable memory range */
+ kbuf->mem = temp_start;
+
+ /* Success, stop navigating through remaining System RAM ranges */
+ return 1;
+}
+
+static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
+ struct kexec_buf *kbuf)
+{
+ struct kimage *image = kbuf->image;
+ unsigned long temp_start, temp_end;
+
+ temp_start = max(start, kbuf->buf_min);
+
+ do {
+ temp_start = ALIGN(temp_start, kbuf->buf_align);
+ temp_end = temp_start + kbuf->memsz - 1;
+
+ if (temp_end > end || temp_end > kbuf->buf_max)
+ return 0;
+ /*
+ * Make sure this does not conflict with any of the existing
+ * segments
+ */
+ if (kimage_is_destination_range(image, temp_start, temp_end)) {
+ temp_start = temp_start + PAGE_SIZE;
+ continue;
+ }
+
+ /* We found a suitable memory range */
+ break;
+ } while (1);
+
+ /* If we are here, we found a suitable memory range */
+ kbuf->mem = temp_start;
+
+ /* Success, stop navigating through remaining System RAM ranges */
+ return 1;
+}
+
+static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
+{
+ struct kexec_buf *kbuf = (struct kexec_buf *)arg;
+ unsigned long sz = end - start + 1;
+
+ /* Returning 0 will move on to the next memory range */
+ if (sz < kbuf->memsz)
+ return 0;
+
+ if (end < kbuf->buf_min || start > kbuf->buf_max)
+ return 0;
+
+ /*
+ * Allocate memory top down within the RAM range; otherwise allocate
+ * bottom up.
+ */
+ if (kbuf->top_down)
+ return locate_mem_hole_top_down(start, end, kbuf);
+ return locate_mem_hole_bottom_up(start, end, kbuf);
+}
+
+/*
+ * Helper function for placing a buffer in a kexec segment. This assumes
+ * that kexec_mutex is held.
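+ *
+ * An arch image loader calls it roughly like this (illustrative only;
+ * kernel_buf, kernel_len, min_addr, max_addr and kernel_load_addr are
+ * hypothetical caller variables):
+ *
+ *	ret = kexec_add_buffer(image, kernel_buf, kernel_len, kernel_len,
+ *			       PAGE_SIZE, min_addr, max_addr, true,
+ *			       &kernel_load_addr);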
+ */
+int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
+ unsigned long memsz, unsigned long buf_align,
+ unsigned long buf_min, unsigned long buf_max,
+ bool top_down, unsigned long *load_addr)
+{
+
+ struct kexec_segment *ksegment;
+ struct kexec_buf buf, *kbuf;
+ int ret;
+
+ /* Currently adding a segment this way is allowed only in file mode */
+ if (!image->file_mode)
+ return -EINVAL;
+
+ if (image->nr_segments >= KEXEC_SEGMENT_MAX)
+ return -EINVAL;
+
+ /*
+ * Make sure we are not trying to add a buffer after allocating
+ * control pages. All segments need to be placed before
+ * any control pages are allocated, as the control page allocation
+ * logic goes through the list of segments to make sure there are
+ * no destination overlaps.
+ */
+ if (!list_empty(&image->control_pages)) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ memset(&buf, 0, sizeof(struct kexec_buf));
+ kbuf = &buf;
+ kbuf->image = image;
+ kbuf->buffer = buffer;
+ kbuf->bufsz = bufsz;
+
+ kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
+ kbuf->buf_align = max(buf_align, PAGE_SIZE);
+ kbuf->buf_min = buf_min;
+ kbuf->buf_max = buf_max;
+ kbuf->top_down = top_down;
+
+ /* Walk the RAM ranges and allocate a suitable range for the buffer */
+ if (image->type == KEXEC_TYPE_CRASH)
+ ret = walk_iomem_res("Crash kernel",
+ IORESOURCE_MEM | IORESOURCE_BUSY,
+ crashk_res.start, crashk_res.end, kbuf,
+ locate_mem_hole_callback);
+ else
+ ret = walk_system_ram_res(0, -1, kbuf,
+ locate_mem_hole_callback);
+ if (ret != 1) {
+ /* A suitable memory range could not be found for buffer */
+ return -EADDRNOTAVAIL;
+ }
+
+ /* Found a suitable memory range */
+ ksegment = &image->segment[image->nr_segments];
+ ksegment->kbuf = kbuf->buffer;
+ ksegment->bufsz = kbuf->bufsz;
+ ksegment->mem = kbuf->mem;
+ ksegment->memsz = kbuf->memsz;
+ image->nr_segments++;
+ *load_addr = ksegment->mem;
+ return 0;
+}
+
+/* Calculate and store the digest of segments */
+static int kexec_calculate_store_digests(struct kimage *image)
+{
+ struct crypto_shash *tfm;
+ struct shash_desc *desc;
+ int ret = 0, i, j, zero_buf_sz, sha_region_sz;
+ size_t desc_size, nullsz;
+ char *digest;
+ void *zero_buf;
+ struct kexec_sha_region *sha_regions;
+ struct purgatory_info *pi = &image->purgatory_info;
+
+ zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
+ zero_buf_sz = PAGE_SIZE;
+
+ tfm = crypto_alloc_shash("sha256", 0, 0);
+ if (IS_ERR(tfm)) {
+ ret = PTR_ERR(tfm);
+ goto out;
+ }
+
+ desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
+ desc = kzalloc(desc_size, GFP_KERNEL);
+ if (!desc) {
+ ret = -ENOMEM;
+ goto out_free_tfm;
+ }
+
+ sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
+ sha_regions = vzalloc(sha_region_sz);
+ if (!sha_regions)
+ goto out_free_desc;
+
+ desc->tfm = tfm;
+ desc->flags = 0;
+
+ ret = crypto_shash_init(desc);
+ if (ret < 0)
+ goto out_free_sha_regions;
+
+ digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
+ if (!digest) {
+ ret = -ENOMEM;
+ goto out_free_sha_regions;
+ }
+
+ for (j = i = 0; i < image->nr_segments; i++) {
+ struct kexec_segment *ksegment;
+
+ ksegment = &image->segment[i];
+ /*
+ * Skip purgatory as it will be modified once we put digest
+ * info in purgatory.
+ */
+ if (ksegment->kbuf == pi->purgatory_buf)
+ continue;
+
+ ret = crypto_shash_update(desc, ksegment->kbuf,
+ ksegment->bufsz);
+ if (ret)
+ break;
+
+ /*
+ * Assume the rest of the buffer is filled with zeroes and
+ * update digest accordingly.
+ */
+ nullsz = ksegment->memsz - ksegment->bufsz;
+ while (nullsz) {
+ unsigned long bytes = nullsz;
+
+ if (bytes > zero_buf_sz)
+ bytes = zero_buf_sz;
+ ret = crypto_shash_update(desc, zero_buf, bytes);
+ if (ret)
+ break;
+ nullsz -= bytes;
+ }
+
+ if (ret)
+ break;
+
+ sha_regions[j].start = ksegment->mem;
+ sha_regions[j].len = ksegment->memsz;
+ j++;
+ }
+
+ if (!ret) {
+ ret = crypto_shash_final(desc, digest);
+ if (ret)
+ goto out_free_digest;
+ ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
+ sha_regions, sha_region_sz, 0);
+ if (ret)
+ goto out_free_digest;
+
+ ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
+ digest, SHA256_DIGEST_SIZE, 0);
+ if (ret)
+ goto out_free_digest;
+ }
+
+out_free_digest:
+ kfree(digest);
+out_free_sha_regions:
+ vfree(sha_regions);
+out_free_desc:
+ kfree(desc);
+out_free_tfm:
+ kfree(tfm);
+out:
+ return ret;
+}
+
+/* Actually load purgatory. A lot of code is taken from kexec-tools */
+static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
+ unsigned long max, int top_down)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+ unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
+ unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
+ unsigned char *buf_addr, *src;
+ int i, ret = 0, entry_sidx = -1;
+ const Elf_Shdr *sechdrs_c;
+ Elf_Shdr *sechdrs = NULL;
+ void *purgatory_buf = NULL;
+
+ /*
+	 * sechdrs_c points to the section headers in purgatory and is
+	 * read-only. No modifications allowed.
+ */
+ sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
+
+ /*
+	 * We cannot modify sechdrs_c[] or its fields; it is read-only.
+ * Copy it over to a local copy where one can store some temporary
+ * data and free it at the end. We need to modify ->sh_addr and
+ * ->sh_offset fields to keep track of permanent and temporary
+ * locations of sections.
+ */
+ sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+ if (!sechdrs)
+ return -ENOMEM;
+
+ memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+
+ /*
+	 * We end up with multiple copies of the sections. The first copy is
+	 * the one embedded in the kernel, in a read-only section. Some of
+	 * these sections will be copied to a temporary buffer and relocated,
+	 * and those sections will finally be copied to their final
+	 * destination at segment load time.
+ *
+ * Use ->sh_offset to reflect section address in memory. It will
+ * point to original read only copy if section is not allocatable.
+ * Otherwise it will point to temporary copy which will be relocated.
+ *
+ * Use ->sh_addr to contain final address of the section where it
+ * will go during execution time.
+ */
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ if (sechdrs[i].sh_type == SHT_NOBITS)
+ continue;
+
+ sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
+ sechdrs[i].sh_offset;
+ }
+
+ /*
+ * Identify entry point section and make entry relative to section
+ * start.
+ */
+ entry = pi->ehdr->e_entry;
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ continue;
+
+ if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
+ continue;
+
+ /* Make entry section relative */
+ if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
+ ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
+ pi->ehdr->e_entry)) {
+ entry_sidx = i;
+ entry -= sechdrs[i].sh_addr;
+ break;
+ }
+ }
+
+ /* Determine how much memory is needed to load relocatable object. */
+ buf_align = 1;
+ bss_align = 1;
+ buf_sz = 0;
+ bss_sz = 0;
+
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ continue;
+
+ align = sechdrs[i].sh_addralign;
+ if (sechdrs[i].sh_type != SHT_NOBITS) {
+ if (buf_align < align)
+ buf_align = align;
+ buf_sz = ALIGN(buf_sz, align);
+ buf_sz += sechdrs[i].sh_size;
+ } else {
+ /* bss section */
+ if (bss_align < align)
+ bss_align = align;
+ bss_sz = ALIGN(bss_sz, align);
+ bss_sz += sechdrs[i].sh_size;
+ }
+ }
+
+ /* Determine the bss padding required to align bss properly */
+ bss_pad = 0;
+ if (buf_sz & (bss_align - 1))
+ bss_pad = bss_align - (buf_sz & (bss_align - 1));
+
+ memsz = buf_sz + bss_pad + bss_sz;
+
+ /* Allocate buffer for purgatory */
+ purgatory_buf = vzalloc(buf_sz);
+ if (!purgatory_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (buf_align < bss_align)
+ buf_align = bss_align;
+
+ /* Add buffer to segment list */
+ ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
+ buf_align, min, max, top_down,
+ &pi->purgatory_load_addr);
+ if (ret)
+ goto out;
+
+ /* Load SHF_ALLOC sections */
+ buf_addr = purgatory_buf;
+ load_addr = curr_load_addr = pi->purgatory_load_addr;
+ bss_addr = load_addr + buf_sz + bss_pad;
+
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ continue;
+
+ align = sechdrs[i].sh_addralign;
+ if (sechdrs[i].sh_type != SHT_NOBITS) {
+ curr_load_addr = ALIGN(curr_load_addr, align);
+ offset = curr_load_addr - load_addr;
+			/* We already modified ->sh_offset to keep the src addr */
+ src = (char *) sechdrs[i].sh_offset;
+ memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
+
+ /* Store load address and source address of section */
+ sechdrs[i].sh_addr = curr_load_addr;
+
+ /*
+ * This section got copied to temporary buffer. Update
+ * ->sh_offset accordingly.
+ */
+ sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
+
+ /* Advance to the next address */
+ curr_load_addr += sechdrs[i].sh_size;
+ } else {
+ bss_addr = ALIGN(bss_addr, align);
+ sechdrs[i].sh_addr = bss_addr;
+ bss_addr += sechdrs[i].sh_size;
+ }
+ }
+
+ /* Update entry point based on load address of text section */
+ if (entry_sidx >= 0)
+ entry += sechdrs[entry_sidx].sh_addr;
+
+ /* Make kernel jump to purgatory after shutdown */
+ image->start = entry;
+
+ /* Used later to get/set symbol values */
+ pi->sechdrs = sechdrs;
+
+ /*
+ * Used later to identify which section is purgatory and skip it
+ * from checksumming.
+ */
+ pi->purgatory_buf = purgatory_buf;
+ return ret;
+out:
+ vfree(sechdrs);
+ vfree(purgatory_buf);
+ return ret;
+}
+
+static int kexec_apply_relocations(struct kimage *image)
+{
+ int i, ret;
+ struct purgatory_info *pi = &image->purgatory_info;
+ Elf_Shdr *sechdrs = pi->sechdrs;
+
+ /* Apply relocations */
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ Elf_Shdr *section, *symtab;
+
+ if (sechdrs[i].sh_type != SHT_RELA &&
+ sechdrs[i].sh_type != SHT_REL)
+ continue;
+
+ /*
+		 * For a section of type SHT_RELA/SHT_REL, ->sh_link contains
+		 * the section header index of the associated symbol table,
+		 * and ->sh_info contains the section header index of the
+		 * section to which the relocations apply.
+ */
+ if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
+ sechdrs[i].sh_link >= pi->ehdr->e_shnum)
+ return -ENOEXEC;
+
+ section = &sechdrs[sechdrs[i].sh_info];
+ symtab = &sechdrs[sechdrs[i].sh_link];
+
+ if (!(section->sh_flags & SHF_ALLOC))
+ continue;
+
+ /*
+		 * symtab->sh_link contains the section header index of the
+		 * associated string table.
+ */
+ if (symtab->sh_link >= pi->ehdr->e_shnum)
+ /* Invalid section number? */
+ continue;
+
+ /*
+		 * The respective architecture needs to provide support for applying
+ * relocations of type SHT_RELA/SHT_REL.
+ */
+ if (sechdrs[i].sh_type == SHT_RELA)
+ ret = arch_kexec_apply_relocations_add(pi->ehdr,
+ sechdrs, i);
+ else if (sechdrs[i].sh_type == SHT_REL)
+ ret = arch_kexec_apply_relocations(pi->ehdr,
+ sechdrs, i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/* Load relocatable purgatory object and relocate it appropriately */
+int kexec_load_purgatory(struct kimage *image, unsigned long min,
+ unsigned long max, int top_down,
+ unsigned long *load_addr)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+ int ret;
+
+ if (kexec_purgatory_size <= 0)
+ return -EINVAL;
+
+ if (kexec_purgatory_size < sizeof(Elf_Ehdr))
+ return -ENOEXEC;
+
+ pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
+
+ if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
+ || pi->ehdr->e_type != ET_REL
+ || !elf_check_arch(pi->ehdr)
+ || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
+ return -ENOEXEC;
+
+ if (pi->ehdr->e_shoff >= kexec_purgatory_size
+ || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
+ kexec_purgatory_size - pi->ehdr->e_shoff))
+ return -ENOEXEC;
+
+ ret = __kexec_load_purgatory(image, min, max, top_down);
+ if (ret)
+ return ret;
+
+ ret = kexec_apply_relocations(image);
+ if (ret)
+ goto out;
+
+ *load_addr = pi->purgatory_load_addr;
+ return 0;
+out:
+ vfree(pi->sechdrs);
+ vfree(pi->purgatory_buf);
+ return ret;
+}
+
+static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
+ const char *name)
+{
+ Elf_Sym *syms;
+ Elf_Shdr *sechdrs;
+ Elf_Ehdr *ehdr;
+ int i, k;
+ const char *strtab;
+
+ if (!pi->sechdrs || !pi->ehdr)
+ return NULL;
+
+ sechdrs = pi->sechdrs;
+ ehdr = pi->ehdr;
+
+ for (i = 0; i < ehdr->e_shnum; i++) {
+ if (sechdrs[i].sh_type != SHT_SYMTAB)
+ continue;
+
+ if (sechdrs[i].sh_link >= ehdr->e_shnum)
+ /* Invalid strtab section number */
+ continue;
+ strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
+ syms = (Elf_Sym *)sechdrs[i].sh_offset;
+
+ /* Go through symbols for a match */
+ for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
+ if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
+ continue;
+
+ if (strcmp(strtab + syms[k].st_name, name) != 0)
+ continue;
+
+ if (syms[k].st_shndx == SHN_UNDEF ||
+ syms[k].st_shndx >= ehdr->e_shnum) {
+ pr_debug("Symbol: %s has bad section index %d.\n",
+ name, syms[k].st_shndx);
+ return NULL;
+ }
+
+ /* Found the symbol we are looking for */
+ return &syms[k];
+ }
+ }
+
+ return NULL;
+}
+
+void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+ Elf_Sym *sym;
+ Elf_Shdr *sechdr;
+
+ sym = kexec_purgatory_find_symbol(pi, name);
+ if (!sym)
+ return ERR_PTR(-EINVAL);
+
+ sechdr = &pi->sechdrs[sym->st_shndx];
+
+ /*
+ * Returns the address where symbol will finally be loaded after
+ * kexec_load_segment()
+ */
+ return (void *)(sechdr->sh_addr + sym->st_value);
+}
+
+/*
+ * Get or set the value of a symbol. If "get_value" is true, the symbol
+ * value is returned in buf; otherwise the symbol value is set from the
+ * value in buf.
+ */
+int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
+ void *buf, unsigned int size, bool get_value)
+{
+ Elf_Sym *sym;
+ Elf_Shdr *sechdrs;
+ struct purgatory_info *pi = &image->purgatory_info;
+ char *sym_buf;
+
+ sym = kexec_purgatory_find_symbol(pi, name);
+ if (!sym)
+ return -EINVAL;
+
+ if (sym->st_size != size) {
+ pr_err("symbol %s size mismatch: expected %lu actual %u\n",
+ name, (unsigned long)sym->st_size, size);
+ return -EINVAL;
+ }
+
+ sechdrs = pi->sechdrs;
+
+ if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
+ pr_err("symbol %s is in a bss section. Cannot %s\n", name,
+ get_value ? "get" : "set");
+ return -EINVAL;
+ }
+
+ sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset +
+ sym->st_value;
+
+ if (get_value)
+ memcpy((void *)buf, sym_buf, size);
+ else
+ memcpy((void *)sym_buf, buf, size);
+
+ return 0;
+}
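
For context on how the pieces above fit together, here is a minimal sketch of how an architecture's kexec_file loader might consume these helpers; the min/max window, the top_down choice and the "backup_src_start" symbol name are illustrative assumptions, not part of this patch:

#include <linux/kexec.h>
#include <linux/errno.h>

/* Illustrative sketch only: place purgatory and patch one of its globals */
static int example_setup_purgatory(struct kimage *image,
				   unsigned long min, unsigned long max)
{
	unsigned long purgatory_load_addr;
	unsigned long backup_src_start = 0;	/* hypothetical symbol value */
	int ret;

	/* Locate, copy and relocate the purgatory object */
	ret = kexec_load_purgatory(image, min, max, /* top_down */ 1,
				   &purgatory_load_addr);
	if (ret)
		return ret;

	/* Write a value into a global variable defined by the purgatory */
	return kexec_purgatory_get_set_symbol(image, "backup_src_start",
					      &backup_src_start,
					      sizeof(backup_src_start),
					      false /* set, don't get */);
}
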
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
new file mode 100644
index 000000000000..e4392a698ad4
--- /dev/null
+++ b/kernel/kexec_internal.h
@@ -0,0 +1,22 @@
+#ifndef LINUX_KEXEC_INTERNAL_H
+#define LINUX_KEXEC_INTERNAL_H
+
+#include <linux/kexec.h>
+
+struct kimage *do_kimage_alloc_init(void);
+int sanity_check_segment_list(struct kimage *image);
+void kimage_free_page_list(struct list_head *list);
+void kimage_free(struct kimage *image);
+int kimage_load_segment(struct kimage *image, struct kexec_segment *segment);
+void kimage_terminate(struct kimage *image);
+int kimage_is_destination_range(struct kimage *image,
+ unsigned long start, unsigned long end);
+
+extern struct mutex kexec_mutex;
+
+#ifdef CONFIG_KEXEC_FILE
+void kimage_file_post_load_cleanup(struct kimage *image);
+#else /* CONFIG_KEXEC_FILE */
+static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
+#endif /* CONFIG_KEXEC_FILE */
+#endif /* LINUX_KEXEC_INTERNAL_H */
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 2777f40a9c7b..da98d0593de2 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -45,8 +45,6 @@
extern int max_threads;
-static struct workqueue_struct *khelper_wq;
-
#define CAP_BSET (void *)1
#define CAP_PI (void *)2
@@ -114,10 +112,11 @@ out:
* @...: arguments as specified in the format string
*
* Load a module using the user mode module loader. The function returns
- * zero on success or a negative errno code on failure. Note that a
- * successful module load does not mean the module did not then unload
- * and exit on an error of its own. Callers must check that the service
- * they requested is now available not blindly invoke it.
+ * zero on success or a negative errno code or positive exit code from
+ * "modprobe" on failure. Note that a successful module load does not mean
+ * the module did not then unload and exit on an error of its own. Callers
+ * must check that the service they requested is now available, not blindly
+ * invoke it.
*
* If module auto-loading support is disabled then this function
* becomes a no-operation.
@@ -213,7 +212,7 @@ static void umh_complete(struct subprocess_info *sub_info)
/*
* This is the task which runs the usermode application
*/
-static int ____call_usermodehelper(void *data)
+static int call_usermodehelper_exec_async(void *data)
{
struct subprocess_info *sub_info = data;
struct cred *new;
@@ -223,12 +222,9 @@ static int ____call_usermodehelper(void *data)
flush_signal_handlers(current, 1);
spin_unlock_irq(&current->sighand->siglock);
- /* We can run anywhere, unlike our parent keventd(). */
- set_cpus_allowed_ptr(current, cpu_all_mask);
-
/*
- * Our parent is keventd, which runs with elevated scheduling priority.
- * Avoid propagating that into the userspace child.
+ * Our parent (unbound workqueue) runs with elevated scheduling
+ * priority. Avoid propagating that into the userspace child.
*/
set_user_nice(current, 0);
@@ -258,7 +254,10 @@ static int ____call_usermodehelper(void *data)
(const char __user *const __user *)sub_info->envp);
out:
sub_info->retval = retval;
- /* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */
+ /*
+ * call_usermodehelper_exec_sync() will call umh_complete
+	 * if UMH_WAIT_PROC.
+ */
if (!(sub_info->wait & UMH_WAIT_PROC))
umh_complete(sub_info);
if (!retval)
@@ -266,15 +265,14 @@ out:
do_exit(0);
}
-/* Keventd can't block, but this (a child) can. */
-static int wait_for_helper(void *data)
+/* Handles UMH_WAIT_PROC. */
+static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
{
- struct subprocess_info *sub_info = data;
pid_t pid;
/* If SIGCLD is ignored sys_wait4 won't populate the status. */
kernel_sigaction(SIGCHLD, SIG_DFL);
- pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
+ pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
if (pid < 0) {
sub_info->retval = pid;
} else {
@@ -282,44 +280,60 @@ static int wait_for_helper(void *data)
/*
* Normally it is bogus to call wait4() from in-kernel because
* wait4() wants to write the exit code to a userspace address.
- * But wait_for_helper() always runs as keventd, and put_user()
- * to a kernel address works OK for kernel threads, due to their
- * having an mm_segment_t which spans the entire address space.
+		 * But call_usermodehelper_exec_sync() always runs as a kernel
+ * thread (workqueue) and put_user() to a kernel address works
+ * OK for kernel threads, due to their having an mm_segment_t
+ * which spans the entire address space.
*
* Thus the __user pointer cast is valid here.
*/
sys_wait4(pid, (int __user *)&ret, 0, NULL);
/*
- * If ret is 0, either ____call_usermodehelper failed and the
- * real error code is already in sub_info->retval or
+ * If ret is 0, either call_usermodehelper_exec_async failed and
+ * the real error code is already in sub_info->retval or
* sub_info->retval is 0 anyway, so don't mess with it then.
*/
if (ret)
sub_info->retval = ret;
}
+ /* Restore default kernel sig handler */
+ kernel_sigaction(SIGCHLD, SIG_IGN);
+
umh_complete(sub_info);
- do_exit(0);
}
-/* This is run by khelper thread */
-static void __call_usermodehelper(struct work_struct *work)
+/*
+ * We need to create the usermodehelper kernel thread from a task that is
+ * affine to an optimized set of CPUs (or nohz housekeeping ones) so that
+ * it inherits the widest possible affinity, irrespective of
+ * call_usermodehelper() callers with possibly reduced affinity (eg:
+ * per-cpu workqueues). We don't want usermodehelper targets to contend
+ * for a busy CPU.
+ *
+ * Unbound workqueues provide such wide affinity and allow blocking on
+ * UMH_WAIT_PROC requests without blocking pending requests (up to some
+ * limit).
+ *
+ * Besides, workqueues provide the privilege level that the caller might
+ * not have to perform the usermodehelper request.
+ */
+static void call_usermodehelper_exec_work(struct work_struct *work)
{
struct subprocess_info *sub_info =
container_of(work, struct subprocess_info, work);
- pid_t pid;
- if (sub_info->wait & UMH_WAIT_PROC)
- pid = kernel_thread(wait_for_helper, sub_info,
- CLONE_FS | CLONE_FILES | SIGCHLD);
- else
- pid = kernel_thread(____call_usermodehelper, sub_info,
- SIGCHLD);
+ if (sub_info->wait & UMH_WAIT_PROC) {
+ call_usermodehelper_exec_sync(sub_info);
+ } else {
+ pid_t pid;
- if (pid < 0) {
- sub_info->retval = pid;
- umh_complete(sub_info);
+ pid = kernel_thread(call_usermodehelper_exec_async, sub_info,
+ SIGCHLD);
+ if (pid < 0) {
+ sub_info->retval = pid;
+ umh_complete(sub_info);
+ }
}
}
@@ -509,7 +523,7 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
if (!sub_info)
goto out;
- INIT_WORK(&sub_info->work, __call_usermodehelper);
+ INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
sub_info->path = path;
sub_info->argv = argv;
sub_info->envp = envp;
@@ -531,8 +545,8 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
* from interrupt context.
*
* Runs a user-space application. The application is started
- * asynchronously if wait is not set, and runs as a child of keventd.
- * (ie. it runs with full root capabilities).
+ * asynchronously if wait is not set, and runs as a child of system workqueues.
+ * (ie. it runs with full root capabilities and optimized affinity).
*/
int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
{
@@ -544,7 +558,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
return -EINVAL;
}
helper_lock();
- if (!khelper_wq || usermodehelper_disabled) {
+ if (usermodehelper_disabled) {
retval = -EBUSY;
goto out;
}
@@ -556,7 +570,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done;
sub_info->wait = wait;
- queue_work(khelper_wq, &sub_info->work);
+ queue_work(system_unbound_wq, &sub_info->work);
if (wait == UMH_NO_WAIT) /* task has freed sub_info */
goto unlock;
@@ -686,9 +700,3 @@ struct ctl_table usermodehelper_table[] = {
},
{ }
};
-
-void __init usermodehelper_init(void)
-{
- khelper_wq = create_singlethread_workqueue("khelper");
- BUG_ON(!khelper_wq);
-}
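
The caller-facing usermodehelper API is unchanged by the khelper removal; requests are simply queued on system_unbound_wq now. A minimal sketch of a synchronous invocation (the helper path and arguments are made up for illustration):

#include <linux/kmod.h>

/* Illustrative only: run a helper and wait for it to exit */
static int example_run_helper(void)
{
	char *argv[] = { "/sbin/example-helper", "--oneshot", NULL };
	char *envp[] = { "HOME=/", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };

	/*
	 * With UMH_WAIT_PROC the work item on system_unbound_wq goes through
	 * call_usermodehelper_exec_sync(), which forks the helper and reaps
	 * it, so the return value reflects the helper's exit status as well.
	 */
	return call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
}
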
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 6683ccef9fff..e83b26464061 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -90,7 +90,7 @@ static ssize_t profiling_store(struct kobject *kobj,
KERNEL_ATTR_RW(profiling);
#endif
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
static ssize_t kexec_loaded_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -134,7 +134,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
}
KERNEL_ATTR_RO(vmcoreinfo);
-#endif /* CONFIG_KEXEC */
+#endif /* CONFIG_KEXEC_CORE */
/* whether file capabilities are enabled */
static ssize_t fscaps_show(struct kobject *kobj,
@@ -196,7 +196,7 @@ static struct attribute * kernel_attrs[] = {
#ifdef CONFIG_PROFILING
&profiling_attr.attr,
#endif
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
&kexec_loaded_attr.attr,
&kexec_crash_loaded_attr.attr,
&kexec_crash_size_attr.attr,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 490924cc9e7c..9ff173dca1ae 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -248,15 +248,16 @@ static void create_kthread(struct kthread_create_info *create)
* kthread_create_on_node - create a kthread.
* @threadfn: the function to run until signal_pending(current).
* @data: data ptr for @threadfn.
- * @node: memory node number.
+ * @node: task and thread structures for the thread are allocated on this node
* @namefmt: printf-style name for the thread.
*
* Description: This helper function creates and names a kernel
* thread. The thread will be stopped: use wake_up_process() to start
- * it. See also kthread_run().
+ * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and
+ * is affine to all CPUs.
*
* If thread is going to be bound on a particular cpu, give its node
- * in @node, to get NUMA affinity for kthread stack, or else give -1.
+ * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE.
* When woken, the thread will run @threadfn() with @data as its
* argument. @threadfn() can either call do_exit() directly if it is a
* standalone thread for which no one will call kthread_stop(), or
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index cf8c24203368..8f0324ef72ab 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -835,7 +835,7 @@ const struct file_operations kmsg_fops = {
.release = devkmsg_release,
};
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
/*
* This appends the listed symbols to /proc/vmcore
*
diff --git a/kernel/reboot.c b/kernel/reboot.c
index d20c85d9f8c0..bd30a973fe94 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -346,7 +346,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
kernel_restart(buffer);
break;
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
case LINUX_REBOOT_CMD_KEXEC:
ret = kernel_kexec();
break;
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 052e02672d12..272d9322bc5d 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -106,9 +106,10 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
}
EXPORT_SYMBOL_GPL(__wake_up_locked);
-void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr,
+ void *key)
{
- __wake_up_common(q, mode, 1, 0, key);
+ __wake_up_common(q, mode, nr, 0, key);
}
EXPORT_SYMBOL_GPL(__wake_up_locked_key);
@@ -283,7 +284,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
if (!list_empty(&wait->task_list))
list_del_init(&wait->task_list);
else if (waitqueue_active(q))
- __wake_up_locked_key(q, mode, key);
+ __wake_up_locked_key(q, mode, 1, key);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(abort_exclusive_wait);
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 7c434c39f02a..a818cbc73e14 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -113,7 +113,8 @@ static int smpboot_thread_fn(void *data)
if (kthread_should_stop()) {
__set_current_state(TASK_RUNNING);
preempt_enable();
- if (ht->cleanup)
+ /* cleanup must mirror setup */
+ if (ht->cleanup && td->status != HP_THREAD_NONE)
ht->cleanup(td->cpu, cpu_online(td->cpu));
kfree(td);
return 0;
@@ -259,15 +260,6 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
{
unsigned int cpu;
- /* Unpark any threads that were voluntarily parked. */
- for_each_cpu_not(cpu, ht->cpumask) {
- if (cpu_online(cpu)) {
- struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
- if (tsk)
- kthread_unpark(tsk);
- }
- }
-
/* We need to destroy also the parked threads of offline cpus */
for_each_possible_cpu(cpu) {
struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
@@ -281,19 +273,22 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
}
/**
- * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug
+ * smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related
+ * to hotplug
* @plug_thread: Hotplug thread descriptor
+ * @cpumask: The cpumask where threads run
*
* Creates and starts the threads on all online cpus.
*/
-int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
+int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
+ const struct cpumask *cpumask)
{
unsigned int cpu;
int ret = 0;
if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL))
return -ENOMEM;
- cpumask_copy(plug_thread->cpumask, cpu_possible_mask);
+ cpumask_copy(plug_thread->cpumask, cpumask);
get_online_cpus();
mutex_lock(&smpboot_threads_lock);
@@ -301,9 +296,11 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
ret = __smpboot_create_thread(plug_thread, cpu);
if (ret) {
smpboot_destroy_threads(plug_thread);
+ free_cpumask_var(plug_thread->cpumask);
goto out;
}
- smpboot_unpark_thread(plug_thread, cpu);
+ if (cpumask_test_cpu(cpu, cpumask))
+ smpboot_unpark_thread(plug_thread, cpu);
}
list_add(&plug_thread->list, &hotplug_threads);
out:
@@ -311,7 +308,7 @@ out:
put_online_cpus();
return ret;
}
-EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
+EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask);
/**
* smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
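
A minimal sketch of the new cpumask-aware registration from a client's point of view; the callbacks and thread name are hypothetical, and the real in-tree user is the watchdog conversion further down:

#include <linux/smpboot.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/init.h>

/* Illustrative only: per-cpu kthreads restricted to a caller-chosen mask */
static DEFINE_PER_CPU(struct task_struct *, example_task);

static int example_should_run(unsigned int cpu)
{
	return 0;	/* nothing to do in this sketch */
}

static void example_fn(unsigned int cpu)
{
}

static struct smp_hotplug_thread example_threads = {
	.store			= &example_task,
	.thread_should_run	= example_should_run,
	.thread_fn		= example_fn,
	.thread_comm		= "example/%u",
};

static int __init example_init(void)
{
	/* Threads are created on all online CPUs but only unparked on the mask */
	return smpboot_register_percpu_thread_cpumask(&example_threads,
						      cpu_online_mask);
}
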
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index ca7d84f438f1..8de5b2645796 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -194,6 +194,7 @@ cond_syscall(sys_mlock);
cond_syscall(sys_munlock);
cond_syscall(sys_mlockall);
cond_syscall(sys_munlockall);
+cond_syscall(sys_mlock2);
cond_syscall(sys_mincore);
cond_syscall(sys_madvise);
cond_syscall(sys_mremap);
@@ -219,6 +220,7 @@ cond_syscall(compat_sys_timerfd_gettime);
cond_syscall(sys_eventfd);
cond_syscall(sys_eventfd2);
cond_syscall(sys_memfd_create);
+cond_syscall(sys_userfaultfd);
/* performance counters: */
cond_syscall(sys_perf_event_open);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 19b62b522158..e69201d8094e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -621,7 +621,7 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
{
.procname = "kexec_load_disabled",
.data = &kexec_load_disabled,
@@ -1995,7 +1995,7 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
int val = *valp;
if (val < 0) {
*negp = true;
- *lvalp = (unsigned long)-val;
+ *lvalp = -(unsigned long)val;
} else {
*negp = false;
*lvalp = (unsigned long)val;
@@ -2201,7 +2201,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
int val = *valp;
if (val < 0) {
*negp = true;
- *lvalp = (unsigned long)-val;
+ *lvalp = -(unsigned long)val;
} else {
*negp = false;
*lvalp = (unsigned long)val;
@@ -2436,7 +2436,7 @@ static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
unsigned long lval;
if (val < 0) {
*negp = true;
- lval = (unsigned long)-val;
+ lval = -(unsigned long)val;
} else {
*negp = false;
lval = (unsigned long)val;
@@ -2459,7 +2459,7 @@ static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp
unsigned long lval;
if (val < 0) {
*negp = true;
- lval = (unsigned long)-val;
+ lval = -(unsigned long)val;
} else {
*negp = false;
lval = (unsigned long)val;
@@ -2484,7 +2484,7 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
unsigned long lval;
if (val < 0) {
*negp = true;
- lval = (unsigned long)-val;
+ lval = -(unsigned long)val;
} else {
*negp = false;
lval = (unsigned long)val;
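
The repeated `-(unsigned long)val` change is about signed overflow: for val == INT_MIN, negating the int first is undefined behaviour, while converting to unsigned long first and then negating is well defined and yields the magnitude the converters want to store. A small standalone illustration (plain userspace C, just to show the arithmetic):

#include <limits.h>
#include <stdio.h>

int main(void)
{
	int val = INT_MIN;

	/*
	 * val is converted to unsigned long first (well defined, modulo
	 * 2^N), then negated in unsigned arithmetic.  The old form,
	 * (unsigned long)-val, negated INT_MIN as a signed int first,
	 * which overflows and is undefined behaviour.
	 */
	unsigned long lval = -(unsigned long)val;

	printf("%lu\n", lval);	/* 2147483648 on LP64 targets */
	return 0;
}
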
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index f65a0a06a8c0..88fefa68c516 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -39,6 +39,7 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
cred->cap_inheritable = CAP_EMPTY_SET;
cred->cap_permitted = CAP_FULL_SET;
cred->cap_effective = CAP_FULL_SET;
+ cred->cap_ambient = CAP_EMPTY_SET;
cred->cap_bset = CAP_FULL_SET;
#ifdef CONFIG_KEYS
key_put(cred->request_key_auth);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index a6ffa43f2993..64ed1c37bd1f 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -24,6 +24,7 @@
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
#include <linux/perf_event.h>
+#include <linux/kthread.h>
/*
* The run state of the lockup detectors is controlled by the content of the
@@ -66,7 +67,26 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
#define for_each_watchdog_cpu(cpu) \
for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
+/*
+ * The 'watchdog_running' variable is set to 1 when the watchdog threads
+ * are registered/started and is set to 0 when the watchdog threads are
+ * unregistered/stopped, so it indicates whether the threads exist.
+ */
static int __read_mostly watchdog_running;
+/*
+ * If a subsystem needs to deactivate the watchdog temporarily, it
+ * can use the suspend/resume interface to achieve this. The content of
+ * the 'watchdog_suspended' variable reflects this state. Existing threads
+ * are parked/unparked by the lockup_detector_{suspend|resume} functions
+ * (see comment blocks pertaining to those functions for further details).
+ *
+ * 'watchdog_suspended' also prevents threads from being registered/started
+ * or unregistered/stopped via parameters in /proc/sys/kernel, so the state
+ * of 'watchdog_running' cannot change while the watchdog is deactivated
+ * temporarily (see related code in 'proc' handlers).
+ */
+static int __read_mostly watchdog_suspended;
+
static u64 __read_mostly sample_period;
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -613,46 +633,9 @@ static void watchdog_nmi_disable(unsigned int cpu)
}
}
-void watchdog_nmi_enable_all(void)
-{
- int cpu;
-
- mutex_lock(&watchdog_proc_mutex);
-
- if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
- goto unlock;
-
- get_online_cpus();
- for_each_watchdog_cpu(cpu)
- watchdog_nmi_enable(cpu);
- put_online_cpus();
-
-unlock:
- mutex_unlock(&watchdog_proc_mutex);
-}
-
-void watchdog_nmi_disable_all(void)
-{
- int cpu;
-
- mutex_lock(&watchdog_proc_mutex);
-
- if (!watchdog_running)
- goto unlock;
-
- get_online_cpus();
- for_each_watchdog_cpu(cpu)
- watchdog_nmi_disable(cpu);
- put_online_cpus();
-
-unlock:
- mutex_unlock(&watchdog_proc_mutex);
-}
#else
static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
static void watchdog_nmi_disable(unsigned int cpu) { return; }
-void watchdog_nmi_enable_all(void) {}
-void watchdog_nmi_disable_all(void) {}
#endif /* CONFIG_HARDLOCKUP_DETECTOR */
static struct smp_hotplug_thread watchdog_threads = {
@@ -666,46 +649,89 @@ static struct smp_hotplug_thread watchdog_threads = {
.unpark = watchdog_enable,
};
-static void restart_watchdog_hrtimer(void *info)
+/*
+ * park all watchdog threads that are specified in 'watchdog_cpumask'
+ */
+static int watchdog_park_threads(void)
{
- struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
- int ret;
+ int cpu, ret = 0;
+ get_online_cpus();
+ for_each_watchdog_cpu(cpu) {
+ ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
+ if (ret)
+ break;
+ }
+ if (ret) {
+ for_each_watchdog_cpu(cpu)
+ kthread_unpark(per_cpu(softlockup_watchdog, cpu));
+ }
+ put_online_cpus();
+
+ return ret;
+}
+
+/*
+ * unpark all watchdog threads that are specified in 'watchdog_cpumask'
+ */
+static void watchdog_unpark_threads(void)
+{
+ int cpu;
+
+ get_online_cpus();
+ for_each_watchdog_cpu(cpu)
+ kthread_unpark(per_cpu(softlockup_watchdog, cpu));
+ put_online_cpus();
+}
+
+/*
+ * Suspend the hard and soft lockup detector by parking the watchdog threads.
+ */
+int lockup_detector_suspend(void)
+{
+ int ret = 0;
+
+ mutex_lock(&watchdog_proc_mutex);
/*
- * No need to cancel and restart hrtimer if it is currently executing
- * because it will reprogram itself with the new period now.
- * We should never see it unqueued here because we are running per-cpu
- * with interrupts disabled.
+ * Multiple suspend requests can be active in parallel (counted by
+ * the 'watchdog_suspended' variable). If the watchdog threads are
+ * running, the first caller makes sure that they are parked.
+ * The state of 'watchdog_running' cannot change while a suspend
+ * request is active (see related code in 'proc' handlers).
*/
- ret = hrtimer_try_to_cancel(hrtimer);
- if (ret == 1)
- hrtimer_start(hrtimer, ns_to_ktime(sample_period),
- HRTIMER_MODE_REL_PINNED);
+ if (watchdog_running && !watchdog_suspended)
+ ret = watchdog_park_threads();
+
+ if (ret == 0)
+ watchdog_suspended++;
+
+ mutex_unlock(&watchdog_proc_mutex);
+
+ return ret;
}
-static void update_watchdog(int cpu)
+/*
+ * Resume the hard and soft lockup detector by unparking the watchdog threads.
+ */
+void lockup_detector_resume(void)
{
+ mutex_lock(&watchdog_proc_mutex);
+
+ watchdog_suspended--;
/*
- * Make sure that perf event counter will adopt to a new
- * sampling period. Updating the sampling period directly would
- * be much nicer but we do not have an API for that now so
- * let's use a big hammer.
- * Hrtimer will adopt the new period on the next tick but this
- * might be late already so we have to restart the timer as well.
+ * The watchdog threads are unparked if they were previously running
+ * and there is no longer an active suspend request.
*/
- watchdog_nmi_disable(cpu);
- smp_call_function_single(cpu, restart_watchdog_hrtimer, NULL, 1);
- watchdog_nmi_enable(cpu);
+ if (watchdog_running && !watchdog_suspended)
+ watchdog_unpark_threads();
+
+ mutex_unlock(&watchdog_proc_mutex);
}
static void update_watchdog_all_cpus(void)
{
- int cpu;
-
- get_online_cpus();
- for_each_watchdog_cpu(cpu)
- update_watchdog(cpu);
- put_online_cpus();
+ watchdog_park_threads();
+ watchdog_unpark_threads();
}
static int watchdog_enable_all_cpus(void)
@@ -713,15 +739,12 @@ static int watchdog_enable_all_cpus(void)
int err = 0;
if (!watchdog_running) {
- err = smpboot_register_percpu_thread(&watchdog_threads);
+ err = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
+ &watchdog_cpumask);
if (err)
pr_err("Failed to create watchdog threads, disabled\n");
- else {
- if (smpboot_update_cpumask_percpu_thread(
- &watchdog_threads, &watchdog_cpumask))
- pr_err("Failed to set cpumask for watchdog threads\n");
+ else
watchdog_running = 1;
- }
} else {
/*
* Enable/disable the lockup detectors or
@@ -787,6 +810,12 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
mutex_lock(&watchdog_proc_mutex);
+ if (watchdog_suspended) {
+ /* no parameter changes allowed while watchdog is suspended */
+ err = -EAGAIN;
+ goto out;
+ }
+
/*
* If the parameter is being read return the state of the corresponding
* bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the
@@ -872,6 +901,12 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
mutex_lock(&watchdog_proc_mutex);
+ if (watchdog_suspended) {
+ /* no parameter changes allowed while watchdog is suspended */
+ err = -EAGAIN;
+ goto out;
+ }
+
old = ACCESS_ONCE(watchdog_thresh);
err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
@@ -903,6 +938,13 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
int err;
mutex_lock(&watchdog_proc_mutex);
+
+ if (watchdog_suspended) {
+ /* no parameter changes allowed while watchdog is suspended */
+ err = -EAGAIN;
+ goto out;
+ }
+
err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
if (!err && write) {
/* Remove impossible cpus to keep sysctl output cleaner. */
@@ -920,6 +962,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
pr_err("cpumask update failed\n");
}
}
+out:
mutex_unlock(&watchdog_proc_mutex);
return err;
}
@@ -932,10 +975,8 @@ void __init lockup_detector_init(void)
#ifdef CONFIG_NO_HZ_FULL
if (tick_nohz_full_enabled()) {
- if (!cpumask_empty(tick_nohz_full_mask))
- pr_info("Disabling watchdog on nohz_full cores by default\n");
- cpumask_andnot(&watchdog_cpumask, cpu_possible_mask,
- tick_nohz_full_mask);
+ pr_info("Disabling watchdog on nohz_full cores by default\n");
+ cpumask_copy(&watchdog_cpumask, housekeeping_mask);
} else
cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
#else
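
The suspend/resume interface added above is reference counted, so nested users are safe. A minimal sketch of how a subsystem might bracket a watchdog-unfriendly operation, assuming the prototypes sit with the other lockup-detector declarations in <linux/nmi.h>; the firmware call itself is hypothetical:

#include <linux/nmi.h>

/* Illustrative only: keep the lockup detectors quiet across a long call */
static int example_firmware_call(void)
{
	int ret;

	ret = lockup_detector_suspend();	/* parks the watchdog threads */
	if (ret)
		return ret;

	/* ... long, non-preemptible operation that would trip the watchdog ... */

	lockup_detector_resume();		/* unparks them again */
	return 0;
}
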
diff --git a/lib/Kconfig b/lib/Kconfig
index 5b1a994cafe1..f86a69c5c0e5 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -185,6 +185,13 @@ config CRC8
when they need to do cyclic redundancy check according CRC8
algorithm. Module will be called crc8.
+config CRC64_ECMA
+ tristate "CRC64 ECMA function"
+ help
+	  This option provides the CRC64 ECMA function. Drivers may select
+	  this when they need to do a cyclic redundancy check according to
+	  the CRC64 ECMA algorithm.
+
config AUDIT_GENERIC
bool
depends on AUDIT && !AUDIT_ARCH
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 4d1fb075ebb6..f7dd8f1d4075 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1682,6 +1682,9 @@ config TEST_HEXDUMP
config TEST_STRING_HELPERS
tristate "Test functions located in the string_helpers module at runtime"
+config TEST_PARSE_INTEGER
+ tristate "Test parse_integer() function at runtime"
+
config TEST_KSTRTOX
tristate "Test kstrto*() family of functions at runtime"
diff --git a/lib/Makefile b/lib/Makefile
index d105682e0310..51e1d761f0b9 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -32,9 +32,11 @@ obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
obj-y += hexdump.o
obj-$(CONFIG_TEST_HEXDUMP) += test-hexdump.o
obj-y += kstrtox.o
+obj-y += parse-integer.o
obj-$(CONFIG_TEST_BPF) += test_bpf.o
obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o
obj-$(CONFIG_TEST_KASAN) += test_kasan.o
+obj-$(CONFIG_TEST_PARSE_INTEGER) += test-parse-integer.o
obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o
obj-$(CONFIG_TEST_LKM) += test_module.o
obj-$(CONFIG_TEST_RHASHTABLE) += test_rhashtable.o
@@ -81,6 +83,7 @@ obj-$(CONFIG_CRC32) += crc32.o
obj-$(CONFIG_CRC7) += crc7.o
obj-$(CONFIG_LIBCRC32C) += libcrc32c.o
obj-$(CONFIG_CRC8) += crc8.o
+obj-$(CONFIG_CRC64_ECMA) += crc64_ecma.o
obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
obj-$(CONFIG_842_COMPRESS) += 842/
diff --git a/lib/bitmap.c b/lib/bitmap.c
index a578a0189199..814814397cce 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -367,7 +367,8 @@ int __bitmap_parse(const char *buf, unsigned int buflen,
nchunks = nbits = totaldigits = c = 0;
do {
- chunk = ndigits = 0;
+ chunk = 0;
+ ndigits = totaldigits;
/* Get the next chunk of the bitmap */
while (buflen) {
@@ -406,9 +407,9 @@ int __bitmap_parse(const char *buf, unsigned int buflen,
return -EOVERFLOW;
chunk = (chunk << 4) | hex_to_bin(c);
- ndigits++; totaldigits++;
+ totaldigits++;
}
- if (ndigits == 0)
+ if (ndigits == totaldigits)
return -EINVAL;
if (nchunks == 0 && chunk == 0)
continue;
@@ -505,7 +506,7 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
int nmaskbits)
{
unsigned a, b;
- int c, old_c, totaldigits;
+ int c, old_c, totaldigits, ndigits;
const char __user __force *ubuf = (const char __user __force *)buf;
int at_start, in_range;
@@ -515,6 +516,7 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
at_start = 1;
in_range = 0;
a = b = 0;
+ ndigits = totaldigits;
/* Get the next cpu# or a range of cpu#'s */
while (buflen) {
@@ -528,23 +530,27 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
if (isspace(c))
continue;
- /*
- * If the last character was a space and the current
- * character isn't '\0', we've got embedded whitespace.
- * This is a no-no, so throw an error.
- */
- if (totaldigits && c && isspace(old_c))
- return -EINVAL;
-
/* A '\0' or a ',' signal the end of a cpu# or range */
if (c == '\0' || c == ',')
break;
+ /*
+			 * Whitespace between digits is not allowed, but
+			 * leading and trailing whitespace is fine.  When
+			 * old_c is whitespace and totaldigits == ndigits,
+			 * the whitespace is leading.  Trailing whitespace
+			 * never reaches this check, because c would then
+			 * be ',' or '\0' and the check above has already
+			 * broken out of the loop.
+ */
+ if ((totaldigits != ndigits) && isspace(old_c))
+ return -EINVAL;
if (c == '-') {
if (at_start || in_range)
return -EINVAL;
b = 0;
in_range = 1;
+ at_start = 1;
continue;
}
@@ -557,15 +563,18 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
at_start = 0;
totaldigits++;
}
+ if (ndigits == totaldigits)
+ continue;
+		/* if no digit follows '-', the list is malformed */
+ if (at_start && in_range)
+ return -EINVAL;
if (!(a <= b))
return -EINVAL;
if (b >= nmaskbits)
return -ERANGE;
- if (!at_start) {
- while (a <= b) {
- set_bit(a, maskp);
- a++;
- }
+ while (a <= b) {
+ set_bit(a, maskp);
+ a++;
}
} while (buflen && c == ',');
return 0;
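
The net effect of the parselist changes is stricter syntax checking while leading and trailing whitespace stays acceptable. A short sketch of what parses and what is rejected, via the public bitmap_parselist() wrapper (the surrounding function is illustrative):

#include <linux/bitmap.h>
#include <linux/printk.h>
#include <linux/bug.h>

/* Illustrative only: exercise bitmap_parselist() against a 64-bit mask */
static void example_parselist(void)
{
	DECLARE_BITMAP(mask, 64);

	/* Accepted: ranges and lists, with leading/trailing whitespace */
	if (!bitmap_parselist(" 1-3,5 ", mask, 64))
		pr_info("parsed: %*pbl\n", 64, mask);	/* prints 1-3,5 */

	/* Whitespace embedded between digits is still rejected */
	WARN_ON(bitmap_parselist("1 3", mask, 64) != -EINVAL);

	/* New: a '-' with no digit after it now fails instead of setting bit 0 */
	WARN_ON(bitmap_parselist("0-", mask, 64) != -EINVAL);
}
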
diff --git a/lib/cmdline.c b/lib/cmdline.c
index 8f13cf73c2ec..c248c5894557 100644
--- a/lib/cmdline.c
+++ b/lib/cmdline.c
@@ -27,7 +27,7 @@ static int get_range(char **str, int *pint)
int x, inc_counter, upper_range;
(*str)++;
- upper_range = simple_strtol((*str), NULL, 0);
+ parse_integer(*str, 0, &upper_range);
inc_counter = upper_range - *pint;
for (x = *pint; x < upper_range; x++)
*pint++ = x;
@@ -51,13 +51,14 @@ static int get_range(char **str, int *pint)
int get_option(char **str, int *pint)
{
- char *cur = *str;
+ int len;
- if (!cur || !(*cur))
+ if (!str || !*str)
return 0;
- *pint = simple_strtol(cur, str, 0);
- if (cur == *str)
+ len = parse_integer(*str, 0, pint);
+ if (len < 0)
return 0;
+ *str += len;
if (**str == ',') {
(*str)++;
return 2;
@@ -126,38 +127,41 @@ EXPORT_SYMBOL(get_options);
unsigned long long memparse(const char *ptr, char **retptr)
{
- char *endptr; /* local pointer to end of parsed string */
-
- unsigned long long ret = simple_strtoull(ptr, &endptr, 0);
-
- switch (*endptr) {
+ unsigned long long val = 0;
+ int len;
+
+ len = parse_integer(ptr, 0, &val);
+ if (len < 0)
+ goto out;
+ ptr += len;
+ switch (*ptr) {
case 'E':
case 'e':
- ret <<= 10;
+ val <<= 10;
case 'P':
case 'p':
- ret <<= 10;
+ val <<= 10;
case 'T':
case 't':
- ret <<= 10;
+ val <<= 10;
case 'G':
case 'g':
- ret <<= 10;
+ val <<= 10;
case 'M':
case 'm':
- ret <<= 10;
+ val <<= 10;
case 'K':
case 'k':
- ret <<= 10;
- endptr++;
+ val <<= 10;
+ ptr++;
default:
break;
}
-
+out:
if (retptr)
- *retptr = endptr;
+ *retptr = (char *)ptr;
- return ret;
+ return val;
}
EXPORT_SYMBOL(memparse);
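
memparse() keeps its external behaviour after the parse_integer() conversion: an optional K/M/G/T/P/E suffix scales the value by powers of 1024 (the switch falls through on purpose). A small usage sketch:

#include <linux/kernel.h>

/* Illustrative only: parse a size string such as a "mem=" style option */
static unsigned long long example_parse_size(const char *arg)
{
	char *end;
	unsigned long long bytes = memparse(arg, &end);

	/* "16K" -> 16384, "2M" -> 2097152, "1G" -> 1073741824 */
	if (*end != '\0')
		pr_warn("trailing characters after size: %s\n", end);

	return bytes;
}
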
diff --git a/lib/crc64_ecma.c b/lib/crc64_ecma.c
new file mode 100644
index 000000000000..41629ea5a60c
--- /dev/null
+++ b/lib/crc64_ecma.c
@@ -0,0 +1,341 @@
+/*
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <linux/crc64_ecma.h>
+
+
+#define CRC64_BYTE_MASK 0xFF
+#define CRC64_TABLE_SIZE 256
+
+
+struct crc64_table {
+ u64 seed;
+ u64 table[CRC64_TABLE_SIZE];
+};
+
+
+static struct crc64_table CRC64_ECMA_182 = {
+ CRC64_DEFAULT_INITVAL,
+ {
+ 0x0000000000000000ULL,
+ 0xb32e4cbe03a75f6fULL,
+ 0xf4843657a840a05bULL,
+ 0x47aa7ae9abe7ff34ULL,
+ 0x7bd0c384ff8f5e33ULL,
+ 0xc8fe8f3afc28015cULL,
+ 0x8f54f5d357cffe68ULL,
+ 0x3c7ab96d5468a107ULL,
+ 0xf7a18709ff1ebc66ULL,
+ 0x448fcbb7fcb9e309ULL,
+ 0x0325b15e575e1c3dULL,
+ 0xb00bfde054f94352ULL,
+ 0x8c71448d0091e255ULL,
+ 0x3f5f08330336bd3aULL,
+ 0x78f572daa8d1420eULL,
+ 0xcbdb3e64ab761d61ULL,
+ 0x7d9ba13851336649ULL,
+ 0xceb5ed8652943926ULL,
+ 0x891f976ff973c612ULL,
+ 0x3a31dbd1fad4997dULL,
+ 0x064b62bcaebc387aULL,
+ 0xb5652e02ad1b6715ULL,
+ 0xf2cf54eb06fc9821ULL,
+ 0x41e11855055bc74eULL,
+ 0x8a3a2631ae2dda2fULL,
+ 0x39146a8fad8a8540ULL,
+ 0x7ebe1066066d7a74ULL,
+ 0xcd905cd805ca251bULL,
+ 0xf1eae5b551a2841cULL,
+ 0x42c4a90b5205db73ULL,
+ 0x056ed3e2f9e22447ULL,
+ 0xb6409f5cfa457b28ULL,
+ 0xfb374270a266cc92ULL,
+ 0x48190ecea1c193fdULL,
+ 0x0fb374270a266cc9ULL,
+ 0xbc9d3899098133a6ULL,
+ 0x80e781f45de992a1ULL,
+ 0x33c9cd4a5e4ecdceULL,
+ 0x7463b7a3f5a932faULL,
+ 0xc74dfb1df60e6d95ULL,
+ 0x0c96c5795d7870f4ULL,
+ 0xbfb889c75edf2f9bULL,
+ 0xf812f32ef538d0afULL,
+ 0x4b3cbf90f69f8fc0ULL,
+ 0x774606fda2f72ec7ULL,
+ 0xc4684a43a15071a8ULL,
+ 0x83c230aa0ab78e9cULL,
+ 0x30ec7c140910d1f3ULL,
+ 0x86ace348f355aadbULL,
+ 0x3582aff6f0f2f5b4ULL,
+ 0x7228d51f5b150a80ULL,
+ 0xc10699a158b255efULL,
+ 0xfd7c20cc0cdaf4e8ULL,
+ 0x4e526c720f7dab87ULL,
+ 0x09f8169ba49a54b3ULL,
+ 0xbad65a25a73d0bdcULL,
+ 0x710d64410c4b16bdULL,
+ 0xc22328ff0fec49d2ULL,
+ 0x85895216a40bb6e6ULL,
+ 0x36a71ea8a7ace989ULL,
+ 0x0adda7c5f3c4488eULL,
+ 0xb9f3eb7bf06317e1ULL,
+ 0xfe5991925b84e8d5ULL,
+ 0x4d77dd2c5823b7baULL,
+ 0x64b62bcaebc387a1ULL,
+ 0xd7986774e864d8ceULL,
+ 0x90321d9d438327faULL,
+ 0x231c512340247895ULL,
+ 0x1f66e84e144cd992ULL,
+ 0xac48a4f017eb86fdULL,
+ 0xebe2de19bc0c79c9ULL,
+ 0x58cc92a7bfab26a6ULL,
+ 0x9317acc314dd3bc7ULL,
+ 0x2039e07d177a64a8ULL,
+ 0x67939a94bc9d9b9cULL,
+ 0xd4bdd62abf3ac4f3ULL,
+ 0xe8c76f47eb5265f4ULL,
+ 0x5be923f9e8f53a9bULL,
+ 0x1c4359104312c5afULL,
+ 0xaf6d15ae40b59ac0ULL,
+ 0x192d8af2baf0e1e8ULL,
+ 0xaa03c64cb957be87ULL,
+ 0xeda9bca512b041b3ULL,
+ 0x5e87f01b11171edcULL,
+ 0x62fd4976457fbfdbULL,
+ 0xd1d305c846d8e0b4ULL,
+ 0x96797f21ed3f1f80ULL,
+ 0x2557339fee9840efULL,
+ 0xee8c0dfb45ee5d8eULL,
+ 0x5da24145464902e1ULL,
+ 0x1a083bacedaefdd5ULL,
+ 0xa9267712ee09a2baULL,
+ 0x955cce7fba6103bdULL,
+ 0x267282c1b9c65cd2ULL,
+ 0x61d8f8281221a3e6ULL,
+ 0xd2f6b4961186fc89ULL,
+ 0x9f8169ba49a54b33ULL,
+ 0x2caf25044a02145cULL,
+ 0x6b055fede1e5eb68ULL,
+ 0xd82b1353e242b407ULL,
+ 0xe451aa3eb62a1500ULL,
+ 0x577fe680b58d4a6fULL,
+ 0x10d59c691e6ab55bULL,
+ 0xa3fbd0d71dcdea34ULL,
+ 0x6820eeb3b6bbf755ULL,
+ 0xdb0ea20db51ca83aULL,
+ 0x9ca4d8e41efb570eULL,
+ 0x2f8a945a1d5c0861ULL,
+ 0x13f02d374934a966ULL,
+ 0xa0de61894a93f609ULL,
+ 0xe7741b60e174093dULL,
+ 0x545a57dee2d35652ULL,
+ 0xe21ac88218962d7aULL,
+ 0x5134843c1b317215ULL,
+ 0x169efed5b0d68d21ULL,
+ 0xa5b0b26bb371d24eULL,
+ 0x99ca0b06e7197349ULL,
+ 0x2ae447b8e4be2c26ULL,
+ 0x6d4e3d514f59d312ULL,
+ 0xde6071ef4cfe8c7dULL,
+ 0x15bb4f8be788911cULL,
+ 0xa6950335e42fce73ULL,
+ 0xe13f79dc4fc83147ULL,
+ 0x521135624c6f6e28ULL,
+ 0x6e6b8c0f1807cf2fULL,
+ 0xdd45c0b11ba09040ULL,
+ 0x9aefba58b0476f74ULL,
+ 0x29c1f6e6b3e0301bULL,
+ 0xc96c5795d7870f42ULL,
+ 0x7a421b2bd420502dULL,
+ 0x3de861c27fc7af19ULL,
+ 0x8ec62d7c7c60f076ULL,
+ 0xb2bc941128085171ULL,
+ 0x0192d8af2baf0e1eULL,
+ 0x4638a2468048f12aULL,
+ 0xf516eef883efae45ULL,
+ 0x3ecdd09c2899b324ULL,
+ 0x8de39c222b3eec4bULL,
+ 0xca49e6cb80d9137fULL,
+ 0x7967aa75837e4c10ULL,
+ 0x451d1318d716ed17ULL,
+ 0xf6335fa6d4b1b278ULL,
+ 0xb199254f7f564d4cULL,
+ 0x02b769f17cf11223ULL,
+ 0xb4f7f6ad86b4690bULL,
+ 0x07d9ba1385133664ULL,
+ 0x4073c0fa2ef4c950ULL,
+ 0xf35d8c442d53963fULL,
+ 0xcf273529793b3738ULL,
+ 0x7c0979977a9c6857ULL,
+ 0x3ba3037ed17b9763ULL,
+ 0x888d4fc0d2dcc80cULL,
+ 0x435671a479aad56dULL,
+ 0xf0783d1a7a0d8a02ULL,
+ 0xb7d247f3d1ea7536ULL,
+ 0x04fc0b4dd24d2a59ULL,
+ 0x3886b22086258b5eULL,
+ 0x8ba8fe9e8582d431ULL,
+ 0xcc0284772e652b05ULL,
+ 0x7f2cc8c92dc2746aULL,
+ 0x325b15e575e1c3d0ULL,
+ 0x8175595b76469cbfULL,
+ 0xc6df23b2dda1638bULL,
+ 0x75f16f0cde063ce4ULL,
+ 0x498bd6618a6e9de3ULL,
+ 0xfaa59adf89c9c28cULL,
+ 0xbd0fe036222e3db8ULL,
+ 0x0e21ac88218962d7ULL,
+ 0xc5fa92ec8aff7fb6ULL,
+ 0x76d4de52895820d9ULL,
+ 0x317ea4bb22bfdfedULL,
+ 0x8250e80521188082ULL,
+ 0xbe2a516875702185ULL,
+ 0x0d041dd676d77eeaULL,
+ 0x4aae673fdd3081deULL,
+ 0xf9802b81de97deb1ULL,
+ 0x4fc0b4dd24d2a599ULL,
+ 0xfceef8632775faf6ULL,
+ 0xbb44828a8c9205c2ULL,
+ 0x086ace348f355aadULL,
+ 0x34107759db5dfbaaULL,
+ 0x873e3be7d8faa4c5ULL,
+ 0xc094410e731d5bf1ULL,
+ 0x73ba0db070ba049eULL,
+ 0xb86133d4dbcc19ffULL,
+ 0x0b4f7f6ad86b4690ULL,
+ 0x4ce50583738cb9a4ULL,
+ 0xffcb493d702be6cbULL,
+ 0xc3b1f050244347ccULL,
+ 0x709fbcee27e418a3ULL,
+ 0x3735c6078c03e797ULL,
+ 0x841b8ab98fa4b8f8ULL,
+ 0xadda7c5f3c4488e3ULL,
+ 0x1ef430e13fe3d78cULL,
+ 0x595e4a08940428b8ULL,
+ 0xea7006b697a377d7ULL,
+ 0xd60abfdbc3cbd6d0ULL,
+ 0x6524f365c06c89bfULL,
+ 0x228e898c6b8b768bULL,
+ 0x91a0c532682c29e4ULL,
+ 0x5a7bfb56c35a3485ULL,
+ 0xe955b7e8c0fd6beaULL,
+ 0xaeffcd016b1a94deULL,
+ 0x1dd181bf68bdcbb1ULL,
+ 0x21ab38d23cd56ab6ULL,
+ 0x9285746c3f7235d9ULL,
+ 0xd52f0e859495caedULL,
+ 0x6601423b97329582ULL,
+ 0xd041dd676d77eeaaULL,
+ 0x636f91d96ed0b1c5ULL,
+ 0x24c5eb30c5374ef1ULL,
+ 0x97eba78ec690119eULL,
+ 0xab911ee392f8b099ULL,
+ 0x18bf525d915feff6ULL,
+ 0x5f1528b43ab810c2ULL,
+ 0xec3b640a391f4fadULL,
+ 0x27e05a6e926952ccULL,
+ 0x94ce16d091ce0da3ULL,
+ 0xd3646c393a29f297ULL,
+ 0x604a2087398eadf8ULL,
+ 0x5c3099ea6de60cffULL,
+ 0xef1ed5546e415390ULL,
+ 0xa8b4afbdc5a6aca4ULL,
+ 0x1b9ae303c601f3cbULL,
+ 0x56ed3e2f9e224471ULL,
+ 0xe5c372919d851b1eULL,
+ 0xa26908783662e42aULL,
+ 0x114744c635c5bb45ULL,
+ 0x2d3dfdab61ad1a42ULL,
+ 0x9e13b115620a452dULL,
+ 0xd9b9cbfcc9edba19ULL,
+ 0x6a978742ca4ae576ULL,
+ 0xa14cb926613cf817ULL,
+ 0x1262f598629ba778ULL,
+ 0x55c88f71c97c584cULL,
+ 0xe6e6c3cfcadb0723ULL,
+ 0xda9c7aa29eb3a624ULL,
+ 0x69b2361c9d14f94bULL,
+ 0x2e184cf536f3067fULL,
+ 0x9d36004b35545910ULL,
+ 0x2b769f17cf112238ULL,
+ 0x9858d3a9ccb67d57ULL,
+ 0xdff2a94067518263ULL,
+ 0x6cdce5fe64f6dd0cULL,
+ 0x50a65c93309e7c0bULL,
+ 0xe388102d33392364ULL,
+ 0xa4226ac498dedc50ULL,
+ 0x170c267a9b79833fULL,
+ 0xdcd7181e300f9e5eULL,
+ 0x6ff954a033a8c131ULL,
+ 0x28532e49984f3e05ULL,
+ 0x9b7d62f79be8616aULL,
+ 0xa707db9acf80c06dULL,
+ 0x14299724cc279f02ULL,
+ 0x5383edcd67c06036ULL,
+ 0xe0ada17364673f59ULL
+ }
+};
+
+
+/*
+ * crc64_ecma_seed - Returns the default seed for a CRC64 ECMA computation.
+ */
+u64 crc64_ecma_seed(void)
+{
+ return CRC64_ECMA_182.seed;
+}
+EXPORT_SYMBOL(crc64_ecma_seed);
+
+/*
+ * crc64_ecma - Computes the 64 bit ECMA CRC.
+ *
+ * pdata: pointer to the data to compute checksum for.
+ * nbytes: number of bytes in data buffer.
+ * seed: CRC seed.
+ */
+u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed)
+{
+ unsigned int i;
+ u64 crc = seed;
+
+ for (i = 0; i < nbytes; i++)
+ crc = CRC64_ECMA_182.table[(crc ^ pdata[i]) & CRC64_BYTE_MASK] ^
+ (crc >> 8);
+
+ return crc;
+}
+EXPORT_SYMBOL(crc64_ecma);
+
+MODULE_DESCRIPTION("CRC64 ECMA function");
+MODULE_AUTHOR("Freescale Semiconductor Inc.");
+MODULE_LICENSE("GPL");
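
A short usage sketch for the new library: because the implementation has no final XOR, a buffer can be checksummed in chunks simply by feeding the running CRC back in as the seed:

#include <linux/crc64_ecma.h>

/* Illustrative only: checksum a buffer in two chunks */
static u64 example_crc64(const u8 *data, u32 len)
{
	u64 crc = crc64_ecma_seed();	/* CRC64_DEFAULT_INITVAL */
	u32 half = len / 2;

	crc = crc64_ecma(data, half, crc);
	crc = crc64_ecma(data + half, len - half, crc);

	return crc;
}
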
diff --git a/lib/decompress_bunzip2.c b/lib/decompress_bunzip2.c
index 6dd0335ea61b..0234361b24b8 100644
--- a/lib/decompress_bunzip2.c
+++ b/lib/decompress_bunzip2.c
@@ -743,12 +743,12 @@ exit_0:
}
#ifdef PREBOOT
-STATIC int INIT decompress(unsigned char *buf, long len,
+STATIC int INIT __decompress(unsigned char *buf, long len,
long (*fill)(void*, unsigned long),
long (*flush)(void*, unsigned long),
- unsigned char *outbuf,
+ unsigned char *outbuf, long olen,
long *pos,
- void(*error)(char *x))
+ void (*error)(char *x))
{
return bunzip2(buf, len - 4, fill, flush, outbuf, pos, error);
}
diff --git a/lib/decompress_inflate.c b/lib/decompress_inflate.c
index d4c7891635ec..555c06bf20da 100644
--- a/lib/decompress_inflate.c
+++ b/lib/decompress_inflate.c
@@ -1,4 +1,5 @@
#ifdef STATIC
+#define PREBOOT
/* Pre-boot environment: included */
/* prevent inclusion of _LINUX_KERNEL_H in pre-boot environment: lots
@@ -33,23 +34,23 @@ static long INIT nofill(void *buffer, unsigned long len)
}
/* Included from initramfs et al code */
-STATIC int INIT gunzip(unsigned char *buf, long len,
+STATIC int INIT __gunzip(unsigned char *buf, long len,
long (*fill)(void*, unsigned long),
long (*flush)(void*, unsigned long),
- unsigned char *out_buf,
+ unsigned char *out_buf, long out_len,
long *pos,
void(*error)(char *x)) {
u8 *zbuf;
struct z_stream_s *strm;
int rc;
- size_t out_len;
rc = -1;
if (flush) {
out_len = 0x8000; /* 32 K */
out_buf = malloc(out_len);
} else {
- out_len = ((size_t)~0) - (size_t)out_buf; /* no limit */
+ if (!out_len)
+ out_len = ((size_t)~0) - (size_t)out_buf; /* no limit */
}
if (!out_buf) {
error("Out of memory while allocating output buffer");
@@ -181,4 +182,24 @@ gunzip_nomem1:
return rc; /* returns Z_OK (0) if successful */
}
-#define decompress gunzip
+#ifndef PREBOOT
+STATIC int INIT gunzip(unsigned char *buf, long len,
+ long (*fill)(void*, unsigned long),
+ long (*flush)(void*, unsigned long),
+ unsigned char *out_buf,
+ long *pos,
+ void (*error)(char *x))
+{
+ return __gunzip(buf, len, fill, flush, out_buf, 0, pos, error);
+}
+#else
+STATIC int INIT __decompress(unsigned char *buf, long len,
+ long (*fill)(void*, unsigned long),
+ long (*flush)(void*, unsigned long),
+ unsigned char *out_buf, long out_len,
+ long *pos,
+ void (*error)(char *x))
+{
+ return __gunzip(buf, len, fill, flush, out_buf, out_len, pos, error);
+}
+#endif
diff --git a/lib/decompress_unlz4.c b/lib/decompress_unlz4.c
index 40f66ebe57b7..036fc882cd72 100644
--- a/lib/decompress_unlz4.c
+++ b/lib/decompress_unlz4.c
@@ -196,12 +196,12 @@ exit_0:
}
#ifdef PREBOOT
-STATIC int INIT decompress(unsigned char *buf, long in_len,
+STATIC int INIT __decompress(unsigned char *buf, long in_len,
long (*fill)(void*, unsigned long),
long (*flush)(void*, unsigned long),
- unsigned char *output,
+ unsigned char *output, long out_len,
long *posp,
- void(*error)(char *x)
+ void (*error)(char *x)
)
{
return unlz4(buf, in_len - 4, fill, flush, output, posp, error);
diff --git a/lib/decompress_unlzma.c b/lib/decompress_unlzma.c
index 0be83af62b88..ed7a1fd819f2 100644
--- a/lib/decompress_unlzma.c
+++ b/lib/decompress_unlzma.c
@@ -620,7 +620,7 @@ STATIC inline int INIT unlzma(unsigned char *buf, long in_len,
num_probs = LZMA_BASE_SIZE + (LZMA_LIT_SIZE << (lc + lp));
p = (uint16_t *) large_malloc(num_probs * sizeof(*p));
- if (p == 0)
+ if (p == NULL)
goto exit_2;
num_probs = LZMA_LITERAL + (LZMA_LIT_SIZE << (lc + lp));
for (i = 0; i < num_probs; i++)
@@ -667,13 +667,12 @@ exit_0:
}
#ifdef PREBOOT
-STATIC int INIT decompress(unsigned char *buf, long in_len,
+STATIC int INIT __decompress(unsigned char *buf, long in_len,
long (*fill)(void*, unsigned long),
long (*flush)(void*, unsigned long),
- unsigned char *output,
+ unsigned char *output, long out_len,
long *posp,
- void(*error)(char *x)
- )
+ void (*error)(char *x))
{
return unlzma(buf, in_len - 4, fill, flush, output, posp, error);
}
diff --git a/lib/decompress_unlzo.c b/lib/decompress_unlzo.c
index b94a31bdd87d..f4c158e3a022 100644
--- a/lib/decompress_unlzo.c
+++ b/lib/decompress_unlzo.c
@@ -31,6 +31,7 @@
*/
#ifdef STATIC
+#define PREBOOT
#include "lzo/lzo1x_decompress_safe.c"
#else
#include <linux/decompress/unlzo.h>
@@ -287,4 +288,14 @@ exit:
return ret;
}
-#define decompress unlzo
+#ifdef PREBOOT
+STATIC int INIT __decompress(unsigned char *buf, long len,
+ long (*fill)(void*, unsigned long),
+ long (*flush)(void*, unsigned long),
+ unsigned char *out_buf, long olen,
+ long *pos,
+ void (*error)(char *x))
+{
+ return unlzo(buf, len, fill, flush, out_buf, pos, error);
+}
+#endif
diff --git a/lib/decompress_unxz.c b/lib/decompress_unxz.c
index b07a78340e9d..25d59a95bd66 100644
--- a/lib/decompress_unxz.c
+++ b/lib/decompress_unxz.c
@@ -394,4 +394,14 @@ error_alloc_state:
* This macro is used by architecture-specific files to decompress
* the kernel image.
*/
-#define decompress unxz
+#ifdef XZ_PREBOOT
+STATIC int INIT __decompress(unsigned char *buf, long len,
+ long (*fill)(void*, unsigned long),
+ long (*flush)(void*, unsigned long),
+ unsigned char *out_buf, long olen,
+ long *pos,
+ void (*error)(char *x))
+{
+ return unxz(buf, len, fill, flush, out_buf, pos, error);
+}
+#endif
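
All preboot decompressors now expose a common __decompress() entry point that also takes the output buffer length. Roughly the shape of the call an architecture's boot/compressed/misc.c makes once it defines STATIC and includes one of the decompress_*.c files above; the symbol names here are placeholders:

/* Illustrative only: decompress a flat in-memory image with no callbacks */
static void error(char *x);	/* arch-provided error handler */

static void example_decompress_kernel(unsigned char *input_data, long input_len,
				      unsigned char *output, long output_len)
{
	/* NULL fill/flush: read from and write to flat buffers; olen bounds output */
	__decompress(input_data, input_len, NULL, NULL,
		     output, output_len, NULL, error);
}
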
diff --git a/lib/genalloc.c b/lib/genalloc.c
index daf0afb6d979..116a166b096f 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -160,6 +160,7 @@ struct gen_pool *gen_pool_create(int min_alloc_order, int nid)
pool->min_alloc_order = min_alloc_order;
pool->algo = gen_pool_first_fit;
pool->data = NULL;
+ pool->name = NULL;
}
return pool;
}
@@ -252,8 +253,8 @@ void gen_pool_destroy(struct gen_pool *pool)
kfree(chunk);
}
+ kfree_const(pool->name);
kfree(pool);
- return;
}
EXPORT_SYMBOL(gen_pool_destroy);
@@ -570,53 +571,88 @@ static void devm_gen_pool_release(struct device *dev, void *res)
gen_pool_destroy(*(struct gen_pool **)res);
}
+static int devm_gen_pool_match(struct device *dev, void *res, void *data)
+{
+ struct gen_pool **p = res;
+
+ /* NULL data matches only a pool without an assigned name */
+ if (!data && !(*p)->name)
+ return 1;
+
+ if (!data || !(*p)->name)
+ return 0;
+
+ return !strcmp((*p)->name, data);
+}
+
+/**
+ * gen_pool_get - Obtain the gen_pool (if any) for a device
+ * @dev: device to retrieve the gen_pool from
+ * @name: name of a gen_pool or NULL, identifies a particular gen_pool on device
+ *
+ * Returns the gen_pool for the device if one is present, or NULL.
+ */
+struct gen_pool *gen_pool_get(struct device *dev, const char *name)
+{
+ struct gen_pool **p;
+
+ p = devres_find(dev, devm_gen_pool_release, devm_gen_pool_match,
+ (void *)name);
+ if (!p)
+ return NULL;
+ return *p;
+}
+EXPORT_SYMBOL_GPL(gen_pool_get);
+
/**
* devm_gen_pool_create - managed gen_pool_create
* @dev: device that provides the gen_pool
* @min_alloc_order: log base 2 of number of bytes each bitmap bit represents
- * @nid: node id of the node the pool structure should be allocated on, or -1
+ * @nid: node selector for allocated gen_pool, %NUMA_NO_NODE for all nodes
+ * @name: name of a gen_pool or NULL, identifies a particular gen_pool on device
*
* Create a new special memory pool that can be used to manage special purpose
* memory not managed by the regular kmalloc/kfree interface. The pool will be
* automatically destroyed by the device management code.
*/
struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order,
- int nid)
+ int nid, const char *name)
{
struct gen_pool **ptr, *pool;
+ const char *pool_name = NULL;
+
+ /* Check that genpool to be created is uniquely addressed on device */
+ if (gen_pool_get(dev, name))
+ return ERR_PTR(-EINVAL);
+
+ if (name) {
+ pool_name = kstrdup_const(name, GFP_KERNEL);
+ if (!pool_name)
+ return ERR_PTR(-ENOMEM);
+ }
ptr = devres_alloc(devm_gen_pool_release, sizeof(*ptr), GFP_KERNEL);
if (!ptr)
- return NULL;
+ goto free_pool_name;
pool = gen_pool_create(min_alloc_order, nid);
- if (pool) {
- *ptr = pool;
- devres_add(dev, ptr);
- } else {
- devres_free(ptr);
- }
+ if (!pool)
+ goto free_devres;
+
+ *ptr = pool;
+ pool->name = pool_name;
+ devres_add(dev, ptr);
return pool;
-}
-EXPORT_SYMBOL(devm_gen_pool_create);
-/**
- * gen_pool_get - Obtain the gen_pool (if any) for a device
- * @dev: device to retrieve the gen_pool from
- *
- * Returns the gen_pool for the device if one is present, or NULL.
- */
-struct gen_pool *gen_pool_get(struct device *dev)
-{
- struct gen_pool **p = devres_find(dev, devm_gen_pool_release, NULL,
- NULL);
+free_devres:
+ devres_free(ptr);
+free_pool_name:
+ kfree_const(pool_name);
- if (!p)
- return NULL;
- return *p;
+ return ERR_PTR(-ENOMEM);
}
-EXPORT_SYMBOL_GPL(gen_pool_get);
+EXPORT_SYMBOL(devm_gen_pool_create);
#ifdef CONFIG_OF
/**
@@ -633,16 +669,30 @@ struct gen_pool *of_gen_pool_get(struct device_node *np,
const char *propname, int index)
{
struct platform_device *pdev;
- struct device_node *np_pool;
+ struct device_node *np_pool, *parent;
+ const char *name = NULL;
+ struct gen_pool *pool = NULL;
np_pool = of_parse_phandle(np, propname, index);
if (!np_pool)
return NULL;
+
pdev = of_find_device_by_node(np_pool);
+ if (!pdev) {
+ /* Check if named gen_pool is created by parent node device */
+ parent = of_get_parent(np_pool);
+ pdev = of_find_device_by_node(parent);
+ of_node_put(parent);
+
+ of_property_read_string(np_pool, "label", &name);
+ if (!name)
+ name = np_pool->name;
+ }
+ if (pdev)
+ pool = gen_pool_get(&pdev->dev, name);
of_node_put(np_pool);
- if (!pdev)
- return NULL;
- return gen_pool_get(&pdev->dev);
+
+ return pool;
}
EXPORT_SYMBOL_GPL(of_gen_pool_get);
#endif /* CONFIG_OF */
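A usage sketch of the named-pool interface introduced above; the device pointer, the "sram-pool" name and the region size are assumptions, not taken from any driver.

#include <linux/device.h>
#include <linux/err.h>
#include <linux/genalloc.h>
#include <linux/sizes.h>

/* Illustrative sketch only; not part of the patch. */
static int sram_pool_setup_sketch(struct device *dev, unsigned long sram_virt)
{
	struct gen_pool *pool;

	/* order 5 gives 32-byte granules; the name distinguishes pools on one device */
	pool = devm_gen_pool_create(dev, 5, NUMA_NO_NODE, "sram-pool");
	if (IS_ERR(pool))
		return PTR_ERR(pool);

	return gen_pool_add(pool, sram_virt, SZ_64K, NUMA_NO_NODE);
}

static unsigned long sram_pool_alloc_sketch(struct device *dev)
{
	/* consumers look the pool up again by device and name */
	struct gen_pool *pool = gen_pool_get(dev, "sram-pool");

	return pool ? gen_pool_alloc(pool, 256) : 0;
}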
diff --git a/lib/kstrtox.c b/lib/kstrtox.c
index ec8da78df9be..1698b286d954 100644
--- a/lib/kstrtox.c
+++ b/lib/kstrtox.c
@@ -20,22 +20,6 @@
#include <asm/uaccess.h>
#include "kstrtox.h"
-const char *_parse_integer_fixup_radix(const char *s, unsigned int *base)
-{
- if (*base == 0) {
- if (s[0] == '0') {
- if (_tolower(s[1]) == 'x' && isxdigit(s[2]))
- *base = 16;
- else
- *base = 8;
- } else
- *base = 10;
- }
- if (*base == 16 && s[0] == '0' && _tolower(s[1]) == 'x')
- s += 2;
- return s;
-}
-
/*
* Convert non-negative integer string representation in explicitly given radix
* to an integer.
@@ -83,244 +67,6 @@ unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long
return rv;
}
-static int _kstrtoull(const char *s, unsigned int base, unsigned long long *res)
-{
- unsigned long long _res;
- unsigned int rv;
-
- s = _parse_integer_fixup_radix(s, &base);
- rv = _parse_integer(s, base, &_res);
- if (rv & KSTRTOX_OVERFLOW)
- return -ERANGE;
- if (rv == 0)
- return -EINVAL;
- s += rv;
- if (*s == '\n')
- s++;
- if (*s)
- return -EINVAL;
- *res = _res;
- return 0;
-}
-
-/**
- * kstrtoull - convert a string to an unsigned long long
- * @s: The start of the string. The string must be null-terminated, and may also
- * include a single newline before its terminating null. The first character
- * may also be a plus sign, but not a minus sign.
- * @base: The number base to use. The maximum supported base is 16. If base is
- * given as 0, then the base of the string is automatically detected with the
- * conventional semantics - If it begins with 0x the number will be parsed as a
- * hexadecimal (case insensitive), if it otherwise begins with 0, it will be
- * parsed as an octal number. Otherwise it will be parsed as a decimal.
- * @res: Where to write the result of the conversion on success.
- *
- * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
- * Used as a replacement for the obsolete simple_strtoull. Return code must
- * be checked.
- */
-int kstrtoull(const char *s, unsigned int base, unsigned long long *res)
-{
- if (s[0] == '+')
- s++;
- return _kstrtoull(s, base, res);
-}
-EXPORT_SYMBOL(kstrtoull);
-
-/**
- * kstrtoll - convert a string to a long long
- * @s: The start of the string. The string must be null-terminated, and may also
- * include a single newline before its terminating null. The first character
- * may also be a plus sign or a minus sign.
- * @base: The number base to use. The maximum supported base is 16. If base is
- * given as 0, then the base of the string is automatically detected with the
- * conventional semantics - If it begins with 0x the number will be parsed as a
- * hexadecimal (case insensitive), if it otherwise begins with 0, it will be
- * parsed as an octal number. Otherwise it will be parsed as a decimal.
- * @res: Where to write the result of the conversion on success.
- *
- * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
- * Used as a replacement for the obsolete simple_strtoull. Return code must
- * be checked.
- */
-int kstrtoll(const char *s, unsigned int base, long long *res)
-{
- unsigned long long tmp;
- int rv;
-
- if (s[0] == '-') {
- rv = _kstrtoull(s + 1, base, &tmp);
- if (rv < 0)
- return rv;
- if ((long long)(-tmp) >= 0)
- return -ERANGE;
- *res = -tmp;
- } else {
- rv = kstrtoull(s, base, &tmp);
- if (rv < 0)
- return rv;
- if ((long long)tmp < 0)
- return -ERANGE;
- *res = tmp;
- }
- return 0;
-}
-EXPORT_SYMBOL(kstrtoll);
-
-/* Internal, do not use. */
-int _kstrtoul(const char *s, unsigned int base, unsigned long *res)
-{
- unsigned long long tmp;
- int rv;
-
- rv = kstrtoull(s, base, &tmp);
- if (rv < 0)
- return rv;
- if (tmp != (unsigned long long)(unsigned long)tmp)
- return -ERANGE;
- *res = tmp;
- return 0;
-}
-EXPORT_SYMBOL(_kstrtoul);
-
-/* Internal, do not use. */
-int _kstrtol(const char *s, unsigned int base, long *res)
-{
- long long tmp;
- int rv;
-
- rv = kstrtoll(s, base, &tmp);
- if (rv < 0)
- return rv;
- if (tmp != (long long)(long)tmp)
- return -ERANGE;
- *res = tmp;
- return 0;
-}
-EXPORT_SYMBOL(_kstrtol);
-
-/**
- * kstrtouint - convert a string to an unsigned int
- * @s: The start of the string. The string must be null-terminated, and may also
- * include a single newline before its terminating null. The first character
- * may also be a plus sign, but not a minus sign.
- * @base: The number base to use. The maximum supported base is 16. If base is
- * given as 0, then the base of the string is automatically detected with the
- * conventional semantics - If it begins with 0x the number will be parsed as a
- * hexadecimal (case insensitive), if it otherwise begins with 0, it will be
- * parsed as an octal number. Otherwise it will be parsed as a decimal.
- * @res: Where to write the result of the conversion on success.
- *
- * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
- * Used as a replacement for the obsolete simple_strtoull. Return code must
- * be checked.
- */
-int kstrtouint(const char *s, unsigned int base, unsigned int *res)
-{
- unsigned long long tmp;
- int rv;
-
- rv = kstrtoull(s, base, &tmp);
- if (rv < 0)
- return rv;
- if (tmp != (unsigned long long)(unsigned int)tmp)
- return -ERANGE;
- *res = tmp;
- return 0;
-}
-EXPORT_SYMBOL(kstrtouint);
-
-/**
- * kstrtoint - convert a string to an int
- * @s: The start of the string. The string must be null-terminated, and may also
- * include a single newline before its terminating null. The first character
- * may also be a plus sign or a minus sign.
- * @base: The number base to use. The maximum supported base is 16. If base is
- * given as 0, then the base of the string is automatically detected with the
- * conventional semantics - If it begins with 0x the number will be parsed as a
- * hexadecimal (case insensitive), if it otherwise begins with 0, it will be
- * parsed as an octal number. Otherwise it will be parsed as a decimal.
- * @res: Where to write the result of the conversion on success.
- *
- * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
- * Used as a replacement for the obsolete simple_strtoull. Return code must
- * be checked.
- */
-int kstrtoint(const char *s, unsigned int base, int *res)
-{
- long long tmp;
- int rv;
-
- rv = kstrtoll(s, base, &tmp);
- if (rv < 0)
- return rv;
- if (tmp != (long long)(int)tmp)
- return -ERANGE;
- *res = tmp;
- return 0;
-}
-EXPORT_SYMBOL(kstrtoint);
-
-int kstrtou16(const char *s, unsigned int base, u16 *res)
-{
- unsigned long long tmp;
- int rv;
-
- rv = kstrtoull(s, base, &tmp);
- if (rv < 0)
- return rv;
- if (tmp != (unsigned long long)(u16)tmp)
- return -ERANGE;
- *res = tmp;
- return 0;
-}
-EXPORT_SYMBOL(kstrtou16);
-
-int kstrtos16(const char *s, unsigned int base, s16 *res)
-{
- long long tmp;
- int rv;
-
- rv = kstrtoll(s, base, &tmp);
- if (rv < 0)
- return rv;
- if (tmp != (long long)(s16)tmp)
- return -ERANGE;
- *res = tmp;
- return 0;
-}
-EXPORT_SYMBOL(kstrtos16);
-
-int kstrtou8(const char *s, unsigned int base, u8 *res)
-{
- unsigned long long tmp;
- int rv;
-
- rv = kstrtoull(s, base, &tmp);
- if (rv < 0)
- return rv;
- if (tmp != (unsigned long long)(u8)tmp)
- return -ERANGE;
- *res = tmp;
- return 0;
-}
-EXPORT_SYMBOL(kstrtou8);
-
-int kstrtos8(const char *s, unsigned int base, s8 *res)
-{
- long long tmp;
- int rv;
-
- rv = kstrtoll(s, base, &tmp);
- if (rv < 0)
- return rv;
- if (tmp != (long long)(s8)tmp)
- return -ERANGE;
- *res = tmp;
- return 0;
-}
-EXPORT_SYMBOL(kstrtos8);
-
#define kstrto_from_user(f, g, type) \
int f(const char __user *s, size_t count, unsigned int base, type *res) \
{ \
diff --git a/lib/kstrtox.h b/lib/kstrtox.h
index f13eeeaf441d..7b1f447cbcc1 100644
--- a/lib/kstrtox.h
+++ b/lib/kstrtox.h
@@ -2,7 +2,6 @@
#define _LIB_KSTRTOX_H
#define KSTRTOX_OVERFLOW (1U << 31)
-const char *_parse_integer_fixup_radix(const char *s, unsigned int *base);
unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long *res);
#endif
diff --git a/lib/parse-integer.c b/lib/parse-integer.c
new file mode 100644
index 000000000000..7c7f48bec328
--- /dev/null
+++ b/lib/parse-integer.c
@@ -0,0 +1,222 @@
+/*
+ * See parse_integer().
+ *
+ * Individual dispatch functions in this file aren't supposed to be used
+ * directly and thus aren't advertised and documented despite being exported.
+ *
+ * Do not use any function in this file for any reason.
+ */
+#include <linux/ctype.h>
+#include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/math64.h>
+#include <linux/parse-integer.h>
+#include <asm/bug.h>
+
+const char *_parse_integer_fixup_radix(const char *s, unsigned int *base)
+{
+ if (*base == 0) {
+ if (s[0] == '0') {
+ if (_tolower(s[1]) == 'x' && isxdigit(s[2]))
+ *base = 16;
+ else
+ *base = 8;
+ } else
+ *base = 10;
+ }
+ if (*base == 16 && s[0] == '0' && _tolower(s[1]) == 'x')
+ s += 2;
+ BUG_ON(*base < 2 || *base > 16);
+ return s;
+}
+
+static int ___parse_integer(const char *s, unsigned int base, unsigned long long *val)
+{
+ const char *s0 = s, *sd;
+ unsigned long long acc;
+
+ s = sd = _parse_integer_fixup_radix(s0, &base);
+ acc = 0;
+ while (*s) {
+ unsigned int d;
+
+ if ('0' <= *s && *s <= '9')
+ d = *s - '0';
+ else if ('a' <= _tolower(*s) && _tolower(*s) <= 'f')
+ d = _tolower(*s) - 'a' + 10;
+ else
+ break;
+ if (d >= base)
+ break;
+ /* Overflow can't happen until acc reaches the top bits, so only check then. */
+ if ((acc >> 60) && acc > div_u64(ULLONG_MAX - d, base))
+ return -ERANGE;
+ acc = acc * base + d;
+ s++;
+ }
+ /* At least one digit has to be converted. */
+ if (s == sd)
+ return -EINVAL;
+ *val = acc;
+ /* Radix 1 is not supported, otherwise the returned length could overflow. */
+ return s - s0;
+}
+
+static int __parse_integer(const char *s, unsigned int base, unsigned long long *val)
+{
+ unsigned long long tmp;
+ int rv;
+
+ rv = ___parse_integer(s, base & ~PARSE_INTEGER_NEWLINE, &tmp);
+ if (rv < 0)
+ return rv;
+ if (base & PARSE_INTEGER_NEWLINE) {
+ /* Accept "integer\0" or "integer\n\0" */
+ s += rv;
+ if (*s == '\n')
+ s++;
+ if (*s)
+ return -EINVAL;
+ }
+ *val = tmp;
+ return rv;
+}
+
+int _parse_integer_ull(const char *s, unsigned int base, unsigned long long *val)
+{
+ char sign;
+ int rv;
+
+ sign = 0;
+ if (*s == '-')
+ return -EINVAL;
+ else if (*s == '+')
+ sign = *s++;
+
+ rv = __parse_integer(s, base, val);
+ if (rv < 0)
+ return rv;
+ if (base & PARSE_INTEGER_NEWLINE)
+ return 0;
+ return rv + !!sign;
+}
+EXPORT_SYMBOL(_parse_integer_ull);
+
+int _parse_integer_ll(const char *s, unsigned int base, long long *val)
+{
+ unsigned long long tmp;
+ char sign;
+ int rv;
+
+ sign = 0;
+ if (*s == '-' || *s == '+')
+ sign = *s++;
+
+ rv = __parse_integer(s, base, &tmp);
+ if (rv < 0)
+ return rv;
+ if (sign == '-') {
+ if ((long long)-tmp > 0)
+ return -ERANGE;
+ *val = -tmp;
+ } else {
+ if ((long long)tmp < 0)
+ return -ERANGE;
+ *val = tmp;
+ }
+ if (base & PARSE_INTEGER_NEWLINE)
+ return 0;
+ return rv + !!sign;
+}
+EXPORT_SYMBOL(_parse_integer_ll);
+
+int _parse_integer_u(const char *s, unsigned int base, unsigned int *val)
+{
+ unsigned long long tmp;
+ int rv;
+
+ rv = _parse_integer_ull(s, base, &tmp);
+ if (rv < 0)
+ return rv;
+ if (tmp != (unsigned int)tmp)
+ return -ERANGE;
+ *val = tmp;
+ return rv;
+}
+EXPORT_SYMBOL(_parse_integer_u);
+
+int _parse_integer_i(const char *s, unsigned int base, int *val)
+{
+ long long tmp;
+ int rv;
+
+ rv = _parse_integer_ll(s, base, &tmp);
+ if (rv < 0)
+ return rv;
+ if (tmp != (int)tmp)
+ return -ERANGE;
+ *val = tmp;
+ return rv;
+}
+EXPORT_SYMBOL(_parse_integer_i);
+
+int _parse_integer_us(const char *s, unsigned int base, unsigned short *val)
+{
+ unsigned long long tmp;
+ int rv;
+
+ rv = _parse_integer_ull(s, base, &tmp);
+ if (rv < 0)
+ return rv;
+ if (tmp != (unsigned short)tmp)
+ return -ERANGE;
+ *val = tmp;
+ return rv;
+}
+EXPORT_SYMBOL(_parse_integer_us);
+
+int _parse_integer_s(const char *s, unsigned int base, short *val)
+{
+ long long tmp;
+ int rv;
+
+ rv = _parse_integer_ll(s, base, &tmp);
+ if (rv < 0)
+ return rv;
+ if (tmp != (short)tmp)
+ return -ERANGE;
+ *val = tmp;
+ return rv;
+}
+EXPORT_SYMBOL(_parse_integer_s);
+
+int _parse_integer_uc(const char *s, unsigned int base, unsigned char *val)
+{
+ unsigned long long tmp;
+ int rv;
+
+ rv = _parse_integer_ull(s, base, &tmp);
+ if (rv < 0)
+ return rv;
+ if (tmp != (unsigned char)tmp)
+ return -ERANGE;
+ *val = tmp;
+ return rv;
+}
+EXPORT_SYMBOL(_parse_integer_uc);
+
+int _parse_integer_sc(const char *s, unsigned int base, signed char *val)
+{
+ long long tmp;
+ int rv;
+
+ rv = _parse_integer_ll(s, base, &tmp);
+ if (rv < 0)
+ return rv;
+ if (tmp != (signed char)tmp)
+ return -ERANGE;
+ *val = tmp;
+ return rv;
+}
+EXPORT_SYMBOL(_parse_integer_sc);
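A usage sketch of the type-generic parse_integer() helper these dispatch functions back; the option string and field names are illustrative, while the return convention (characters consumed, or -E) follows the functions above.

#include <linux/parse-integer.h>

/* Illustrative sketch only; not part of the patch. */
static int parse_limit_sketch(const char *opt, unsigned int *limit)
{
	int len;

	/* e.g. "16k" gives *limit = 16, len = 2; parsing stops at the first non-digit */
	len = parse_integer(opt, 10, limit);
	if (len < 0)
		return len;

	/* the caller can keep scanning the suffix at opt + len */
	return len;
}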
diff --git a/lib/parser.c b/lib/parser.c
index b6d11631231b..f00386777278 100644
--- a/lib/parser.c
+++ b/lib/parser.c
@@ -44,7 +44,7 @@ static int match_one(char *s, const char *p, substring_t args[])
p = meta + 1;
if (isdigit(*p))
- len = simple_strtoul(p, (char **) &p, 10);
+ p += parse_integer(p, 10, (unsigned int *)&len);
else if (*p == '%') {
if (*s++ != '%')
return 0;
@@ -57,6 +57,11 @@ static int match_one(char *s, const char *p, substring_t args[])
args[argc].from = s;
switch (*p++) {
+ union {
+ int i;
+ unsigned int u;
+ } u;
+
case 's': {
size_t str_len = strlen(s);
@@ -68,19 +73,20 @@ static int match_one(char *s, const char *p, substring_t args[])
break;
}
case 'd':
- simple_strtol(s, &args[argc].to, 0);
+ len = parse_integer(s, 0, &u.i);
goto num;
case 'u':
- simple_strtoul(s, &args[argc].to, 0);
+ len = parse_integer(s, 0, &u.u);
goto num;
case 'o':
- simple_strtoul(s, &args[argc].to, 8);
+ len = parse_integer(s, 8, &u.u);
goto num;
case 'x':
- simple_strtoul(s, &args[argc].to, 16);
+ len = parse_integer(s, 16, &u.u);
num:
- if (args[argc].to == args[argc].from)
+ if (len < 0)
return 0;
+ args[argc].to = args[argc].from + len;
break;
default:
return 0;
@@ -127,10 +133,8 @@ EXPORT_SYMBOL(match_token);
*/
static int match_number(substring_t *s, int *result, int base)
{
- char *endp;
char *buf;
int ret;
- long val;
size_t len = s->to - s->from;
buf = kmalloc(len + 1, GFP_KERNEL);
@@ -139,16 +143,11 @@ static int match_number(substring_t *s, int *result, int base)
memcpy(buf, s->from, len);
buf[len] = '\0';
- ret = 0;
- val = simple_strtol(buf, &endp, base);
- if (endp == buf)
- ret = -EINVAL;
- else if (val < (long)INT_MIN || val > (long)INT_MAX)
- ret = -ERANGE;
- else
- *result = (int) val;
+ ret = parse_integer(buf, base, result);
kfree(buf);
- return ret;
+ if (ret < 0)
+ return ret;
+ return 0;
}
/**
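The match_one()/match_number() rework above sits underneath the usual match_token() pattern; a hedged sketch of that pattern follows, with the token table and option names as assumptions.

#include <linux/errno.h>
#include <linux/parser.h>

/* Illustrative sketch only; not part of the patch. */
enum { Opt_size, Opt_err };

static const match_table_t sketch_tokens = {
	{ Opt_size, "size=%u" },
	{ Opt_err, NULL }
};

static int parse_mount_opt_sketch(char *opt, int *size)
{
	substring_t args[MAX_OPT_ARGS];

	/* match_int() is backed by match_number(), rewritten above to use parse_integer() */
	if (match_token(opt, sketch_tokens, args) == Opt_size)
		return match_int(&args[0], size);
	return -EINVAL;
}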
diff --git a/lib/show_mem.c b/lib/show_mem.c
index adc98e1825ba..1feed6a2b12a 100644
--- a/lib/show_mem.c
+++ b/lib/show_mem.c
@@ -38,11 +38,9 @@ void show_mem(unsigned int filter)
printk("%lu pages RAM\n", total);
printk("%lu pages HighMem/MovableOnly\n", highmem);
+ printk("%lu pages reserved\n", reserved);
#ifdef CONFIG_CMA
- printk("%lu pages reserved\n", (reserved - totalcma_pages));
printk("%lu pages cma reserved\n", totalcma_pages);
-#else
- printk("%lu pages reserved\n", reserved);
#endif
#ifdef CONFIG_QUICKLIST
printk("%lu pages in pagetable cache\n",
diff --git a/lib/string_helpers.c b/lib/string_helpers.c
index c98ae818eb4e..54036ce2e2dd 100644
--- a/lib/string_helpers.c
+++ b/lib/string_helpers.c
@@ -410,7 +410,7 @@ static bool escape_hex(unsigned char c, char **dst, char *end)
* @dst: destination buffer (escaped)
* @osz: destination buffer size
* @flags: combination of the flags (bitwise OR):
- * %ESCAPE_SPACE:
+ * %ESCAPE_SPACE: (special white space, not space itself)
* '\f' - form feed
* '\n' - new line
* '\r' - carriage return
@@ -432,16 +432,18 @@ static bool escape_hex(unsigned char c, char **dst, char *end)
* all previous together
* %ESCAPE_HEX:
* '\xHH' - byte with hexadecimal value HH (2 digits)
- * @esc: NULL-terminated string of characters any of which, if found in
- * the source, has to be escaped
+ * @only: NULL-terminated string containing characters used to limit
+ * the selected escape class. If characters are included in @only
+ * that would not normally be escaped by the classes selected
+ * in @flags, they will be copied to @dst unescaped.
*
* Description:
* The process of escaping byte buffer includes several parts. They are applied
* in the following sequence.
* 1. The character is matched to the printable class, if asked, and in
* case of match it passes through to the output.
- * 2. The character is not matched to the one from @esc string and thus
- * must go as is to the output.
+ * 2. The character is not matched to the one from @only string and thus
+ * must go as-is to the output.
* 3. The character is checked if it falls into the class given by @flags.
* %ESCAPE_OCTAL and %ESCAPE_HEX are going last since they cover any
* character. Note that they actually can't go together, otherwise
@@ -458,11 +460,11 @@ static bool escape_hex(unsigned char c, char **dst, char *end)
* dst for a '\0' terminator if and only if ret < osz.
*/
int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
- unsigned int flags, const char *esc)
+ unsigned int flags, const char *only)
{
char *p = dst;
char *end = p + osz;
- bool is_dict = esc && *esc;
+ bool is_dict = only && *only;
while (isz--) {
unsigned char c = *src++;
@@ -471,7 +473,7 @@ int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
* Apply rules in the following sequence:
* - the character is printable, when @flags has
* %ESCAPE_NP bit set
- * - the @esc string is supplied and does not contain a
+ * - the @only string is supplied and does not contain a
* character under question
* - the character doesn't fall into a class of symbols
* defined by given @flags
@@ -479,7 +481,7 @@ int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
* output buffer.
*/
if ((flags & ESCAPE_NP && isprint(c)) ||
- (is_dict && !strchr(esc, c))) {
+ (is_dict && !strchr(only, c))) {
/* do nothing */
} else {
if (flags & ESCAPE_SPACE && escape_space(c, &p, end))
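A short sketch of the renamed @only argument documented above: only the characters listed in @only are candidates for the selected escape classes, and everything else is copied through. Buffer names and sizes are assumptions.

#include <linux/string_helpers.h>

/* Illustrative sketch only; not part of the patch. */
static int escape_ctl_sketch(const char *src, size_t len, char *dst, size_t dst_sz)
{
	/* escape the white-space class, but only '\n' and '\t' out of it */
	int want = string_escape_mem(src, len, dst, dst_sz,
				     ESCAPE_SPACE, "\n\t");

	/* 'want' is the size the escaped output needs; want > dst_sz means truncation */
	return want;
}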
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 76f29ecba8f4..caabc7151b90 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -100,7 +100,7 @@ static int __init
setup_io_tlb_npages(char *str)
{
if (isdigit(*str)) {
- io_tlb_nslabs = simple_strtoul(str, &str, 0);
+ str += parse_integer(str, 0, &io_tlb_nslabs);
/* avoid tail segment of size < IO_TLB_SEGSIZE */
io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
}
diff --git a/lib/test-kstrtox.c b/lib/test-kstrtox.c
index 4137bca5f8e8..f355f67169b6 100644
--- a/lib/test-kstrtox.c
+++ b/lib/test-kstrtox.c
@@ -260,6 +260,7 @@ static void __init test_kstrtoll_ok(void)
{"4294967297", 10, 4294967297LL},
{"9223372036854775807", 10, 9223372036854775807LL},
+ {"-0", 10, 0LL},
{"-1", 10, -1LL},
{"-2", 10, -2LL},
{"-9223372036854775808", 10, LLONG_MIN},
@@ -277,11 +278,6 @@ static void __init test_kstrtoll_fail(void)
{"-9223372036854775809", 10},
{"-18446744073709551614", 10},
{"-18446744073709551615", 10},
- /* negative zero isn't an integer in Linux */
- {"-0", 0},
- {"-0", 8},
- {"-0", 10},
- {"-0", 16},
/* sign is first character if any */
{"-+1", 0},
{"-+1", 8},
diff --git a/lib/test-parse-integer.c b/lib/test-parse-integer.c
new file mode 100644
index 000000000000..4274603f4d1a
--- /dev/null
+++ b/lib/test-parse-integer.c
@@ -0,0 +1,563 @@
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/parse-integer.h>
+#include <asm/bug.h>
+
+#define for_each_test(i, test) \
+ for (i = 0; i < ARRAY_SIZE(test); i++)
+
+#define DEFINE_TEST_OK(type, test_type, test) \
+test_type { \
+ const char *str; \
+ unsigned int base; \
+ int expected_rv; \
+ type expected_val; \
+}; \
+static const test_type test[] __initconst =
+
+#define TEST_OK(type, fmt, test) \
+{ \
+ unsigned int i; \
+ \
+ for_each_test(i, test) { \
+ const typeof(test[0]) *t = &test[i]; \
+ type val; \
+ int rv; \
+ \
+ rv = parse_integer(t->str, t->base, &val); \
+ if (rv != t->expected_rv || val != t->expected_val) { \
+ WARN(1, "str '%s', base %u, expected %d/"fmt", got %d/"fmt"\n", \
+ t->str, t->base, t->expected_rv, t->expected_val, rv, val); \
+ } \
+ } \
+}
+
+struct test_fail {
+ const char *str;
+ unsigned int base;
+};
+
+#define DEFINE_TEST_FAIL(type, test) \
+static const struct test_fail test[] __initconst =
+
+#define TEST_FAIL(type, fmt, test) \
+{ \
+ unsigned int i; \
+ \
+ for_each_test(i, test) { \
+ const typeof(test[0]) *t = &test[i]; \
+ type val; \
+ int rv; \
+ \
+ val = 113; \
+ rv = parse_integer(t->str, t->base, &val); \
+ if (rv >= 0 || val != 113) { \
+ WARN(1, "str '%s', base %u, expected -E, got %d/"fmt"\n",\
+ t->str, t->base, rv, val); \
+ } \
+ } \
+}
+
+DEFINE_TEST_OK(unsigned long long, struct test_ull, test_ull_ok)
+{
+ {"0", 10, 1, 0},
+ {"1", 10, 1, 1},
+ {"2", 10, 1, 2},
+ {"3", 10, 1, 3},
+ {"4", 10, 1, 4},
+ {"5", 10, 1, 5},
+ {"6", 10, 1, 6},
+ {"7", 10, 1, 7},
+ {"8", 10, 1, 8},
+ {"9", 10, 1, 9},
+
+ {"0", 8, 1, 0},
+ {"1", 8, 1, 1},
+ {"2", 8, 1, 2},
+ {"3", 8, 1, 3},
+ {"4", 8, 1, 4},
+ {"5", 8, 1, 5},
+ {"6", 8, 1, 6},
+ {"7", 8, 1, 7},
+
+ {"0", 16, 1, 0},
+ {"1", 16, 1, 1},
+ {"2", 16, 1, 2},
+ {"3", 16, 1, 3},
+ {"4", 16, 1, 4},
+ {"5", 16, 1, 5},
+ {"6", 16, 1, 6},
+ {"7", 16, 1, 7},
+ {"8", 16, 1, 8},
+ {"9", 16, 1, 9},
+ {"a", 16, 1, 10},
+ {"b", 16, 1, 11},
+ {"c", 16, 1, 12},
+ {"d", 16, 1, 13},
+ {"e", 16, 1, 14},
+ {"f", 16, 1, 15},
+ {"A", 16, 1, 10},
+ {"B", 16, 1, 11},
+ {"C", 16, 1, 12},
+ {"D", 16, 1, 13},
+ {"E", 16, 1, 14},
+ {"F", 16, 1, 15},
+
+ {"127", 10, 3, 127},
+ {"128", 10, 3, 128},
+ {"255", 10, 3, 255},
+ {"256", 10, 3, 256},
+ {"32767", 10, 5, 32767},
+ {"32768", 10, 5, 32768},
+ {"65535", 10, 5, 65535},
+ {"65536", 10, 5, 65536},
+ {"2147483647", 10, 10, 2147483647},
+ {"2147483648", 10, 10, 2147483648ull},
+ {"4294967295", 10, 10, 4294967295ull},
+ {"4294967296", 10, 10, 4294967296},
+ {"9223372036854775807", 10, 19, 9223372036854775807},
+ {"9223372036854775808", 10, 19, 9223372036854775808ull},
+ {"18446744073709551615", 10, 20, 18446744073709551615ull},
+
+ {"177", 8, 3, 0177},
+ {"200", 8, 3, 0200},
+ {"377", 8, 3, 0377},
+ {"400", 8, 3, 0400},
+ {"77777", 8, 5, 077777},
+ {"100000", 8, 6, 0100000},
+ {"177777", 8, 6, 0177777},
+ {"200000", 8, 6, 0200000},
+ {"17777777777", 8, 11, 017777777777},
+ {"20000000000", 8, 11, 020000000000},
+ {"37777777777", 8, 11, 037777777777},
+ {"40000000000", 8, 11, 040000000000},
+ {"777777777777777777777", 8, 21, 0777777777777777777777},
+ {"1000000000000000000000", 8, 22, 01000000000000000000000},
+ {"1777777777777777777777", 8, 22, 01777777777777777777777},
+
+ {"7f", 16, 2, 0x7f},
+ {"80", 16, 2, 0x80},
+ {"ff", 16, 2, 0xff},
+ {"100", 16, 3, 0x100},
+ {"7fff", 16, 4, 0x7fff},
+ {"8000", 16, 4, 0x8000},
+ {"ffff", 16, 4, 0xffff},
+ {"10000", 16, 5, 0x10000},
+ {"7fffffff", 16, 8, 0x7fffffff},
+ {"80000000", 16, 8, 0x80000000},
+ {"ffffffff", 16, 8, 0xffffffff},
+ {"100000000", 16, 9, 0x100000000},
+ {"7fffffffffffffff", 16, 16, 0x7fffffffffffffff},
+ {"8000000000000000", 16, 16, 0x8000000000000000},
+ {"ffffffffffffffff", 16, 16, 0xffffffffffffffff},
+ /* test sign */
+ {"+0", 10, 2, 0},
+ {"+42", 10, 3, 42},
+ /* test termination */
+ {"42/", 10, 2, 42},
+ {"42:", 10, 2, 42},
+ {"42/", 8, 2, 042},
+ {"428", 8, 2, 042},
+ {"42/", 16, 2, 0x42},
+ {"42`", 16, 2, 0x42},
+ {"42g", 16, 2, 0x42},
+ {"42@", 16, 2, 0x42},
+ {"42G", 16, 2, 0x42},
+ /* base autodetection */
+ {"010", 0, 3, 8},
+ {"0x10", 0, 4, 16},
+ {"0X10", 0, 4, 16},
+};
+
+static void __init test_parse_integer_ull_ok(void)
+{
+ TEST_OK(unsigned long long, "%llu", test_ull_ok);
+}
+
+DEFINE_TEST_FAIL(unsigned long long, test_ull_fail)
+{
+ /* type overflow */
+ {"10000000000000000000000000000000000000000000000000000000000000000", 2},
+ {"18446744073709551616", 10},
+ {"2000000000000000000000", 8},
+ {"10000000000000000", 16},
+
+ {"", 0},
+ {"", 10},
+ {"", 8},
+ {"", 16},
+ {"+", 0},
+ {"+", 10},
+ {"+", 8},
+ {"+", 16},
+ {"-", 0},
+ {"-", 10},
+ {"-", 8},
+ {"-", 16},
+ {" ", 0},
+ {" ", 10},
+ {" ", 8},
+ {" ", 16},
+ {"\n", 0},
+ {"\n", 10},
+ {"\n", 8},
+ {"\n", 16},
+ {" 0", 0},
+ {" 0", 10},
+ {" 0", 8},
+ {" 0", 16},
+ {"\n0", 0},
+ {"\n0", 10},
+ {"\n0", 8},
+ {"\n0", 16},
+ /* non-digit */
+ {"/", 10},
+ {":", 10},
+ {"/", 8},
+ {"8", 8},
+ {"/", 16},
+ {":", 16},
+ {"`", 16},
+ {"g", 16},
+ {"@", 16},
+ {"G", 16},
+ {"/0", 10},
+ {":0", 10},
+ {"/0", 8},
+ {"80", 8},
+ {"/0", 16},
+ {":0", 16},
+ {"`0", 16},
+ {"g0", 16},
+ {"@0", 16},
+ {"G0", 16},
+
+ {"-0", 0},
+ {"-0", 10},
+ {"-0", 8},
+ {"-0", 16},
+ {"-1", 0},
+ {"-1", 10},
+ {"-1", 8},
+ {"-1", 16},
+ /* accept only one sign */
+ {"--", 0},
+ {"--", 10},
+ {"--", 8},
+ {"--", 16},
+ {"-+", 0},
+ {"-+", 10},
+ {"-+", 8},
+ {"-+", 16},
+ {"+-", 0},
+ {"+-", 10},
+ {"+-", 8},
+ {"+-", 16},
+ {"++", 0},
+ {"++", 10},
+ {"++", 8},
+ {"++", 16},
+ {"--0", 0},
+ {"--0", 10},
+ {"--0", 8},
+ {"--0", 16},
+ {"-+0", 0},
+ {"-+0", 10},
+ {"-+0", 8},
+ {"-+0", 16},
+ {"+-0", 0},
+ {"+-0", 10},
+ {"+-0", 8},
+ {"+-0", 16},
+ {"++0", 0},
+ {"++0", 10},
+ {"++0", 8},
+ {"++0", 16},
+};
+
+static void __init test_parse_integer_ull_fail(void)
+{
+ TEST_FAIL(unsigned long long, "%llu", test_ull_fail);
+}
+
+DEFINE_TEST_OK(long long, struct test_ll, test_ll_ok)
+{
+ {"-9223372036854775808",10, 20, LLONG_MIN},
+ {"-4294967296", 10, 11, -4294967296},
+ {"-2147483648", 10, 11, -2147483648ll},
+ {"-65536", 10, 6, -65536},
+ {"-32768", 10, 6, -32768},
+ {"-256", 10, 4, -256},
+ {"-128", 10, 4, -128},
+ {"-0", 10, 2, 0},
+ {"0", 10, 1, 0},
+ {"127", 10, 3, 127},
+ {"255", 10, 3, 255},
+ {"32767", 10, 5, 32767},
+ {"65535", 10, 5, 65535},
+ {"2147483647", 10, 10, 2147483647},
+ {"4294967295", 10, 10, 4294967295ll},
+ {"9223372036854775807", 10, 19, 9223372036854775807},
+};
+
+static void __init test_parse_integer_ll_ok(void)
+{
+ TEST_OK(long long, "%lld", test_ll_ok);
+}
+
+DEFINE_TEST_FAIL(long long, test_ll_fail)
+{
+ {"-9223372036854775809", 10},
+ {"9223372036854775808", 10},
+};
+
+static void __init test_parse_integer_ll_fail(void)
+{
+ TEST_FAIL(long long, "%lld", test_ll_fail);
+}
+
+DEFINE_TEST_OK(unsigned int, struct test_u, test_u_ok)
+{
+ {"0", 10, 1, 0},
+ {"127", 10, 3, 127},
+ {"128", 10, 3, 128},
+ {"255", 10, 3, 255},
+ {"256", 10, 3, 256},
+ {"32767", 10, 5, 32767},
+ {"32768", 10, 5, 32768},
+ {"65535", 10, 5, 65535},
+ {"65536", 10, 5, 65536},
+ {"2147483647", 10, 10, 2147483647},
+ {"2147483648", 10, 10, 2147483648u},
+ {"4294967295", 10, 10, 4294967295u},
+};
+
+static void __init test_parse_integer_u_ok(void)
+{
+ TEST_OK(unsigned int, "%u", test_u_ok);
+}
+
+DEFINE_TEST_FAIL(unsigned int, test_u_fail)
+{
+ {"4294967296", 10},
+ {"9223372036854775807", 10},
+ {"9223372036854775808", 10},
+ {"18446744073709551615", 10},
+};
+
+static void __init test_parse_integer_u_fail(void)
+{
+ TEST_FAIL(unsigned int, "%u", test_u_fail);
+}
+
+DEFINE_TEST_OK(int, struct test_i, test_i_ok)
+{
+ {"-2147483648", 10, 11, INT_MIN},
+ {"-65536", 10, 6, -65536},
+ {"-32768", 10, 6, -32768},
+ {"-256", 10, 4, -256},
+ {"-128", 10, 4, -128},
+ {"-0", 10, 2, 0},
+ {"0", 10, 1, 0},
+ {"127", 10, 3, 127},
+ {"255", 10, 3, 255},
+ {"32767", 10, 5, 32767},
+ {"65535", 10, 5, 65535},
+ {"2147483647", 10, 10, 2147483647},
+};
+
+static void __init test_parse_integer_i_ok(void)
+{
+ TEST_OK(int, "%d", test_i_ok);
+}
+
+DEFINE_TEST_FAIL(int, test_i_fail)
+{
+ {"-9223372036854775809", 10},
+ {"-9223372036854775808", 10},
+ {"-4294967296", 10},
+ {"-2147483649", 10},
+ {"2147483648", 10},
+ {"4294967295", 10},
+ {"9223372036854775807", 10},
+ {"9223372036854775808", 10},
+};
+
+static void __init test_parse_integer_i_fail(void)
+{
+ TEST_FAIL(int, "%d", test_i_fail);
+}
+
+DEFINE_TEST_OK(unsigned short, struct test_us, test_us_ok)
+{
+ {"0", 10, 1, 0},
+ {"127", 10, 3, 127},
+ {"128", 10, 3, 128},
+ {"255", 10, 3, 255},
+ {"256", 10, 3, 256},
+ {"32767", 10, 5, 32767},
+ {"32768", 10, 5, 32768},
+ {"65535", 10, 5, 65535},
+};
+
+static void __init test_parse_integer_us_ok(void)
+{
+ TEST_OK(unsigned short, "%hu", test_us_ok);
+}
+
+DEFINE_TEST_FAIL(unsigned short, test_us_fail)
+{
+ {"65536", 10},
+ {"2147483647", 10},
+ {"2147483648", 10},
+ {"4294967295", 10},
+ {"4294967296", 10},
+ {"9223372036854775807", 10},
+ {"9223372036854775808", 10},
+ {"18446744073709551615", 10},
+};
+
+static void __init test_parse_integer_us_fail(void)
+{
+ TEST_FAIL(unsigned short, "%hu", test_us_fail);
+}
+
+DEFINE_TEST_OK(short, struct test_s, test_s_ok)
+{
+ {"-32768", 10, 6, -32768},
+ {"-256", 10, 4, -256},
+ {"-128", 10, 4, -128},
+ {"-0", 10, 2, 0},
+ {"0", 10, 1, 0},
+ {"127", 10, 3, 127},
+ {"255", 10, 3, 255},
+ {"32767", 10, 5, 32767},
+};
+
+static void __init test_parse_integer_s_ok(void)
+{
+ TEST_OK(short, "%hd", test_s_ok);
+}
+
+DEFINE_TEST_FAIL(short, test_s_fail)
+{
+ {"-9223372036854775809", 10},
+ {"-9223372036854775808", 10},
+ {"-4294967296", 10},
+ {"-2147483649", 10},
+ {"-2147483648", 10},
+ {"-65536", 10},
+ {"-32769", 10},
+ {"32768", 10},
+ {"65535", 10},
+ {"2147483647", 10},
+ {"2147483648", 10},
+ {"4294967295", 10},
+ {"9223372036854775807", 10},
+ {"9223372036854775808", 10},
+};
+
+static void __init test_parse_integer_s_fail(void)
+{
+ TEST_FAIL(short, "%hd", test_s_fail);
+}
+
+DEFINE_TEST_OK(unsigned char, struct test_uc, test_uc_ok)
+{
+ {"0", 10, 1, 0},
+ {"127", 10, 3, 127},
+ {"128", 10, 3, 128},
+ {"255", 10, 3, 255},
+};
+
+static void __init test_parse_integer_uc_ok(void)
+{
+ TEST_OK(unsigned char, "%hhu", test_uc_ok);
+}
+
+DEFINE_TEST_FAIL(unsigned char, test_uc_fail)
+{
+ {"256", 10},
+ {"32767", 10},
+ {"32768", 10},
+ {"65535", 10},
+ {"65536", 10},
+ {"2147483647", 10},
+ {"2147483648", 10},
+ {"4294967295", 10},
+ {"4294967296", 10},
+ {"9223372036854775807", 10},
+ {"9223372036854775808", 10},
+ {"18446744073709551615", 10},
+};
+
+static void __init test_parse_integer_uc_fail(void)
+{
+ TEST_FAIL(unsigned char, "%hhu", test_uc_fail);
+}
+
+DEFINE_TEST_OK(signed char, struct test_sc, test_sc_ok)
+{
+ {"-128", 10, 4, -128},
+ {"-0", 10, 2, 0},
+ {"0", 10, 1, 0},
+ {"127", 10, 3, 127},
+};
+
+static void __init test_parse_integer_sc_ok(void)
+{
+ TEST_OK(signed char, "%hhd", test_sc_ok);
+}
+
+DEFINE_TEST_FAIL(signed char, test_sc_fail)
+{
+ {"-9223372036854775809", 10},
+ {"-9223372036854775808", 10},
+ {"-4294967296", 10},
+ {"-2147483649", 10},
+ {"-2147483648", 10},
+ {"-65536", 10},
+ {"-32769", 10},
+ {"-32768", 10},
+ {"-256", 10},
+ {"-129", 10},
+ {"128", 10},
+ {"255", 10},
+ {"32767", 10},
+ {"32768", 10},
+ {"65535", 10},
+ {"2147483647", 10},
+ {"2147483648", 10},
+ {"4294967295", 10},
+ {"9223372036854775807", 10},
+ {"9223372036854775808", 10},
+};
+
+static void __init test_parse_integer_sc_fail(void)
+{
+ TEST_FAIL(signed char, "%hhd", test_sc_fail);
+}
+
+static int __init test_parse_integer_init(void)
+{
+ test_parse_integer_ull_ok();
+ test_parse_integer_ull_fail();
+ test_parse_integer_ll_ok();
+ test_parse_integer_ll_fail();
+ test_parse_integer_u_ok();
+ test_parse_integer_u_fail();
+ test_parse_integer_i_ok();
+ test_parse_integer_i_fail();
+ test_parse_integer_us_ok();
+ test_parse_integer_us_fail();
+ test_parse_integer_s_ok();
+ test_parse_integer_s_fail();
+ test_parse_integer_uc_ok();
+ test_parse_integer_uc_fail();
+ test_parse_integer_sc_ok();
+ test_parse_integer_sc_fail();
+ return -EINVAL;
+}
+module_init(test_parse_integer_init);
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 95cd63b43b99..7f0cdd2e609f 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -1361,6 +1361,21 @@ char *clock(char *buf, char *end, struct clk *clk, struct printf_spec spec,
}
}
+static noinline_for_stack
+char *comm_name(char *buf, char *end, struct task_struct *tsk,
+ struct printf_spec spec, const char *fmt)
+{
+ char name[TASK_COMM_LEN];
+
+ /* Caller can pass NULL instead of current. */
+ if (!tsk)
+ tsk = current;
+ /* Not using get_task_comm() in case I'm in IRQ context. */
+ memcpy(name, tsk->comm, TASK_COMM_LEN);
+ name[sizeof(name) - 1] = '\0';
+ return string(buf, end, name, spec);
+}
+
int kptr_restrict __read_mostly;
/*
@@ -1448,6 +1463,7 @@ int kptr_restrict __read_mostly;
* - 'Cn' For a clock, it prints the name (Common Clock Framework) or address
* (legacy clock framework) of the clock
* - 'Cr' For a clock, it prints the current rate of the clock
+ * - 'T' task_struct->comm
*
* Note: The difference between 'S' and 'F' is that on ia64 and ppc64
* function pointers are really function descriptors, which contain a
@@ -1459,7 +1475,7 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
{
int default_width = 2 * sizeof(void *) + (spec.flags & SPECIAL ? 2 : 0);
- if (!ptr && *fmt != 'K') {
+ if (!ptr && *fmt != 'K' && *fmt != 'T') {
/*
* Print (null) with the same width as a pointer so it makes
* tabular output look nice.
@@ -1598,6 +1614,8 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
return dentry_name(buf, end,
((const struct file *)ptr)->f_path.dentry,
spec, fmt);
+ case 'T':
+ return comm_name(buf, end, ptr, spec, fmt);
}
spec.flags |= SMALL;
if (spec.field_width == -1) {
@@ -2471,8 +2489,6 @@ EXPORT_SYMBOL_GPL(bprintf);
int vsscanf(const char *buf, const char *fmt, va_list args)
{
const char *str = buf;
- char *next;
- char digit;
int num = 0;
u8 qualifier;
unsigned int base;
@@ -2484,6 +2500,8 @@ int vsscanf(const char *buf, const char *fmt, va_list args)
bool is_sign;
while (*fmt) {
+ int len;
+
/* skip any white space in format */
* white space in format matches any amount of
* white space, including none, in the input.
@@ -2612,81 +2630,88 @@ int vsscanf(const char *buf, const char *fmt, va_list args)
*/
str = skip_spaces(str);
- digit = *str;
- if (is_sign && digit == '-')
- digit = *(str + 1);
-
- if (!digit
- || (base == 16 && !isxdigit(digit))
- || (base == 10 && !isdigit(digit))
- || (base == 8 && (!isdigit(digit) || digit > '7'))
- || (base == 0 && !isdigit(digit)))
- break;
-
if (is_sign)
- val.s = qualifier != 'L' ?
- simple_strtol(str, &next, base) :
- simple_strtoll(str, &next, base);
+ len = parse_integer(str, base, &val.s);
else
- val.u = qualifier != 'L' ?
- simple_strtoul(str, &next, base) :
- simple_strtoull(str, &next, base);
+ len = parse_integer(str, base, &val.u);
+ if (len < 0)
+ break;
- if (field_width > 0 && next - str > field_width) {
+ if (field_width > 0) {
if (base == 0)
_parse_integer_fixup_radix(str, &base);
- while (next - str > field_width) {
+ while (len > field_width) {
if (is_sign)
val.s = div_s64(val.s, base);
else
val.u = div_u64(val.u, base);
- --next;
+ len--;
}
}
switch (qualifier) {
case 'H': /* that's 'hh' in format */
- if (is_sign)
+ if (is_sign) {
+ if (val.s != (signed char)val.s)
+ goto out;
*va_arg(args, signed char *) = val.s;
- else
+ } else {
+ if (val.u != (unsigned char)val.u)
+ goto out;
*va_arg(args, unsigned char *) = val.u;
+ }
break;
case 'h':
- if (is_sign)
+ if (is_sign) {
+ if (val.s != (short)val.s)
+ goto out;
*va_arg(args, short *) = val.s;
- else
+ } else {
+ if (val.u != (unsigned short)val.u)
+ goto out;
*va_arg(args, unsigned short *) = val.u;
+ }
break;
case 'l':
- if (is_sign)
+ if (is_sign) {
+ if (val.s != (long)val.s)
+ goto out;
*va_arg(args, long *) = val.s;
- else
+ } else {
+ if (val.u != (unsigned long)val.u)
+ goto out;
*va_arg(args, unsigned long *) = val.u;
+ }
break;
case 'L':
- if (is_sign)
+ if (is_sign) {
*va_arg(args, long long *) = val.s;
- else
+ } else {
*va_arg(args, unsigned long long *) = val.u;
+ }
break;
case 'Z':
case 'z':
+ if (val.u != (size_t)val.u)
+ goto out;
*va_arg(args, size_t *) = val.u;
break;
default:
- if (is_sign)
+ if (is_sign) {
+ if (val.s != (int)val.s)
+ goto out;
*va_arg(args, int *) = val.s;
- else
+ } else {
+ if (val.u != (unsigned int)val.u)
+ goto out;
*va_arg(args, unsigned int *) = val.u;
+ }
break;
}
num++;
-
- if (!next)
- break;
- str = next;
+ str += len;
}
-
+out:
return num;
}
EXPORT_SYMBOL(vsscanf);
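Two caller-visible effects of the vsprintf.c changes above, sketched with an illustrative task pointer and format strings: the new '%pT' specifier prints task_struct->comm, and vsscanf() now refuses values that do not fit the destination type.

#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/sched.h>

/* Illustrative sketch only; not part of the patch. */
static void vsprintf_changes_sketch(struct task_struct *tsk)
{
	unsigned char small;

	/* NULL is accepted by %pT and means current */
	pr_info("handled by %pT (caller: %pT)\n", tsk, NULL);

	/* 300 does not fit in u8: sscanf() now converts nothing and returns 0 */
	if (sscanf("300", "%hhu", &small) != 1)
		pr_info("out-of-range value rejected\n");
}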
diff --git a/lib/zlib_deflate/deftree.c b/lib/zlib_deflate/deftree.c
index ddf348299f24..9b1756b12743 100644
--- a/lib/zlib_deflate/deftree.c
+++ b/lib/zlib_deflate/deftree.c
@@ -35,6 +35,7 @@
/* #include "deflate.h" */
#include <linux/zutil.h>
+#include <linux/bitrev.h>
#include "defutil.h"
#ifdef DEBUG_ZLIB
@@ -146,7 +147,6 @@ static void send_all_trees (deflate_state *s, int lcodes, int dcodes,
static void compress_block (deflate_state *s, ct_data *ltree,
ct_data *dtree);
static void set_data_type (deflate_state *s);
-static unsigned bi_reverse (unsigned value, int length);
static void bi_windup (deflate_state *s);
static void bi_flush (deflate_state *s);
static void copy_block (deflate_state *s, char *buf, unsigned len,
@@ -284,7 +284,7 @@ static void tr_static_init(void)
/* The static distance tree is trivial: */
for (n = 0; n < D_CODES; n++) {
static_dtree[n].Len = 5;
- static_dtree[n].Code = bi_reverse((unsigned)n, 5);
+ static_dtree[n].Code = bitrev32((u32)n) >> (32 - 5);
}
static_init_done = 1;
}
@@ -520,7 +520,7 @@ static void gen_codes(
int len = tree[n].Len;
if (len == 0) continue;
/* Now reverse the bits */
- tree[n].Code = bi_reverse(next_code[len]++, len);
+ tree[n].Code = bitrev32((u32)(next_code[len]++)) >> (32 - len);
Tracecv(tree != static_ltree, (stderr,"\nn %3d %c l %2d c %4x (%x) ",
n, (isgraph(n) ? n : ' '), len, tree[n].Code, next_code[len]-1));
diff --git a/lib/zlib_deflate/defutil.h b/lib/zlib_deflate/defutil.h
index b640b6402e99..a8c370897c9f 100644
--- a/lib/zlib_deflate/defutil.h
+++ b/lib/zlib_deflate/defutil.h
@@ -293,22 +293,6 @@ void zlib_tr_stored_type_only (deflate_state *);
}
/* ===========================================================================
- * Reverse the first len bits of a code, using straightforward code (a faster
- * method would use a table)
- * IN assertion: 1 <= len <= 15
- */
-static inline unsigned bi_reverse(unsigned code, /* the value to invert */
- int len) /* its bit length */
-{
- register unsigned res = 0;
- do {
- res |= code & 1;
- code >>= 1, res <<= 1;
- } while (--len > 0);
- return res >> 1;
-}
-
-/* ===========================================================================
* Flush the bit buffer, keeping at most 7 bits in it.
*/
static inline void bi_flush(deflate_state *s)
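The bi_reverse() helper removed above is replaced by the generic bit-reversal primitive: reversing the low len bits reduces to a full 32-bit reversal plus a shift, as in this sketch (the wrapper name is illustrative).

#include <linux/bitrev.h>

/* Illustrative sketch only; not part of the patch. */
static inline unsigned int reverse_low_bits(unsigned int code, int len)
{
	/* reverse all 32 bits, then keep the top 'len' of the result */
	return bitrev32((u32)code) >> (32 - len);
}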
diff --git a/mm/Kconfig b/mm/Kconfig
index 0fb2e96653fe..1163c4634be3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -651,3 +651,15 @@ config DEFERRED_STRUCT_PAGE_INIT
config FRAME_VECTOR
bool
+
+config IDLE_PAGE_TRACKING
+ bool "Enable idle page tracking"
+ depends on SYSFS && MMU
+ select PAGE_EXTENSION if !64BIT
+ help
+ This feature allows estimating the number of user pages that have
+ not been touched during a given period of time. This information can
+ be useful to tune memory cgroup limits and/or for job placement
+ within a compute cluster.
+
+ See Documentation/vm/idle_page_tracking.txt for more details.
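A userspace sketch of how the option above is typically consumed; the sysfs path comes from the referenced documentation and should be treated as an assumption here, as is the pfn range.

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

/* Illustrative userspace sketch only; not part of the patch. */
static int mark_pfns_idle(uint64_t start_pfn, uint64_t nr_words)
{
	int fd = open("/sys/kernel/mm/page_idle/bitmap", O_WRONLY);
	uint64_t all_idle = ~0ULL;
	uint64_t i;

	if (fd < 0)
		return -1;
	/* each 64-bit word covers 64 consecutive page frames */
	for (i = 0; i < nr_words; i++)
		if (pwrite(fd, &all_idle, sizeof(all_idle),
			   (start_pfn / 64 + i) * sizeof(all_idle)) < 0)
			break;
	close(fd);
	return 0;
}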
diff --git a/mm/Makefile b/mm/Makefile
index be5d5c866305..395742943d59 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -79,3 +79,5 @@ obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
+obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
+obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index a23dd1934654..3b6380784c28 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -236,6 +236,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
count += pages;
while (pages--)
__free_pages_bootmem(page++, cur++, 0);
+ bdata->node_bootmem_map = NULL;
bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
@@ -294,6 +295,9 @@ static void __init __free(bootmem_data_t *bdata,
sidx + bdata->node_min_pfn,
eidx + bdata->node_min_pfn);
+ if (WARN_ON(bdata->node_bootmem_map == NULL))
+ return;
+
if (bdata->hint_idx > sidx)
bdata->hint_idx = sidx;
@@ -314,6 +318,9 @@ static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx,
eidx + bdata->node_min_pfn,
flags);
+ if (WARN_ON(bdata->node_bootmem_map == NULL))
+ return 0;
+
for (idx = sidx; idx < eidx; idx++)
if (test_and_set_bit(idx, bdata->node_bootmem_map)) {
if (exclusive) {
diff --git a/mm/compaction.c b/mm/compaction.c
index 018f08da99a2..8f64d3533990 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -207,6 +207,13 @@ static inline bool isolation_suitable(struct compact_control *cc,
return !get_pageblock_skip(page);
}
+static void reset_cached_positions(struct zone *zone)
+{
+ zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
+ zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
+ zone->compact_cached_free_pfn = zone_end_pfn(zone);
+}
+
/*
* This function is called to clear all cached information on pageblocks that
* should be skipped for page isolation when the migrate and free page scanner
@@ -218,9 +225,6 @@ static void __reset_isolation_suitable(struct zone *zone)
unsigned long end_pfn = zone_end_pfn(zone);
unsigned long pfn;
- zone->compact_cached_migrate_pfn[0] = start_pfn;
- zone->compact_cached_migrate_pfn[1] = start_pfn;
- zone->compact_cached_free_pfn = end_pfn;
zone->compact_blockskip_flush = false;
/* Walk the zone and mark every pageblock as suitable for isolation */
@@ -238,6 +242,8 @@ static void __reset_isolation_suitable(struct zone *zone)
clear_pageblock_skip(page);
}
+
+ reset_cached_positions(zone);
}
void reset_isolation_suitable(pg_data_t *pgdat)
@@ -431,6 +437,24 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
if (!valid_page)
valid_page = page;
+
+ /*
+ * For compound pages such as THP and hugetlbfs, we can save
+ * potentially a lot of iterations if we skip them at once.
+ * The check is racy, but we can consider only valid values
+ * and the only danger is skipping too much.
+ */
+ if (PageCompound(page)) {
+ unsigned int comp_order = compound_order(page);
+
+ if (likely(comp_order < MAX_ORDER)) {
+ blockpfn += (1UL << comp_order) - 1;
+ cursor += (1UL << comp_order) - 1;
+ }
+
+ goto isolate_fail;
+ }
+
if (!PageBuddy(page))
goto isolate_fail;
@@ -490,6 +514,13 @@ isolate_fail:
}
+ /*
+ * There is a tiny chance that we have read bogus compound_order(),
+ * so be careful to not go outside of the pageblock.
+ */
+ if (unlikely(blockpfn > end_pfn))
+ blockpfn = end_pfn;
+
trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
nr_scanned, total_isolated);
@@ -674,6 +705,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/* Time to isolate some pages for migration */
for (; low_pfn < end_pfn; low_pfn++) {
+ bool is_lru;
+
/*
* Periodically drop the lock (if held) regardless of its
* contention, to give chance to IRQs. Abort async compaction
@@ -717,36 +750,35 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* It's possible to migrate LRU pages and balloon pages
* Skip any other type of page
*/
- if (!PageLRU(page)) {
+ is_lru = PageLRU(page);
+ if (!is_lru) {
if (unlikely(balloon_page_movable(page))) {
if (balloon_page_isolate(page)) {
/* Successfully isolated */
goto isolate_success;
}
}
- continue;
}
/*
- * PageLRU is set. lru_lock normally excludes isolation
- * splitting and collapsing (collapsing has already happened
- * if PageLRU is set) but the lock is not necessarily taken
- * here and it is wasteful to take it just to check transhuge.
- * Check TransHuge without lock and skip the whole pageblock if
- * it's either a transhuge or hugetlbfs page, as calling
- * compound_order() without preventing THP from splitting the
- * page underneath us may return surprising results.
+ * Regardless of being on LRU, compound pages such as THP and
+ * hugetlbfs are not to be compacted. We can potentially save
+ * a lot of iterations if we skip them at once. The check is
+ * racy, but we can consider only valid values and the only
+ * danger is skipping too much.
*/
- if (PageTransHuge(page)) {
- if (!locked)
- low_pfn = ALIGN(low_pfn + 1,
- pageblock_nr_pages) - 1;
- else
- low_pfn += (1 << compound_order(page)) - 1;
+ if (PageCompound(page)) {
+ unsigned int comp_order = compound_order(page);
+
+ if (likely(comp_order < MAX_ORDER))
+ low_pfn += (1UL << comp_order) - 1;
continue;
}
+ if (!is_lru)
+ continue;
+
/*
* Migration will fail if an anonymous page is pinned in memory,
* so avoid taking lru_lock and isolating it unnecessarily in an
@@ -763,11 +795,17 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (!locked)
break;
- /* Recheck PageLRU and PageTransHuge under lock */
+ /* Recheck PageLRU and PageCompound under lock */
if (!PageLRU(page))
continue;
- if (PageTransHuge(page)) {
- low_pfn += (1 << compound_order(page)) - 1;
+
+ /*
+ * Page became compound since the non-locked check,
+ * and it's on LRU. It can only be a THP so the order
+ * is safe to read and it's 0 for tail pages.
+ */
+ if (unlikely(PageCompound(page))) {
+ low_pfn += (1UL << compound_order(page)) - 1;
continue;
}
}
@@ -778,7 +816,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (__isolate_lru_page(page, isolate_mode) != 0)
continue;
- VM_BUG_ON_PAGE(PageTransCompound(page), page);
+ VM_BUG_ON_PAGE(PageCompound(page), page);
/* Successfully isolated */
del_page_from_lru_list(page, lruvec, page_lru(page));
@@ -898,6 +936,16 @@ static bool suitable_migration_target(struct page *page)
}
/*
+ * Test whether the free scanner has reached the same or lower pageblock than
+ * the migration scanner, and compaction should thus terminate.
+ */
+static inline bool compact_scanners_met(struct compact_control *cc)
+{
+ return (cc->free_pfn >> pageblock_order)
+ <= (cc->migrate_pfn >> pageblock_order);
+}
+
+/*
* Based on information in the current compact_control, find blocks
* suitable for isolating free pages from and then isolate them.
*/
@@ -933,8 +981,7 @@ static void isolate_freepages(struct compact_control *cc)
* pages on cc->migratepages. We stop searching if the migrate
* and free page scanners meet or enough free pages are isolated.
*/
- for (; block_start_pfn >= low_pfn &&
- cc->nr_migratepages > cc->nr_freepages;
+ for (; block_start_pfn >= low_pfn;
block_end_pfn = block_start_pfn,
block_start_pfn -= pageblock_nr_pages,
isolate_start_pfn = block_start_pfn) {
@@ -966,6 +1013,8 @@ static void isolate_freepages(struct compact_control *cc)
block_end_pfn, freelist, false);
/*
+ * If we isolated enough freepages, or aborted due to async
+ * compaction being contended, terminate the loop.
* Remember where the free scanner should restart next time,
* which is where isolate_freepages_block() left off.
* But if it scanned the whole pageblock, isolate_start_pfn
@@ -974,27 +1023,31 @@ static void isolate_freepages(struct compact_control *cc)
* In that case we will however want to restart at the start
* of the previous pageblock.
*/
- cc->free_pfn = (isolate_start_pfn < block_end_pfn) ?
- isolate_start_pfn :
- block_start_pfn - pageblock_nr_pages;
-
- /*
- * isolate_freepages_block() might have aborted due to async
- * compaction being contended
- */
- if (cc->contended)
+ if ((cc->nr_freepages >= cc->nr_migratepages)
+ || cc->contended) {
+ if (isolate_start_pfn >= block_end_pfn)
+ isolate_start_pfn =
+ block_start_pfn - pageblock_nr_pages;
break;
+ } else {
+ /*
+ * isolate_freepages_block() should not terminate
+ * prematurely unless contended, or isolated enough
+ */
+ VM_BUG_ON(isolate_start_pfn < block_end_pfn);
+ }
}
/* split_free_page does not map the pages */
map_pages(freelist);
/*
- * If we crossed the migrate scanner, we want to keep it that way
- * so that compact_finished() may detect this
+ * Record where the free scanner will restart next time. Either we
+ * broke from the loop and set isolate_start_pfn based on the last
+ * call to isolate_freepages_block(), or we met the migration scanner
+ * and the loop terminated due to isolate_start_pfn < low_pfn
*/
- if (block_start_pfn < low_pfn)
- cc->free_pfn = cc->migrate_pfn;
+ cc->free_pfn = isolate_start_pfn;
}
/*
@@ -1127,12 +1180,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
}
acct_isolated(zone, cc);
- /*
- * Record where migration scanner will be restarted. If we end up in
- * the same pageblock as the free scanner, make the scanners fully
- * meet so that compact_finished() terminates compaction.
- */
- cc->migrate_pfn = (end_pfn <= cc->free_pfn) ? low_pfn : cc->free_pfn;
+ /* Record where migration scanner will be restarted. */
+ cc->migrate_pfn = low_pfn;
return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
}
@@ -1147,11 +1196,9 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
return COMPACT_PARTIAL;
/* Compaction run completes if the migrate and free scanner meet */
- if (cc->free_pfn <= cc->migrate_pfn) {
+ if (compact_scanners_met(cc)) {
/* Let the next compaction start anew. */
- zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
- zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
- zone->compact_cached_free_pfn = zone_end_pfn(zone);
+ reset_cached_positions(zone);
/*
* Mark that the PG_migrate_skip information should be cleared
@@ -1376,7 +1423,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
* migrate_pages() may return -ENOMEM when scanners meet
* and we want compact_finished() to detect it
*/
- if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) {
+ if (err == -ENOMEM && !compact_scanners_met(cc)) {
ret = COMPACT_PARTIAL;
goto out;
}
diff --git a/mm/debug.c b/mm/debug.c
index 76089ddf99ea..e784110fb51d 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -48,6 +48,10 @@ static const struct trace_print_flags pageflag_names[] = {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
{1UL << PG_compound_lock, "compound_lock" },
#endif
+#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
+ {1UL << PG_young, "young" },
+ {1UL << PG_idle, "idle" },
+#endif
};
static void dump_flags(unsigned long flags,
@@ -121,6 +125,7 @@ static const struct trace_print_flags vmaflags_names[] = {
{VM_GROWSDOWN, "growsdown" },
{VM_PFNMAP, "pfnmap" },
{VM_DENYWRITE, "denywrite" },
+ {VM_LOCKONFAULT, "lockonfault" },
{VM_LOCKED, "locked" },
{VM_IO, "io" },
{VM_SEQ_READ, "seqread" },
diff --git a/mm/dmapool.c b/mm/dmapool.c
index fd5fe4342e93..71a8998cd03a 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -242,7 +242,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
return page;
}
-static inline int is_page_busy(struct dma_page *page)
+static inline bool is_page_busy(struct dma_page *page)
{
return page->in_use != 0;
}
@@ -271,6 +271,9 @@ void dma_pool_destroy(struct dma_pool *pool)
{
bool empty = false;
+ if (unlikely(!pool))
+ return;
+
mutex_lock(&pools_reg_lock);
mutex_lock(&pools_lock);
list_del(&pool->pools);
@@ -334,7 +337,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
/* pool_alloc_page() might sleep, so temporarily drop &pool->lock */
spin_unlock_irqrestore(&pool->lock, flags);
- page = pool_alloc_page(pool, mem_flags);
+ page = pool_alloc_page(pool, mem_flags & (~__GFP_ZERO));
if (!page)
return NULL;
@@ -372,9 +375,14 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
break;
}
}
- memset(retval, POOL_POISON_ALLOCATED, pool->size);
+ if (!(mem_flags & __GFP_ZERO))
+ memset(retval, POOL_POISON_ALLOCATED, pool->size);
#endif
spin_unlock_irqrestore(&pool->lock, flags);
+
+ if (mem_flags & __GFP_ZERO)
+ memset(retval, 0, pool->size);
+
return retval;
}
EXPORT_SYMBOL(dma_pool_alloc);
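With the hunks above, dma_pool_alloc() honours __GFP_ZERO and dma_pool_destroy(NULL) becomes a no-op; a minimal sketch follows, with the pool and descriptor naming as assumptions.

#include <linux/dmapool.h>
#include <linux/gfp.h>

/* Illustrative sketch only; not part of the patch. */
static void *zeroed_desc_sketch(struct dma_pool *pool, dma_addr_t *dma)
{
	/* no separate memset() of the descriptor is needed any more */
	return dma_pool_alloc(pool, GFP_KERNEL | __GFP_ZERO, dma);
}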
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
index 0cfadafb3fb0..73c15d73ec32 100644
--- a/mm/early_ioremap.c
+++ b/mm/early_ioremap.c
@@ -217,6 +217,28 @@ early_memremap(resource_size_t phys_addr, unsigned long size)
return (__force void *)__early_ioremap(phys_addr, size,
FIXMAP_PAGE_NORMAL);
}
+
+#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
+
+void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size)
+{
+ unsigned long slop, clen;
+ char *p;
+
+ while (size) {
+ slop = src & ~PAGE_MASK;
+ clen = size;
+ if (clen > MAX_MAP_CHUNK - slop)
+ clen = MAX_MAP_CHUNK - slop;
+ p = early_memremap(src & PAGE_MASK, clen + slop);
+ memcpy(dest, p + slop, clen);
+ early_memunmap(p, clen + slop);
+ dest += clen;
+ src += clen;
+ size -= clen;
+ }
+}
+
#ifdef FIXMAP_PAGE_RO
void __init *
early_memremap_ro(resource_size_t phys_addr, unsigned long size)
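The copy_from_early_mem() helper added above lets early boot code pull an arbitrarily sized, arbitrarily aligned blob out of physical memory through the fixmap window; a hedged sketch follows, with the blob location and the include path as assumptions.

#include <linux/init.h>
#include <linux/types.h>
#include <asm/early_ioremap.h>

/* Illustrative sketch only; not part of the patch. */
static void __init copy_boot_blob_sketch(void *dst, phys_addr_t blob_phys,
					 unsigned long blob_len)
{
	/* chunks the copy so it never exceeds the early fixmap mapping size */
	copy_from_early_mem(dst, blob_phys, blob_len);
}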
diff --git a/mm/filemap.c b/mm/filemap.c
index 1283fc825458..204fd1c7c813 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -641,11 +641,11 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
void *shadow = NULL;
int ret;
- __set_page_locked(page);
+ __SetPageLocked(page);
ret = __add_to_page_cache_locked(page, mapping, offset,
gfp_mask, &shadow);
if (unlikely(ret))
- __clear_page_locked(page);
+ __ClearPageLocked(page);
else {
/*
* The page might have been evicted from cache only
@@ -768,6 +768,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue);
*/
void unlock_page(struct page *page)
{
+ page = compound_head(page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
clear_bit_unlock(PG_locked, &page->flags);
smp_mb__after_atomic();
@@ -832,18 +833,20 @@ EXPORT_SYMBOL_GPL(page_endio);
*/
void __lock_page(struct page *page)
{
- DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+ struct page *page_head = compound_head(page);
+ DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
- __wait_on_bit_lock(page_waitqueue(page), &wait, bit_wait_io,
+ __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io,
TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_page);
int __lock_page_killable(struct page *page)
{
- DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+ struct page *page_head = compound_head(page);
+ DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
- return __wait_on_bit_lock(page_waitqueue(page), &wait,
+ return __wait_on_bit_lock(page_waitqueue(page_head), &wait,
bit_wait_io, TASK_KILLABLE);
}
EXPORT_SYMBOL_GPL(__lock_page_killable);
@@ -2473,21 +2476,6 @@ ssize_t generic_perform_write(struct file *file,
iov_iter_count(i));
again:
- /*
- * Bring in the user page that we will copy from _first_.
- * Otherwise there's a nasty deadlock on copying from the
- * same page as we're writing to, without it being marked
- * up-to-date.
- *
- * Not only is this an optimisation, but it is also required
- * to check that the address is actually valid, when atomic
- * usercopies are used, below.
- */
- if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
- status = -EFAULT;
- break;
- }
-
status = a_ops->write_begin(file, mapping, pos, bytes, flags,
&page, &fsdata);
if (unlikely(status < 0))
@@ -2495,8 +2483,17 @@ again:
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
-
+ /*
+ * 'page' is now locked. If we are trying to copy from a
+ * mapping of 'page' in userspace, the copy might fault and
+ * would need PageUptodate() to complete. But the page cannot be
+ * made Uptodate without acquiring the page lock, which we hold.
+ * Deadlock. Avoid with pagefault_disable(). Fix up below with
+ * iov_iter_fault_in_readable().
+ */
+ pagefault_disable();
copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+ pagefault_enable();
flush_dcache_page(page);
status = a_ops->write_end(file, mapping, pos, bytes, copied,
@@ -2519,6 +2516,14 @@ again:
*/
bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
iov_iter_single_seg_count(i));
+ /*
+ * This is the fallback to recover if the copy from
+ * userspace above faults.
+ */
+ if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+ status = -EFAULT;
+ break;
+ }
goto again;
}
pos += copied;
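
The reshuffle above exists for the perfectly legal case where write() is fed from a mapping of the very page being written, which would otherwise deadlock on the page lock. A small user-space program that exercises exactly that case (the path name is made up):

/* cc -o selfwrite selfwrite.c */
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/selfwrite.dat", O_RDWR | O_CREAT, 0600);
	char *map;

	if (fd < 0 || ftruncate(fd, 4096) < 0)
		return 1;
	map = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		return 1;

	/*
	 * The source buffer is a not-yet-faulted mapping of the same page
	 * the write targets.  With the change above, the atomic usercopy
	 * runs under pagefault_disable(), fails cleanly, and
	 * iov_iter_fault_in_readable() brings the page in afterwards,
	 * instead of pre-faulting on every iteration of the loop.
	 */
	if (write(fd, map, 4096) != 4096)
		return 1;

	munmap(map, 4096);
	close(fd);
	return 0;
}
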
diff --git a/mm/gup.c b/mm/gup.c
index 6297f6bccfb1..deafa2c91b36 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -12,7 +12,9 @@
#include <linux/sched.h>
#include <linux/rwsem.h>
#include <linux/hugetlb.h>
+
#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
#include "internal.h"
@@ -32,6 +34,30 @@ static struct page *no_page_table(struct vm_area_struct *vma,
return NULL;
}
+static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
+ pte_t *pte, unsigned int flags)
+{
+ /* No page to get reference */
+ if (flags & FOLL_GET)
+ return -EFAULT;
+
+ if (flags & FOLL_TOUCH) {
+ pte_t entry = *pte;
+
+ if (flags & FOLL_WRITE)
+ entry = pte_mkdirty(entry);
+ entry = pte_mkyoung(entry);
+
+ if (!pte_same(*pte, entry)) {
+ set_pte_at(vma->vm_mm, address, pte, entry);
+ update_mmu_cache(vma, address, pte);
+ }
+ }
+
+ /* Proper page table entry exists, but no corresponding struct page */
+ return -EEXIST;
+}
+
static struct page *follow_page_pte(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd, unsigned int flags)
{
@@ -73,10 +99,21 @@ retry:
page = vm_normal_page(vma, address, pte);
if (unlikely(!page)) {
- if ((flags & FOLL_DUMP) ||
- !is_zero_pfn(pte_pfn(pte)))
- goto bad_page;
- page = pte_page(pte);
+ if (flags & FOLL_DUMP) {
+ /* Avoid special (like zero) pages in core dumps */
+ page = ERR_PTR(-EFAULT);
+ goto out;
+ }
+
+ if (is_zero_pfn(pte_pfn(pte))) {
+ page = pte_page(pte);
+ } else {
+ int ret;
+
+ ret = follow_pfn_pte(vma, address, ptep, flags);
+ page = ERR_PTR(ret);
+ goto out;
+ }
}
if (flags & FOLL_GET)
@@ -92,7 +129,7 @@ retry:
*/
mark_page_accessed(page);
}
- if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
+ if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
/*
* The preliminary mapping check is mainly to avoid the
* pointless overhead of lock_page on the ZERO_PAGE
@@ -114,12 +151,9 @@ retry:
unlock_page(page);
}
}
+out:
pte_unmap_unlock(ptep, ptl);
return page;
-bad_page:
- pte_unmap_unlock(ptep, ptl);
- return ERR_PTR(-EFAULT);
-
no_page:
pte_unmap_unlock(ptep, ptl);
if (!pte_none(pte))
@@ -265,6 +299,9 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
unsigned int fault_flags = 0;
int ret;
+ /* mlock all present pages, but do not fault in new pages */
+ if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
+ return -ENOENT;
/* For mm_populate(), just skip the stack guard page. */
if ((*flags & FOLL_POPULATE) &&
(stack_guard_page_start(vma, address) ||
@@ -489,9 +526,15 @@ retry:
goto next_page;
}
BUG();
- }
- if (IS_ERR(page))
+ } else if (PTR_ERR(page) == -EEXIST) {
+ /*
+ * Proper page table entry exists, but no corresponding
+ * struct page.
+ */
+ goto next_page;
+ } else if (IS_ERR(page)) {
return i ? i : PTR_ERR(page);
+ }
if (pages) {
pages[i] = page;
flush_anon_page(vma, page, start);
@@ -850,7 +893,10 @@ long populate_vma_page_range(struct vm_area_struct *vma,
VM_BUG_ON_VMA(end > vma->vm_end, vma);
VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
- gup_flags = FOLL_TOUCH | FOLL_POPULATE;
+ gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
+ if (vma->vm_flags & VM_LOCKONFAULT)
+ gup_flags &= ~FOLL_POPULATE;
+
/*
* We want to touch writable mappings with a write fault in order
* to break COW, except for shared mappings because these don't COW
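
The FOLL_MLOCK/VM_LOCKONFAULT handling above is what gives mlock2(MLOCK_ONFAULT) its "lock pages only as they are touched" semantics. A user-space sketch; __NR_mlock2 and MLOCK_ONFAULT come from the uapi headers added elsewhere in this series, so the fallback define below is an assumption rather than settled ABI.

#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_mlock2
#error "__NR_mlock2 comes from the patched kernel headers"
#endif
#ifndef MLOCK_ONFAULT
#define MLOCK_ONFAULT	0x01	/* assumed value; check the patched <asm/mman.h> */
#endif

int main(void)
{
	size_t len = 64UL << 20;	/* 64 MiB of address space */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;

	/*
	 * Lock-on-fault: nothing is pre-faulted here, because
	 * populate_vma_page_range() above drops FOLL_POPULATE for
	 * VM_LOCKONFAULT vmas; pages become resident and locked only
	 * as they are first touched.
	 */
	if (syscall(__NR_mlock2, buf, len, MLOCK_ONFAULT))
		return 1;

	memset(buf, 0, 4096);	/* this first page is now resident and locked */
	return 0;
}
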
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 097c7a4bfbd9..7109330c5911 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -16,6 +16,7 @@
#include <linux/swap.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
+#include <linux/dax.h>
#include <linux/kthread.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
@@ -23,6 +24,8 @@
#include <linux/pagemap.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>
+#include <linux/userfaultfd_k.h>
+#include <linux/page_idle.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -104,7 +107,7 @@ static struct khugepaged_scan khugepaged_scan = {
};
-static int set_recommended_min_free_kbytes(void)
+static void set_recommended_min_free_kbytes(void)
{
struct zone *zone;
int nr_zones = 0;
@@ -139,7 +142,6 @@ static int set_recommended_min_free_kbytes(void)
min_free_kbytes = recommended_min;
}
setup_per_zone_wmarks();
- return 0;
}
static int start_stop_khugepaged(void)
@@ -171,12 +173,7 @@ fail:
static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
-static inline bool is_huge_zero_pmd(pmd_t pmd)
-{
- return is_huge_zero_page(pmd_page(pmd));
-}
-
-static struct page *get_huge_zero_page(void)
+struct page *get_huge_zero_page(void)
{
struct page *zero_page;
retry:
@@ -716,21 +713,27 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
struct vm_area_struct *vma,
- unsigned long haddr, pmd_t *pmd,
- struct page *page, gfp_t gfp)
+ unsigned long address, pmd_t *pmd,
+ struct page *page, gfp_t gfp,
+ unsigned int flags)
{
struct mem_cgroup *memcg;
pgtable_t pgtable;
spinlock_t *ptl;
+ unsigned long haddr = address & HPAGE_PMD_MASK;
VM_BUG_ON_PAGE(!PageCompound(page), page);
- if (mem_cgroup_try_charge(page, mm, gfp, &memcg))
- return VM_FAULT_OOM;
+ if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) {
+ put_page(page);
+ count_vm_event(THP_FAULT_FALLBACK);
+ return VM_FAULT_FALLBACK;
+ }
pgtable = pte_alloc_one(mm, haddr);
if (unlikely(!pgtable)) {
mem_cgroup_cancel_charge(page, memcg);
+ put_page(page);
return VM_FAULT_OOM;
}
@@ -750,6 +753,21 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
pte_free(mm, pgtable);
} else {
pmd_t entry;
+
+ /* Deliver the page fault to userland */
+ if (userfaultfd_missing(vma)) {
+ int ret;
+
+ spin_unlock(ptl);
+ mem_cgroup_cancel_charge(page, memcg);
+ put_page(page);
+ pte_free(mm, pgtable);
+ ret = handle_userfault(vma, address, flags,
+ VM_UFFD_MISSING);
+ VM_BUG_ON(ret & VM_FAULT_FALLBACK);
+ return ret;
+ }
+
entry = mk_huge_pmd(page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
page_add_new_anon_rmap(page, vma, haddr);
@@ -760,6 +778,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
atomic_long_inc(&mm->nr_ptes);
spin_unlock(ptl);
+ count_vm_event(THP_FAULT_ALLOC);
}
return 0;
@@ -806,6 +825,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
pgtable_t pgtable;
struct page *zero_page;
bool set;
+ int ret;
pgtable = pte_alloc_one(mm, haddr);
if (unlikely(!pgtable))
return VM_FAULT_OOM;
@@ -816,14 +836,28 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_FALLBACK;
}
ptl = pmd_lock(mm, pmd);
- set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
- zero_page);
- spin_unlock(ptl);
+ ret = 0;
+ set = false;
+ if (pmd_none(*pmd)) {
+ if (userfaultfd_missing(vma)) {
+ spin_unlock(ptl);
+ ret = handle_userfault(vma, address, flags,
+ VM_UFFD_MISSING);
+ VM_BUG_ON(ret & VM_FAULT_FALLBACK);
+ } else {
+ set_huge_zero_page(pgtable, mm, vma,
+ haddr, pmd,
+ zero_page);
+ spin_unlock(ptl);
+ set = true;
+ }
+ } else
+ spin_unlock(ptl);
if (!set) {
pte_free(mm, pgtable);
put_huge_zero_page();
}
- return 0;
+ return ret;
}
gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
@@ -831,14 +865,51 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
- if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) {
- put_page(page);
- count_vm_event(THP_FAULT_FALLBACK);
- return VM_FAULT_FALLBACK;
+ return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp,
+ flags);
+}
+
+static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, unsigned long pfn, pgprot_t prot, bool write)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pmd_t entry;
+ spinlock_t *ptl;
+
+ ptl = pmd_lock(mm, pmd);
+ if (pmd_none(*pmd)) {
+ entry = pmd_mkhuge(pfn_pmd(pfn, prot));
+ if (write) {
+ entry = pmd_mkyoung(pmd_mkdirty(entry));
+ entry = maybe_pmd_mkwrite(entry, vma);
+ }
+ set_pmd_at(mm, addr, pmd, entry);
+ update_mmu_cache_pmd(vma, addr, pmd);
}
+ spin_unlock(ptl);
+}
- count_vm_event(THP_FAULT_ALLOC);
- return 0;
+int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, unsigned long pfn, bool write)
+{
+ pgprot_t pgprot = vma->vm_page_prot;
+ /*
+ * If we had pmd_special, we could avoid all these restrictions,
+ * but we need to be consistent with PTEs and architectures that
+ * can't support a 'special' bit.
+ */
+ BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
+ BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
+ (VM_PFNMAP|VM_MIXEDMAP));
+ BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
+ BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
+
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return VM_FAULT_SIGBUS;
+ if (track_pfn_insert(vma, &pgprot, pfn))
+ return VM_FAULT_SIGBUS;
+ insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write);
+ return VM_FAULT_NOPAGE;
}
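
vmf_insert_pfn_pmd() above is aimed at DAX-style huge faults, where the backing store is device memory with no struct page. A rough, hedged sketch of the call-site shape a huge-fault handler might take; demo_resolve_pfn() stands in for whatever filesystem- or driver-specific lookup produces a PMD-aligned pfn and is not a real API.

#include <linux/huge_mm.h>	/* vmf_insert_pfn_pmd(), added by this series */
#include <linux/mm.h>

/* Illustrative stand-in for the driver's pfn lookup. */
static int demo_resolve_pfn(struct vm_area_struct *vma, unsigned long haddr,
			    unsigned long *pfn)
{
	return -ENXIO;		/* a real implementation fills *pfn */
}

static int demo_huge_fault(struct vm_area_struct *vma, unsigned long addr,
			   pmd_t *pmd, unsigned int flags)
{
	unsigned long pfn;

	if (demo_resolve_pfn(vma, addr & HPAGE_PMD_MASK, &pfn))
		return VM_FAULT_SIGBUS;

	/* The vma must be VM_PFNMAP or VM_MIXEDMAP, as the BUG_ONs above
	 * insist; success is reported as VM_FAULT_NOPAGE. */
	return vmf_insert_pfn_pmd(vma, addr, pmd, pfn,
				  flags & FAULT_FLAG_WRITE);
}
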
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -873,16 +944,14 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
*/
if (is_huge_zero_pmd(pmd)) {
struct page *zero_page;
- bool set;
/*
* get_huge_zero_page() will never allocate a new page here,
* since we already have a zero page to copy. It just takes a
* reference.
*/
zero_page = get_huge_zero_page();
- set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
+ set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
zero_page);
- BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */
ret = 0;
goto out_unlock;
}
@@ -1238,7 +1307,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
pmd, _pmd, 1))
update_mmu_cache_pmd(vma, addr, pmd);
}
- if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
+ if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
if (page->mapping && trylock_page(page)) {
lru_add_drain();
if (page->mapping)
@@ -1384,46 +1453,76 @@ out:
return 0;
}
-int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr)
+
{
spinlock_t *ptl;
- int ret = 0;
+ struct mm_struct *mm = tlb->mm;
+ int ret = 1;
- if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
struct page *page;
- pgtable_t pgtable;
pmd_t orig_pmd;
- /*
- * For architectures like ppc64 we look at deposited pgtable
- * when calling pmdp_huge_get_and_clear. So do the
- * pgtable_trans_huge_withdraw after finishing pmdp related
- * operations.
- */
- orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
- tlb->fullmm);
+
+ orig_pmd = pmdp_huge_get_and_clear(mm, addr, pmd);
+
+ /* No hugepage in swapcache */
+ page = pmd_page(orig_pmd);
+ VM_BUG_ON_PAGE(PageSwapCache(page), page);
+
+ orig_pmd = pmd_mkold(orig_pmd);
+ orig_pmd = pmd_mkclean(orig_pmd);
+
+ set_pmd_at(mm, addr, pmd, orig_pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
- pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
- if (is_huge_zero_pmd(orig_pmd)) {
- atomic_long_dec(&tlb->mm->nr_ptes);
- spin_unlock(ptl);
- put_huge_zero_page();
- } else {
- page = pmd_page(orig_pmd);
- page_remove_rmap(page);
- VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
- add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
- VM_BUG_ON_PAGE(!PageHead(page), page);
- atomic_long_dec(&tlb->mm->nr_ptes);
- spin_unlock(ptl);
- tlb_remove_page(tlb, page);
- }
- pte_free(tlb->mm, pgtable);
- ret = 1;
+ spin_unlock(ptl);
+ ret = 0;
}
+
return ret;
}
+int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ pmd_t *pmd, unsigned long addr)
+{
+ pmd_t orig_pmd;
+ spinlock_t *ptl;
+
+ if (__pmd_trans_huge_lock(pmd, vma, &ptl) != 1)
+ return 0;
+ /*
+ * For architectures like ppc64 we look at deposited pgtable
+ * when calling pmdp_huge_get_and_clear. So do the
+ * pgtable_trans_huge_withdraw after finishing pmdp related
+ * operations.
+ */
+ orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
+ tlb->fullmm);
+ tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+ if (vma_is_dax(vma)) {
+ spin_unlock(ptl);
+ if (is_huge_zero_pmd(orig_pmd))
+ put_huge_zero_page();
+ } else if (is_huge_zero_pmd(orig_pmd)) {
+ pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
+ atomic_long_dec(&tlb->mm->nr_ptes);
+ spin_unlock(ptl);
+ put_huge_zero_page();
+ } else {
+ struct page *page = pmd_page(orig_pmd);
+ page_remove_rmap(page);
+ VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
+ add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+ pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
+ atomic_long_dec(&tlb->mm->nr_ptes);
+ spin_unlock(ptl);
+ tlb_remove_page(tlb, page);
+ }
+ return 1;
+}
+
int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
unsigned long old_addr,
unsigned long new_addr, unsigned long old_end,
@@ -1599,6 +1698,11 @@ unlock:
return NULL;
}
+int pmd_freeable(pmd_t pmd)
+{
+ return !pmd_dirty(pmd);
+}
+
static int __split_huge_page_splitting(struct page *page,
struct vm_area_struct *vma,
unsigned long address)
@@ -1689,6 +1793,11 @@ static void __split_huge_page_refcount(struct page *page,
/* clear PageTail before overwriting first_page */
smp_wmb();
+ if (page_is_young(page))
+ set_page_young(page_tail);
+ if (page_is_idle(page))
+ set_page_idle(page_tail);
+
/*
* __split_huge_page_splitting() already set the
* splitting bit in all pmd that could map this
@@ -1705,7 +1814,7 @@ static void __split_huge_page_refcount(struct page *page,
*/
page_tail->_mapcount = page->_mapcount;
- BUG_ON(page_tail->mapping);
+ BUG_ON(page_tail->mapping != TAIL_MAPPING);
page_tail->mapping = page->mapping;
page_tail->index = page->index + i;
@@ -2133,7 +2242,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
_pte++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
- if (++none_or_zero <= khugepaged_max_ptes_none)
+ if (!userfaultfd_armed(vma) &&
+ ++none_or_zero <= khugepaged_max_ptes_none)
continue;
else
goto out;
@@ -2193,7 +2303,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
VM_BUG_ON_PAGE(PageLRU(page), page);
/* If there is no mapped pte young don't collapse the page */
- if (pte_young(pteval) || PageReferenced(page) ||
+ if (pte_young(pteval) ||
+ page_is_young(page) || PageReferenced(page) ||
mmu_notifier_test_young(vma->vm_mm, address))
referenced = true;
}
@@ -2586,7 +2697,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
_pte++, _address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
- if (++none_or_zero <= khugepaged_max_ptes_none)
+ if (!userfaultfd_armed(vma) &&
+ ++none_or_zero <= khugepaged_max_ptes_none)
continue;
else
goto out_unmap;
@@ -2619,7 +2731,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
*/
if (page_count(page) != 1 + !!PageSwapCache(page))
goto out_unmap;
- if (pte_young(pteval) || PageReferenced(page) ||
+ if (pte_young(pteval) ||
+ page_is_young(page) || PageReferenced(page) ||
mmu_notifier_test_young(vma->vm_mm, address))
referenced = true;
}
@@ -2882,7 +2995,7 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmd)
{
spinlock_t *ptl;
- struct page *page;
+ struct page *page = NULL;
struct mm_struct *mm = vma->vm_mm;
unsigned long haddr = address & HPAGE_PMD_MASK;
unsigned long mmun_start; /* For mmu_notifiers */
@@ -2895,25 +3008,27 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
again:
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
ptl = pmd_lock(mm, pmd);
- if (unlikely(!pmd_trans_huge(*pmd))) {
- spin_unlock(ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
- return;
- }
- if (is_huge_zero_pmd(*pmd)) {
+ if (unlikely(!pmd_trans_huge(*pmd)))
+ goto unlock;
+ if (vma_is_dax(vma)) {
+ pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+ if (is_huge_zero_pmd(_pmd))
+ put_huge_zero_page();
+ } else if (is_huge_zero_pmd(*pmd)) {
__split_huge_zero_page_pmd(vma, haddr, pmd);
- spin_unlock(ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
- return;
+ } else {
+ page = pmd_page(*pmd);
+ VM_BUG_ON_PAGE(!page_count(page), page);
+ get_page(page);
}
- page = pmd_page(*pmd);
- VM_BUG_ON_PAGE(!page_count(page), page);
- get_page(page);
+ unlock:
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
- split_huge_page(page);
+ if (!page)
+ return;
+ split_huge_page(page);
put_page(page);
/*
@@ -2962,7 +3077,7 @@ static void split_huge_page_address(struct mm_struct *mm,
split_huge_page_pmd_mm(mm, address, pmd);
}
-void __vma_adjust_trans_huge(struct vm_area_struct *vma,
+void vma_adjust_trans_huge(struct vm_area_struct *vma,
unsigned long start,
unsigned long end,
long adjust_next)
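
The handle_userfault(..., VM_UFFD_MISSING) hand-offs added above only make sense together with a user-space monitor on the other end of a userfaultfd. A shortened, single-threaded sketch of that side (error handling mostly omitted; __NR_userfaultfd and <linux/userfaultfd.h> come from this series' uapi additions):

#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static void serve_one_fault(int uffd, size_t page_size, void *src_page)
{
	struct uffd_msg msg;
	struct uffdio_copy copy;

	if (read(uffd, &msg, sizeof(msg)) != sizeof(msg) ||
	    msg.event != UFFD_EVENT_PAGEFAULT)
		return;

	copy.dst = msg.arg.pagefault.address & ~((__u64)page_size - 1);
	copy.src = (unsigned long)src_page;
	copy.len = page_size;
	copy.mode = 0;
	ioctl(uffd, UFFDIO_COPY, &copy);	/* also wakes the faulting thread */
}

int main(void)
{
	size_t page_size = sysconf(_SC_PAGESIZE);
	size_t len = 2UL << 20;			/* one THP-sized area */
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	struct uffdio_api api = { .api = UFFD_API };
	struct uffdio_register reg = { .mode = UFFDIO_REGISTER_MODE_MISSING };
	void *area, *src_page;

	if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api))
		return 1;

	area = mmap(NULL, len, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	src_page = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	reg.range.start = (unsigned long)area;
	reg.range.len = len;
	if (ioctl(uffd, UFFDIO_REGISTER, &reg))
		return 1;

	/*
	 * From here on, a first touch of 'area' (4k or THP) is reported
	 * through read() on uffd instead of being resolved by the kernel,
	 * which is exactly what the VM_UFFD_MISSING paths above arrange.
	 */
	serve_one_fault(uffd, page_size, src_page);
	return 0;
}
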
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a8c3087089d8..586aa69df900 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -64,7 +64,7 @@ DEFINE_SPINLOCK(hugetlb_lock);
* prevent spurious OOMs when the hugepage pool is fully utilized.
*/
static int num_fault_mutexes;
-static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;
+struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);
@@ -240,11 +240,14 @@ struct file_region {
/*
* Add the huge page range represented by [f, t) to the reserve
- * map. Existing regions will be expanded to accommodate the
- * specified range. We know only existing regions need to be
- * expanded, because region_add is only called after region_chg
- * with the same range. If a new file_region structure must
- * be allocated, it is done in region_chg.
+ * map. In the normal case, existing regions will be expanded
+ * to accommodate the specified range. Sufficient regions should
+ * exist for expansion due to the previous call to region_chg
+ * with the same range. However, it is possible that region_del
+ * could have been called after region_chg and modified the map
+ * in such a way that no region exists to be expanded. In this
+ * case, pull a region descriptor from the cache associated with
+ * the map and use that for the new range.
*
* Return the number of new huge pages added to the map. This
* number is greater than or equal to zero.
@@ -261,6 +264,28 @@ static long region_add(struct resv_map *resv, long f, long t)
if (f <= rg->to)
break;
+ /*
+ * If no region exists which can be expanded to include the
+ * specified range, the list must have been modified by an
+ * interleaving call to region_del(). Pull a region descriptor
+ * from the cache and use it for this range.
+ */
+ if (&rg->link == head || t < rg->from) {
+ VM_BUG_ON(resv->region_cache_count <= 0);
+
+ resv->region_cache_count--;
+ nrg = list_first_entry(&resv->region_cache, struct file_region,
+ link);
+ list_del(&nrg->link);
+
+ nrg->from = f;
+ nrg->to = t;
+ list_add(&nrg->link, rg->link.prev);
+
+ add += t - f;
+ goto out_locked;
+ }
+
/* Round our left edge to the current segment if it encloses us. */
if (f > rg->from)
f = rg->from;
@@ -294,6 +319,8 @@ static long region_add(struct resv_map *resv, long f, long t)
add += t - nrg->to; /* Added to end of region */
nrg->to = t;
+out_locked:
+ resv->adds_in_progress--;
spin_unlock(&resv->lock);
VM_BUG_ON(add < 0);
return add;
@@ -312,11 +339,14 @@ static long region_add(struct resv_map *resv, long f, long t)
* so that the subsequent region_add call will have all the
* regions it needs and will not fail.
*
- * Returns the number of huge pages that need to be added
- * to the existing reservation map for the range [f, t).
- * This number is greater or equal to zero. -ENOMEM is
- * returned if a new file_region structure is needed and can
- * not be allocated.
+ * Upon entry, region_chg will also examine the cache of region descriptors
+ * associated with the map. If there are not enough descriptors cached, one
+ * will be allocated for the in progress add operation.
+ *
+ * Returns the number of huge pages that need to be added to the existing
+ * reservation map for the range [f, t). This number is greater or equal to
+ * zero. -ENOMEM is returned if a new file_region structure or cache entry
+ * is needed and cannot be allocated.
*/
static long region_chg(struct resv_map *resv, long f, long t)
{
@@ -326,6 +356,31 @@ static long region_chg(struct resv_map *resv, long f, long t)
retry:
spin_lock(&resv->lock);
+retry_locked:
+ resv->adds_in_progress++;
+
+ /*
+ * Check for sufficient descriptors in the cache to accommodate
+ * the number of in progress add operations.
+ */
+ if (resv->adds_in_progress > resv->region_cache_count) {
+ struct file_region *trg;
+
+ VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1);
+ /* Must drop lock to allocate a new descriptor. */
+ resv->adds_in_progress--;
+ spin_unlock(&resv->lock);
+
+ trg = kmalloc(sizeof(*trg), GFP_KERNEL);
+ if (!trg)
+ return -ENOMEM;
+
+ spin_lock(&resv->lock);
+ list_add(&trg->link, &resv->region_cache);
+ resv->region_cache_count++;
+ goto retry_locked;
+ }
+
/* Locate the region we are before or in. */
list_for_each_entry(rg, head, link)
if (f <= rg->to)
@@ -336,6 +391,7 @@ retry:
* size such that we can guarantee to record the reservation. */
if (&rg->link == head || t < rg->from) {
if (!nrg) {
+ resv->adds_in_progress--;
spin_unlock(&resv->lock);
nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
if (!nrg)
@@ -385,43 +441,131 @@ out_nrg:
}
/*
- * Truncate the reserve map at index 'end'. Modify/truncate any
- * region which contains end. Delete any regions past end.
- * Return the number of huge pages removed from the map.
+ * Abort the in progress add operation. The adds_in_progress field
+ * of the resv_map keeps track of the operations in progress between
+ * calls to region_chg and region_add. Operations are sometimes
+ * aborted after the call to region_chg. In such cases, region_abort
+ * is called to decrement the adds_in_progress counter.
+ *
+ * NOTE: The range arguments [f, t) are not needed or used in this
+ * routine. They are kept to make reading the calling code easier as
+ * arguments will match the associated region_chg call.
+ */
+static void region_abort(struct resv_map *resv, long f, long t)
+{
+ spin_lock(&resv->lock);
+ VM_BUG_ON(!resv->region_cache_count);
+ resv->adds_in_progress--;
+ spin_unlock(&resv->lock);
+}
+
+/*
+ * Delete the specified range [f, t) from the reserve map. If the
+ * t parameter is LONG_MAX, this indicates that ALL regions after f
+ * should be deleted. Locate the regions which intersect [f, t)
+ * and either trim, delete or split the existing regions.
+ *
+ * Returns the number of huge pages deleted from the reserve map.
+ * In the normal case, the return value is zero or more. In the
+ * case where a region must be split, a new region descriptor must
+ * be allocated. If the allocation fails, -ENOMEM will be returned.
+ * NOTE: If the parameter t == LONG_MAX, then we will never split
+ * a region and possibly return -ENOMEM. Callers specifying
+ * t == LONG_MAX do not need to check for -ENOMEM error.
*/
-static long region_truncate(struct resv_map *resv, long end)
+static long region_del(struct resv_map *resv, long f, long t)
{
struct list_head *head = &resv->regions;
struct file_region *rg, *trg;
- long chg = 0;
+ struct file_region *nrg = NULL;
+ long del = 0;
+retry:
spin_lock(&resv->lock);
- /* Locate the region we are either in or before. */
- list_for_each_entry(rg, head, link)
- if (end <= rg->to)
+ list_for_each_entry_safe(rg, trg, head, link) {
+ if (rg->to <= f)
+ continue;
+ if (rg->from >= t)
break;
- if (&rg->link == head)
- goto out;
- /* If we are in the middle of a region then adjust it. */
- if (end > rg->from) {
- chg = rg->to - end;
- rg->to = end;
- rg = list_entry(rg->link.next, typeof(*rg), link);
- }
+ if (f > rg->from && t < rg->to) { /* Must split region */
+ /*
+ * Check for an entry in the cache before dropping
+ * lock and attempting allocation.
+ */
+ if (!nrg &&
+ resv->region_cache_count > resv->adds_in_progress) {
+ nrg = list_first_entry(&resv->region_cache,
+ struct file_region,
+ link);
+ list_del(&nrg->link);
+ resv->region_cache_count--;
+ }
- /* Drop any remaining regions. */
- list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
- if (&rg->link == head)
+ if (!nrg) {
+ spin_unlock(&resv->lock);
+ nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
+ if (!nrg)
+ return -ENOMEM;
+ goto retry;
+ }
+
+ del += t - f;
+
+ /* New entry for end of split region */
+ nrg->from = t;
+ nrg->to = rg->to;
+ INIT_LIST_HEAD(&nrg->link);
+
+ /* Original entry is trimmed */
+ rg->to = f;
+
+ list_add(&nrg->link, &rg->link);
+ nrg = NULL;
break;
- chg += rg->to - rg->from;
- list_del(&rg->link);
- kfree(rg);
+ }
+
+ if (f <= rg->from && t >= rg->to) { /* Remove entire region */
+ del += rg->to - rg->from;
+ list_del(&rg->link);
+ kfree(rg);
+ continue;
+ }
+
+ if (f <= rg->from) { /* Trim beginning of region */
+ del += t - rg->from;
+ rg->from = t;
+ } else { /* Trim end of region */
+ del += rg->to - f;
+ rg->to = f;
+ }
}
-out:
spin_unlock(&resv->lock);
- return chg;
+ kfree(nrg);
+ return del;
+}
+
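
A worked example of the interplay between the four primitives above, using illustrative huge-page indices; this is the "region_del() raced with an in-flight add" situation that the region_add() comment describes:

	initial reserve map:       [0, 10)
	region_chg(resv, 3, 4)     -> [3, 4) is already covered, returns 0;
	                              adds_in_progress goes to 1
	region_del(resv, 0, 10)    -> a racing hole punch removes the whole
	                              region and returns 10
	region_add(resv, 3, 4)     -> finds nothing left to expand, so it takes
	                              a descriptor from the region cache,
	                              inserts [3, 4) and drops adds_in_progress
	region_abort(resv, 3, 4)   -> what the caller issues instead of
	                              region_add() when its page allocation
	                              fails; it only drops adds_in_progress
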
+/*
+ * A rare out of memory error was encountered which prevented removal of
+ * the reserve map region for a page. The huge page itself was freed
+ * and removed from the page cache. This routine will adjust the subpool
+ * usage count, and the global reserve count if needed. By incrementing
+ * these counts, the reserve map entry which could not be deleted will
+ * appear as a "reserved" entry instead of simply dangling with incorrect
+ * counts.
+ */
+void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve)
+{
+ struct hugepage_subpool *spool = subpool_inode(inode);
+ long rsv_adjust;
+
+ rsv_adjust = hugepage_subpool_get_pages(spool, 1);
+ if (restore_reserve && rsv_adjust) {
+ struct hstate *h = hstate_inode(inode);
+
+ hugetlb_acct_memory(h, 1);
+ }
}
/*
@@ -544,22 +688,44 @@ static void set_vma_private_data(struct vm_area_struct *vma,
struct resv_map *resv_map_alloc(void)
{
struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
- if (!resv_map)
+ struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);
+
+ if (!resv_map || !rg) {
+ kfree(resv_map);
+ kfree(rg);
return NULL;
+ }
kref_init(&resv_map->refs);
spin_lock_init(&resv_map->lock);
INIT_LIST_HEAD(&resv_map->regions);
+ resv_map->adds_in_progress = 0;
+
+ INIT_LIST_HEAD(&resv_map->region_cache);
+ list_add(&rg->link, &resv_map->region_cache);
+ resv_map->region_cache_count = 1;
+
return resv_map;
}
void resv_map_release(struct kref *ref)
{
struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
+ struct list_head *head = &resv_map->region_cache;
+ struct file_region *rg, *trg;
/* Clear out any active regions before we release the map. */
- region_truncate(resv_map, 0);
+ region_del(resv_map, 0, LONG_MAX);
+
+ /* ... and any entries left in the cache */
+ list_for_each_entry_safe(rg, trg, head, link) {
+ list_del(&rg->link);
+ kfree(rg);
+ }
+
+ VM_BUG_ON(resv_map->adds_in_progress);
+
kfree(resv_map);
}
@@ -616,7 +782,7 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
}
/* Returns true if the VMA has associated reserve pages */
-static int vma_has_reserves(struct vm_area_struct *vma, long chg)
+static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
{
if (vma->vm_flags & VM_NORESERVE) {
/*
@@ -629,23 +795,34 @@ static int vma_has_reserves(struct vm_area_struct *vma, long chg)
* properly, so add work-around here.
*/
if (vma->vm_flags & VM_MAYSHARE && chg == 0)
- return 1;
+ return true;
else
- return 0;
+ return false;
}
/* Shared mappings always use reserves */
- if (vma->vm_flags & VM_MAYSHARE)
- return 1;
+ if (vma->vm_flags & VM_MAYSHARE) {
+ /*
+ * We know VM_NORESERVE is not set. Therefore, there SHOULD
+ * be a region map for all pages. The only situation where
+ * there is no region map is if a hole was punched via
+ * fallocate. In this case, there really are no reserves to
+ * use. This situation is indicated if chg != 0.
+ */
+ if (chg)
+ return false;
+ else
+ return true;
+ }
/*
* Only the process that called mmap() has reserves for
* private mappings.
*/
if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
- return 1;
+ return true;
- return 0;
+ return false;
}
static void enqueue_huge_page(struct hstate *h, struct page *page)
@@ -1473,16 +1650,19 @@ static void return_unused_surplus_pages(struct hstate *h,
}
}
+
/*
- * vma_needs_reservation and vma_commit_reservation are used by the huge
- * page allocation routines to manage reservations.
+ * vma_needs_reservation, vma_commit_reservation and vma_end_reservation
+ * are used by the huge page allocation routines to manage reservations.
*
* vma_needs_reservation is called to determine if the huge page at addr
* within the vma has an associated reservation. If a reservation is
* needed, the value 1 is returned. The caller is then responsible for
* managing the global reservation and subpool usage counts. After
* the huge page has been allocated, vma_commit_reservation is called
- * to add the page to the reservation map.
+ * to add the page to the reservation map. If the page allocation fails,
+ * the reservation must be ended instead of committed. vma_end_reservation
+ * is called in such cases.
*
* In the normal case, vma_commit_reservation returns the same value
* as the preceding vma_needs_reservation call. The only time this
@@ -1490,9 +1670,14 @@ static void return_unused_surplus_pages(struct hstate *h,
* is the responsibility of the caller to notice the difference and
* take appropriate action.
*/
+enum vma_resv_mode {
+ VMA_NEEDS_RESV,
+ VMA_COMMIT_RESV,
+ VMA_END_RESV,
+};
static long __vma_reservation_common(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr,
- bool commit)
+ enum vma_resv_mode mode)
{
struct resv_map *resv;
pgoff_t idx;
@@ -1503,10 +1688,20 @@ static long __vma_reservation_common(struct hstate *h,
return 1;
idx = vma_hugecache_offset(h, vma, addr);
- if (commit)
- ret = region_add(resv, idx, idx + 1);
- else
+ switch (mode) {
+ case VMA_NEEDS_RESV:
ret = region_chg(resv, idx, idx + 1);
+ break;
+ case VMA_COMMIT_RESV:
+ ret = region_add(resv, idx, idx + 1);
+ break;
+ case VMA_END_RESV:
+ region_abort(resv, idx, idx + 1);
+ ret = 0;
+ break;
+ default:
+ BUG();
+ }
if (vma->vm_flags & VM_MAYSHARE)
return ret;
@@ -1517,47 +1712,79 @@ static long __vma_reservation_common(struct hstate *h,
static long vma_needs_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
- return __vma_reservation_common(h, vma, addr, false);
+ return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
}
static long vma_commit_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
- return __vma_reservation_common(h, vma, addr, true);
+ return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
}
-static struct page *alloc_huge_page(struct vm_area_struct *vma,
+static void vma_end_reservation(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
+}
+
+struct page *alloc_huge_page(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve)
{
struct hugepage_subpool *spool = subpool_vma(vma);
struct hstate *h = hstate_vma(vma);
struct page *page;
- long chg, commit;
+ long map_chg, map_commit;
+ long gbl_chg;
int ret, idx;
struct hugetlb_cgroup *h_cg;
idx = hstate_index(h);
/*
- * Processes that did not create the mapping will have no
- * reserves and will not have accounted against subpool
- * limit. Check that the subpool limit can be made before
- * satisfying the allocation MAP_NORESERVE mappings may also
- * need pages and subpool limit allocated allocated if no reserve
- * mapping overlaps.
+ * Examine the region/reserve map to determine if the process
+ * has a reservation for the page to be allocated. A return
+ * code of zero indicates a reservation exists (no change).
*/
- chg = vma_needs_reservation(h, vma, addr);
- if (chg < 0)
+ map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
+ if (map_chg < 0)
return ERR_PTR(-ENOMEM);
- if (chg || avoid_reserve)
- if (hugepage_subpool_get_pages(spool, 1) < 0)
+
+ /*
+ * Processes that did not create the mapping will have no
+ * reserves as indicated by the region/reserve map. Check
+ * that the allocation will not exceed the subpool limit.
+ * Allocations for MAP_NORESERVE mappings also need to be
+ * checked against any subpool limit.
+ */
+ if (map_chg || avoid_reserve) {
+ gbl_chg = hugepage_subpool_get_pages(spool, 1);
+ if (gbl_chg < 0) {
+ vma_end_reservation(h, vma, addr);
return ERR_PTR(-ENOSPC);
+ }
+
+ /*
+ * Even though there was no reservation in the region/reserve
+ * map, there could be reservations associated with the
+ * subpool that can be used. This would be indicated if the
+ * return value of hugepage_subpool_get_pages() is zero.
+ * However, if avoid_reserve is specified we still avoid even
+ * the subpool reservations.
+ */
+ if (avoid_reserve)
+ gbl_chg = 1;
+ }
ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
if (ret)
goto out_subpool_put;
spin_lock(&hugetlb_lock);
- page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg);
+ /*
+ * gbl_chg is passed to indicate whether or not a page must be taken
+ * from the global free pool (global change). gbl_chg == 0 indicates
+ * a reservation exists for the allocation.
+ */
+ page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
if (!page) {
spin_unlock(&hugetlb_lock);
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
@@ -1573,8 +1800,8 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
set_page_private(page, (unsigned long)spool);
- commit = vma_commit_reservation(h, vma, addr);
- if (unlikely(chg > commit)) {
+ map_commit = vma_commit_reservation(h, vma, addr);
+ if (unlikely(map_chg > map_commit)) {
/*
* The page was added to the reservation map between
* vma_needs_reservation and vma_commit_reservation.
@@ -1594,8 +1821,9 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
out_uncharge_cgroup:
hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
out_subpool_put:
- if (chg || avoid_reserve)
+ if (map_chg || avoid_reserve)
hugepage_subpool_put_pages(spool, 1);
+ vma_end_reservation(h, vma, addr);
return ERR_PTR(-ENOSPC);
}
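
The two-level accounting above distinguishes the reserve map (map_chg) from the subpool (gbl_chg); a summary of the cases, restating the code rather than adding to it:

	map_chg  avoid_reserve  hugepage_subpool_get_pages()   gbl_chg  dequeue behaviour
	0        no             not called                     0        may use a reserved page
	>0       no             returns 0 (subpool covers it)  0        may use a reserved page
	>0       no             returns 1                      1        must take a free page
	any      yes            called, charges the subpool    1        must take a free page
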
@@ -2311,7 +2539,7 @@ static void __exit hugetlb_exit(void)
}
kobject_put(hugepages_kobj);
- kfree(htlb_fault_mutex_table);
+ kfree(hugetlb_fault_mutex_table);
}
module_exit(hugetlb_exit);
@@ -2344,12 +2572,12 @@ static int __init hugetlb_init(void)
#else
num_fault_mutexes = 1;
#endif
- htlb_fault_mutex_table =
+ hugetlb_fault_mutex_table =
kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL);
- BUG_ON(!htlb_fault_mutex_table);
+ BUG_ON(!hugetlb_fault_mutex_table);
for (i = 0; i < num_fault_mutexes; i++)
- mutex_init(&htlb_fault_mutex_table[i]);
+ mutex_init(&hugetlb_fault_mutex_table[i]);
return 0;
}
module_init(hugetlb_init);
@@ -2562,6 +2790,30 @@ void hugetlb_show_meminfo(void)
1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
}
+void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
+{
+ int i;
+ unsigned long total_usage = 0;
+
+ for (i = 0; i < HUGE_MAX_HSTATE; i++) {
+ total_usage += atomic_long_read(&mm->hugetlb_usage.count[i]) *
+ (huge_page_size(&hstates[i]) >> 10);
+ }
+
+ seq_printf(m, "HugetlbPages:\t%8lu kB (", total_usage);
+ for (i = 0; i < HUGE_MAX_HSTATE; i++) {
+ if (huge_page_order(&hstates[i]) == 0)
+ break;
+ if (i > 0)
+ seq_puts(m, " ");
+
+ seq_printf(m, "%ld*%lukB",
+ atomic_long_read(&mm->hugetlb_usage.count[i]),
+ huge_page_size(&hstates[i]) >> 10);
+ }
+ seq_puts(m, ")\n");
+}
+
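
For a task holding two 2 MB huge pages on a system with only the default 2 MB hstate configured, the helper above emits a per-task status line of roughly this shape (total first, then the per-size breakdown in parentheses):

	HugetlbPages:	    4096 kB (2*2048kB)
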
/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
@@ -2797,6 +3049,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
get_page(ptepage);
page_dup_rmap(ptepage);
set_huge_pte_at(dst, addr, dst_pte, entry);
+ inc_hugetlb_count(dst, h);
}
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
@@ -2877,6 +3130,7 @@ again:
if (huge_pte_dirty(pte))
set_page_dirty(page);
+ dec_hugetlb_count(mm, h);
page_remove_rmap(page);
force_flush = !__tlb_remove_page(tlb, page);
if (force_flush) {
@@ -3147,6 +3401,23 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
return page != NULL;
}
+int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
+ pgoff_t idx)
+{
+ struct inode *inode = mapping->host;
+ struct hstate *h = hstate_inode(inode);
+ int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+
+ if (err)
+ return err;
+ ClearPagePrivate(page);
+
+ spin_lock(&inode->i_lock);
+ inode->i_blocks += blocks_per_huge_page(h);
+ spin_unlock(&inode->i_lock);
+ return 0;
+}
+
static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct address_space *mapping, pgoff_t idx,
unsigned long address, pte_t *ptep, unsigned int flags)
@@ -3194,21 +3465,13 @@ retry:
set_page_huge_active(page);
if (vma->vm_flags & VM_MAYSHARE) {
- int err;
- struct inode *inode = mapping->host;
-
- err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+ int err = huge_add_to_page_cache(page, mapping, idx);
if (err) {
put_page(page);
if (err == -EEXIST)
goto retry;
goto out;
}
- ClearPagePrivate(page);
-
- spin_lock(&inode->i_lock);
- inode->i_blocks += blocks_per_huge_page(h);
- spin_unlock(&inode->i_lock);
} else {
lock_page(page);
if (unlikely(anon_vma_prepare(vma))) {
@@ -3236,11 +3499,14 @@ retry:
* any allocations necessary to record that reservation occur outside
* the spinlock.
*/
- if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
+ if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
if (vma_needs_reservation(h, vma, address) < 0) {
ret = VM_FAULT_OOM;
goto backout_unlocked;
}
+ /* Just decrements count, does not deallocate */
+ vma_end_reservation(h, vma, address);
+ }
ptl = huge_pte_lockptr(h, mm, ptep);
spin_lock(ptl);
@@ -3261,6 +3527,7 @@ retry:
&& (vma->vm_flags & VM_SHARED)));
set_huge_pte_at(mm, address, ptep, new_pte);
+ inc_hugetlb_count(mm, h);
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
/* Optimization, do the COW without a second fault */
ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl);
@@ -3280,7 +3547,7 @@ backout_unlocked:
}
#ifdef CONFIG_SMP
-static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
struct vm_area_struct *vma,
struct address_space *mapping,
pgoff_t idx, unsigned long address)
@@ -3305,7 +3572,7 @@ static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
* For uniprocessor systems we always use a single mutex, so just
* return 0 and avoid the hashing overhead.
*/
-static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
struct vm_area_struct *vma,
struct address_space *mapping,
pgoff_t idx, unsigned long address)
@@ -3353,8 +3620,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
- hash = fault_mutex_hash(h, mm, vma, mapping, idx, address);
- mutex_lock(&htlb_fault_mutex_table[hash]);
+ hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
entry = huge_ptep_get(ptep);
if (huge_pte_none(entry)) {
@@ -3387,6 +3654,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
ret = VM_FAULT_OOM;
goto out_mutex;
}
+ /* Just decrements count, does not deallocate */
+ vma_end_reservation(h, vma, address);
if (!(vma->vm_flags & VM_MAYSHARE))
pagecache_page = hugetlbfs_pagecache_page(h,
@@ -3437,7 +3706,7 @@ out_ptl:
put_page(pagecache_page);
}
out_mutex:
- mutex_unlock(&htlb_fault_mutex_table[hash]);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
/*
* Generally it's safe to hold refcount during waiting page lock. But
* here we just wait to defer the next page fault to avoid busy loop and
@@ -3726,12 +3995,15 @@ int hugetlb_reserve_pages(struct inode *inode,
}
return 0;
out_err:
+ if (!vma || vma->vm_flags & VM_MAYSHARE)
+ region_abort(resv_map, from, to);
if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
kref_put(&resv_map->refs, resv_map_release);
return ret;
}
-void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
+long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
+ long freed)
{
struct hstate *h = hstate_inode(inode);
struct resv_map *resv_map = inode_resv_map(inode);
@@ -3739,8 +4011,17 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
struct hugepage_subpool *spool = subpool_inode(inode);
long gbl_reserve;
- if (resv_map)
- chg = region_truncate(resv_map, offset);
+ if (resv_map) {
+ chg = region_del(resv_map, start, end);
+ /*
+ * region_del() can fail in the rare case where a region
+ * must be split and another region descriptor cannot be
+ * allocated. If end == LONG_MAX, it will not fail.
+ */
+ if (chg < 0)
+ return chg;
+ }
+
spin_lock(&inode->i_lock);
inode->i_blocks -= (blocks_per_huge_page(h) * freed);
spin_unlock(&inode->i_lock);
@@ -3751,6 +4032,8 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
*/
gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
hugetlb_acct_memory(h, -gbl_reserve);
+
+ return 0;
}
#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
@@ -3764,8 +4047,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
unsigned long s_end = sbase + PUD_SIZE;
/* Allow segments to share if only one is marked locked */
- unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
- unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
+ unsigned long vm_flags = vma->vm_flags & ~(VM_LOCKED|VM_LOCKONFAULT);
+ unsigned long svm_flags = svma->vm_flags & ~(VM_LOCKED|VM_LOCKONFAULT);
/*
* match the virtual addresses, permission and the alignment of the
@@ -3779,7 +4062,7 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
return saddr;
}
-static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
+static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
{
unsigned long base = addr & PUD_MASK;
unsigned long end = base + PUD_SIZE;
@@ -3789,8 +4072,8 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
*/
if (vma->vm_flags & VM_MAYSHARE &&
vma->vm_start <= base && end <= vma->vm_end)
- return 1;
- return 0;
+ return true;
+ return false;
}
/*
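
Renaming the fault mutex table and hash to hugetlb_* and dropping the static qualifiers lets code outside this file serialise against page faults on the same file offset. A hedged sketch of that pattern; the declarations are assumed to be exported through <linux/hugetlb.h> elsewhere in the series.

#include <linux/hugetlb.h>
#include <linux/mutex.h>

static void demo_serialise_index(struct hstate *h, struct mm_struct *mm,
				 struct vm_area_struct *vma,
				 struct address_space *mapping,
				 pgoff_t idx, unsigned long address)
{
	u32 hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address);

	mutex_lock(&hugetlb_fault_mutex_table[hash]);
	/* ... hole punch or page cache work for 'idx' goes here ... */
	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
}
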
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index bf73ac17dad4..9d26fd9fefe4 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -45,12 +45,9 @@ static int hwpoison_inject(void *data, u64 val)
/*
* do a racy check with elevated page count, to make sure PG_hwpoison
* will only be set for the targeted owner (or on a free page).
- * We temporarily take page lock for try_get_mem_cgroup_from_page().
* memory_failure() will redo the check reliably inside page lock.
*/
- lock_page(hpage);
err = hwpoison_filter(hpage);
- unlock_page(hpage);
if (err)
goto put_out;
@@ -58,7 +55,7 @@ inject:
pr_info("Injecting memory failure at pfn %#lx\n", pfn);
return memory_failure(pfn, 18, MF_COUNT_INCREASED);
put_out:
- put_page(p);
+ put_hwpoison_page(p);
return 0;
}
@@ -126,7 +123,7 @@ static int pfn_inject_init(void)
if (!dentry)
goto fail;
-#ifdef CONFIG_MEMCG_SWAP
+#ifdef CONFIG_MEMCG
dentry = debugfs_create_u64("corrupt-filter-memcg", 0600,
hwpoison_dir, &hwpoison_filter_memcg);
if (!dentry)
diff --git a/mm/internal.h b/mm/internal.h
index 36b23f1e2ca6..1195dd2d6a2b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -426,4 +426,19 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
#define ALLOC_FAIR 0x100 /* fair zone allocation */
+enum ttu_flags;
+struct tlbflush_unmap_batch;
+
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+void try_to_unmap_flush(void);
+void try_to_unmap_flush_dirty(void);
+#else
+static inline void try_to_unmap_flush(void)
+{
+}
+static inline void try_to_unmap_flush_dirty(void)
+{
+}
+
+#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
#endif /* __MM_INTERNAL_H */
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index cf79f110157c..b034c620957f 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -302,23 +302,14 @@ static void hex_dump_object(struct seq_file *seq,
struct kmemleak_object *object)
{
const u8 *ptr = (const u8 *)object->pointer;
- int i, len, remaining;
- unsigned char linebuf[HEX_ROW_SIZE * 5];
+ size_t len;
/* limit the number of lines to HEX_MAX_LINES */
- remaining = len =
- min(object->size, (size_t)(HEX_MAX_LINES * HEX_ROW_SIZE));
-
- seq_printf(seq, " hex dump (first %d bytes):\n", len);
- for (i = 0; i < len; i += HEX_ROW_SIZE) {
- int linelen = min(remaining, HEX_ROW_SIZE);
-
- remaining -= HEX_ROW_SIZE;
- hex_dump_to_buffer(ptr + i, linelen, HEX_ROW_SIZE,
- HEX_GROUP_SIZE, linebuf, sizeof(linebuf),
- HEX_ASCII);
- seq_printf(seq, " %s\n", linebuf);
- }
+ len = min_t(size_t, object->size, HEX_MAX_LINES * HEX_ROW_SIZE);
+
+ seq_printf(seq, " hex dump (first %zu bytes):\n", len);
+ seq_hex_dump(seq, " ", DUMP_PREFIX_NONE, HEX_ROW_SIZE,
+ HEX_GROUP_SIZE, ptr, len, HEX_ASCII);
}
/*
diff --git a/mm/ksm.c b/mm/ksm.c
index 7ee101eaacdf..bc7be0ee2080 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1884,7 +1884,7 @@ struct page *ksm_might_need_to_copy(struct page *page,
SetPageDirty(new_page);
__SetPageUptodate(new_page);
- __set_page_locked(new_page);
+ __SetPageLocked(new_page);
}
return new_page;
diff --git a/mm/madvise.c b/mm/madvise.c
index 64bb8a22110c..fa6479aca0c9 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -20,6 +20,14 @@
#include <linux/backing-dev.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/mmu_notifier.h>
+
+#include <asm/tlb.h>
+
+struct madvise_free_private {
+ struct vm_area_struct *vma;
+ struct mmu_gather *tlb;
+};
/*
* Any behaviour which results in changes to the vma->vm_flags needs to
@@ -32,6 +40,7 @@ static int madvise_need_mmap_write(int behavior)
case MADV_REMOVE:
case MADV_WILLNEED:
case MADV_DONTNEED:
+ case MADV_FREE:
return 0;
default:
/* be safe, default to 1. list exceptions explicitly */
@@ -103,7 +112,8 @@ static long madvise_behavior(struct vm_area_struct *vma,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
- vma->vm_file, pgoff, vma_policy(vma));
+ vma->vm_file, pgoff, vma_policy(vma),
+ vma->vm_userfaultfd_ctx);
if (*prev) {
vma = *prev;
goto success;
@@ -255,6 +265,164 @@ static long madvise_willneed(struct vm_area_struct *vma,
return 0;
}
+static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+
+{
+ struct madvise_free_private *fp = walk->private;
+ struct mmu_gather *tlb = fp->tlb;
+ struct mm_struct *mm = tlb->mm;
+ struct vm_area_struct *vma = fp->vma;
+ spinlock_t *ptl;
+ pte_t *pte, ptent;
+ struct page *page;
+ swp_entry_t entry;
+ unsigned long next;
+ int nr_swap = 0;
+
+ next = pmd_addr_end(addr, end);
+ if (pmd_trans_huge(*pmd)) {
+ if (next - addr != HPAGE_PMD_SIZE)
+ split_huge_page_pmd(vma, addr, pmd);
+ else if (!madvise_free_huge_pmd(tlb, vma, pmd, addr))
+ goto next;
+ /* fall through */
+ }
+
+ if (pmd_trans_unstable(pmd))
+ return 0;
+
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ arch_enter_lazy_mmu_mode();
+ for (; addr != end; pte++, addr += PAGE_SIZE) {
+ ptent = *pte;
+
+ if (pte_none(ptent))
+ continue;
+ /*
+ * If the pte holds a swap entry, just clear the page table
+ * entry to prevent swap-in, which is more expensive than
+ * (page allocation + zeroing).
+ */
+ if (!pte_present(ptent)) {
+ entry = pte_to_swp_entry(ptent);
+ if (non_swap_entry(entry))
+ continue;
+ nr_swap--;
+ free_swap_and_cache(entry);
+ pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ continue;
+ }
+
+ page = vm_normal_page(vma, addr, ptent);
+ if (!page)
+ continue;
+
+ if (PageSwapCache(page)) {
+ if (!trylock_page(page))
+ continue;
+
+ if (!try_to_free_swap(page)) {
+ unlock_page(page);
+ continue;
+ }
+
+ ClearPageDirty(page);
+ unlock_page(page);
+ }
+
+ /*
+ * Some architectures (e.g., PPC) don't update the TLB
+ * with set_pte_at and tlb_remove_tlb_entry, so for
+ * portability, remap the pte as old|clean after
+ * clearing it.
+ */
+ ptent = ptep_get_and_clear_full(mm, addr, pte,
+ tlb->fullmm);
+ ptent = pte_mkold(ptent);
+ ptent = pte_mkclean(ptent);
+ set_pte_at(mm, addr, pte, ptent);
+ if (PageActive(page))
+ deactivate_page(page);
+ tlb_remove_tlb_entry(tlb, pte, addr);
+ }
+
+ if (nr_swap) {
+ if (current->mm == mm)
+ sync_mm_rss(mm);
+
+ add_mm_counter(mm, MM_SWAPENTS, nr_swap);
+ }
+
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(pte - 1, ptl);
+next:
+ cond_resched();
+ return 0;
+}
+
+static void madvise_free_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ struct madvise_free_private fp = {
+ .vma = vma,
+ .tlb = tlb,
+ };
+
+ struct mm_walk free_walk = {
+ .pmd_entry = madvise_free_pte_range,
+ .mm = vma->vm_mm,
+ .private = &fp,
+ };
+
+ BUG_ON(addr >= end);
+ tlb_start_vma(tlb, vma);
+ walk_page_range(addr, end, &free_walk);
+ tlb_end_vma(tlb, vma);
+}
+
+static int madvise_free_single_vma(struct vm_area_struct *vma,
+ unsigned long start_addr, unsigned long end_addr)
+{
+ unsigned long start, end;
+ struct mm_struct *mm = vma->vm_mm;
+ struct mmu_gather tlb;
+
+ if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
+ return -EINVAL;
+
+ /* MADV_FREE works only on anonymous vmas at the moment */
+ if (vma->vm_file)
+ return -EINVAL;
+
+ start = max(vma->vm_start, start_addr);
+ if (start >= vma->vm_end)
+ return -EINVAL;
+ end = min(vma->vm_end, end_addr);
+ if (end <= vma->vm_start)
+ return -EINVAL;
+
+ lru_add_drain();
+ tlb_gather_mmu(&tlb, mm, start, end);
+ update_hiwater_rss(mm);
+
+ mmu_notifier_invalidate_range_start(mm, start, end);
+ madvise_free_page_range(&tlb, vma, start, end);
+ mmu_notifier_invalidate_range_end(mm, start, end);
+ tlb_finish_mmu(&tlb, start, end);
+
+ return 0;
+}
+
+static long madvise_free(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end)
+{
+ *prev = vma;
+ return madvise_free_single_vma(vma, start, end);
+}
+
/*
* Application no longer needs these pages. If the pages are dirty,
* it's OK to just throw them away. The app will be more careful about
@@ -300,7 +468,7 @@ static long madvise_remove(struct vm_area_struct *vma,
*prev = NULL; /* tell sys_madvise we drop mmap_sem */
- if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB))
+ if (vma->vm_flags & VM_LOCKED)
return -EINVAL;
f = vma->vm_file;
@@ -378,6 +546,14 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
return madvise_remove(vma, prev, start, end);
case MADV_WILLNEED:
return madvise_willneed(vma, prev, start, end);
+ case MADV_FREE:
+ /*
+ * XXX: In this implementation, MADV_FREE works like
+ * MADV_DONTNEED on a swapless system or when swap is full.
+ */
+ if (get_nr_swap_pages() > 0)
+ return madvise_free(vma, prev, start, end);
+ /* passthrough */
case MADV_DONTNEED:
return madvise_dontneed(vma, prev, start, end);
default:
@@ -385,7 +561,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
}
}
-static int
+static bool
madvise_behavior_valid(int behavior)
{
switch (behavior) {
@@ -397,6 +573,7 @@ madvise_behavior_valid(int behavior)
case MADV_REMOVE:
case MADV_WILLNEED:
case MADV_DONTNEED:
+ case MADV_FREE:
#ifdef CONFIG_KSM
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
@@ -407,10 +584,10 @@ madvise_behavior_valid(int behavior)
#endif
case MADV_DONTDUMP:
case MADV_DODUMP:
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
}
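
From user space the new hint looks like the sketch below; MADV_FREE itself must come from the patched uapi headers, so the fallback define is an assumption rather than settled ABI.

#include <string.h>
#include <sys/mman.h>

#ifndef MADV_FREE
#define MADV_FREE 8	/* assumed value; take the real one from the patched <asm/mman.h> */
#endif

int main(void)
{
	size_t len = 1UL << 20;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;

	memset(buf, 0x5a, len);			/* dirty the range */

	/*
	 * Unlike MADV_DONTNEED, the contents stay valid until reclaim
	 * actually discards the pages, and rewriting a page cancels the
	 * hint for that page.  Per madvise_vma() above, a swapless
	 * system silently falls back to MADV_DONTNEED behaviour.
	 */
	if (madvise(buf, len, MADV_FREE))
		return 1;

	buf[0] = 1;				/* reuse; this page is dirty again */
	return 0;
}
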
diff --git a/mm/memblock.c b/mm/memblock.c
index 87108e77e476..69babe22eef7 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -91,7 +91,7 @@ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, p
return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
}
-static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
+bool __init_memblock memblock_overlaps_region(struct memblock_type *type,
phys_addr_t base, phys_addr_t size)
{
unsigned long i;
@@ -103,7 +103,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
break;
}
- return (i < type->cnt) ? i : -1;
+ return i < type->cnt;
}
/*
@@ -566,6 +566,10 @@ repeat:
* area, insert that portion.
*/
if (rbase > base) {
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+ WARN_ON(nid != memblock_get_region_node(rgn));
+#endif
+ WARN_ON(flags != rgn->flags);
nr_new++;
if (insert)
memblock_insert_region(type, i++, base,
@@ -611,14 +615,14 @@ static int __init_memblock memblock_add_region(phys_addr_t base,
int nid,
unsigned long flags)
{
- struct memblock_type *_rgn = &memblock.memory;
+ struct memblock_type *type = &memblock.memory;
memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n",
(unsigned long long)base,
(unsigned long long)base + size - 1,
flags, (void *)_RET_IP_);
- return memblock_add_range(_rgn, base, size, nid, flags);
+ return memblock_add_range(type, base, size, nid, flags);
}
int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
@@ -831,10 +835,10 @@ void __init_memblock __next_reserved_mem_region(u64 *idx,
phys_addr_t *out_start,
phys_addr_t *out_end)
{
- struct memblock_type *rsv = &memblock.reserved;
+ struct memblock_type *type = &memblock.reserved;
- if (*idx >= 0 && *idx < rsv->cnt) {
- struct memblock_region *r = &rsv->regions[*idx];
+ if (*idx >= 0 && *idx < type->cnt) {
+ struct memblock_region *r = &type->regions[*idx];
phys_addr_t base = r->base;
phys_addr_t size = r->size;
@@ -1562,12 +1566,12 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size
* Check if the region [@base, @base+@size) intersects a reserved memory block.
*
* RETURNS:
- * 0 if false, non-zero if true
+ * True if they intersect, false if not.
*/
-int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
+bool __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
{
memblock_cap_size(base, &size);
- return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
+ return memblock_overlaps_region(&memblock.reserved, base, size);
}
void __init_memblock memblock_trim_memory(phys_addr_t align)
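
With memblock_overlaps_region() and memblock_is_region_reserved() now returning bool, a caller reads naturally as a predicate; a small sketch with illustrative base/size values chosen by the caller.

#include <linux/init.h>
#include <linux/memblock.h>

static void __init demo_claim_range(phys_addr_t base, phys_addr_t size)
{
	/* True if [base, base + size) intersects any reserved region. */
	if (memblock_is_region_reserved(base, size))
		return;			/* firmware or an earlier user got here first */

	memblock_reserve(base, size);
}
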
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index acb93c554f6e..aacc767b23df 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -111,56 +111,10 @@ static const char * const mem_cgroup_lru_names[] = {
"unevictable",
};
-/*
- * Per memcg event counter is incremented at every pagein/pageout. With THP,
- * it will be incremated by the number of pages. This counter is used for
- * for trigger some periodic events. This is straightforward and better
- * than using jiffies etc. to handle periodic memcg event.
- */
-enum mem_cgroup_events_target {
- MEM_CGROUP_TARGET_THRESH,
- MEM_CGROUP_TARGET_SOFTLIMIT,
- MEM_CGROUP_TARGET_NUMAINFO,
- MEM_CGROUP_NTARGETS,
-};
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET 1024
-struct mem_cgroup_stat_cpu {
- long count[MEM_CGROUP_STAT_NSTATS];
- unsigned long events[MEMCG_NR_EVENTS];
- unsigned long nr_page_events;
- unsigned long targets[MEM_CGROUP_NTARGETS];
-};
-
-struct reclaim_iter {
- struct mem_cgroup *position;
- /* scan generation, increased every round-trip */
- unsigned int generation;
-};
-
-/*
- * per-zone information in memory controller.
- */
-struct mem_cgroup_per_zone {
- struct lruvec lruvec;
- unsigned long lru_size[NR_LRU_LISTS];
-
- struct reclaim_iter iter[DEF_PRIORITY + 1];
-
- struct rb_node tree_node; /* RB tree node */
- unsigned long usage_in_excess;/* Set to the value by which */
- /* the soft limit is exceeded*/
- bool on_tree;
- struct mem_cgroup *memcg; /* Back pointer, we cannot */
- /* use container_of */
-};
-
-struct mem_cgroup_per_node {
- struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
-};
-
/*
* Cgroups above their limits are maintained in a RB-Tree, independent of
* their hierarchy representation
@@ -181,32 +135,6 @@ struct mem_cgroup_tree {
static struct mem_cgroup_tree soft_limit_tree __read_mostly;
-struct mem_cgroup_threshold {
- struct eventfd_ctx *eventfd;
- unsigned long threshold;
-};
-
-/* For threshold */
-struct mem_cgroup_threshold_ary {
- /* An array index points to threshold just below or equal to usage. */
- int current_threshold;
- /* Size of entries[] */
- unsigned int size;
- /* Array of thresholds */
- struct mem_cgroup_threshold entries[0];
-};
-
-struct mem_cgroup_thresholds {
- /* Primary thresholds array */
- struct mem_cgroup_threshold_ary *primary;
- /*
- * Spare threshold array.
- * This is needed to make mem_cgroup_unregister_event() "never fail".
- * It must be able to store at least primary->size - 1 entries.
- */
- struct mem_cgroup_threshold_ary *spare;
-};
-
/* for OOM */
struct mem_cgroup_eventfd_list {
struct list_head list;
@@ -256,113 +184,6 @@ struct mem_cgroup_event {
static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
-/*
- * The memory controller data structure. The memory controller controls both
- * page cache and RSS per cgroup. We would eventually like to provide
- * statistics based on the statistics developed by Rik Van Riel for clock-pro,
- * to help the administrator determine what knobs to tune.
- */
-struct mem_cgroup {
- struct cgroup_subsys_state css;
-
- /* Accounted resources */
- struct page_counter memory;
- struct page_counter memsw;
- struct page_counter kmem;
-
- /* Normal memory consumption range */
- unsigned long low;
- unsigned long high;
-
- unsigned long soft_limit;
-
- /* vmpressure notifications */
- struct vmpressure vmpressure;
-
- /* css_online() has been completed */
- int initialized;
-
- /*
- * Should the accounting and control be hierarchical, per subtree?
- */
- bool use_hierarchy;
-
- /* protected by memcg_oom_lock */
- bool oom_lock;
- int under_oom;
-
- int swappiness;
- /* OOM-Killer disable */
- int oom_kill_disable;
-
- /* protect arrays of thresholds */
- struct mutex thresholds_lock;
-
- /* thresholds for memory usage. RCU-protected */
- struct mem_cgroup_thresholds thresholds;
-
- /* thresholds for mem+swap usage. RCU-protected */
- struct mem_cgroup_thresholds memsw_thresholds;
-
- /* For oom notifier event fd */
- struct list_head oom_notify;
-
- /*
- * Should we move charges of a task when a task is moved into this
- * mem_cgroup ? And what type of charges should we move ?
- */
- unsigned long move_charge_at_immigrate;
- /*
- * set > 0 if pages under this cgroup are moving to other cgroup.
- */
- atomic_t moving_account;
- /* taken only while moving_account > 0 */
- spinlock_t move_lock;
- struct task_struct *move_lock_task;
- unsigned long move_lock_flags;
- /*
- * percpu counter.
- */
- struct mem_cgroup_stat_cpu __percpu *stat;
- spinlock_t pcp_counter_lock;
-
-#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
- struct cg_proto tcp_mem;
-#endif
-#if defined(CONFIG_MEMCG_KMEM)
- /* Index in the kmem_cache->memcg_params.memcg_caches array */
- int kmemcg_id;
- bool kmem_acct_activated;
- bool kmem_acct_active;
-#endif
-
- int last_scanned_node;
-#if MAX_NUMNODES > 1
- nodemask_t scan_nodes;
- atomic_t numainfo_events;
- atomic_t numainfo_updating;
-#endif
-
-#ifdef CONFIG_CGROUP_WRITEBACK
- struct list_head cgwb_list;
- struct wb_domain cgwb_domain;
-#endif
-
- /* List of events which userspace want to receive */
- struct list_head event_list;
- spinlock_t event_list_lock;
-
- struct mem_cgroup_per_node *nodeinfo[0];
- /* WARNING: nodeinfo must be the last member here */
-};
-
-#ifdef CONFIG_MEMCG_KMEM
-bool memcg_kmem_is_active(struct mem_cgroup *memcg)
-{
- return memcg->kmem_acct_active;
-}
-#endif
-
/* Stuffs for move charges at task migration. */
/*
* Types of charges to be moved.
@@ -423,11 +244,6 @@ enum res_type {
*/
static DEFINE_MUTEX(memcg_create_mutex);
-struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
-{
- return s ? container_of(s, struct mem_cgroup, css) : NULL;
-}
-
/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
@@ -499,8 +315,7 @@ void sock_update_memcg(struct sock *sk)
rcu_read_lock();
memcg = mem_cgroup_from_task(current);
cg_proto = sk->sk_prot->proto_cgroup(memcg);
- if (!mem_cgroup_is_root(memcg) &&
- memcg_proto_active(cg_proto) &&
+ if (cg_proto && test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags) &&
css_tryget_online(&memcg->css)) {
sk->sk_cgrp = cg_proto;
}
@@ -593,11 +408,6 @@ mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
return &memcg->nodeinfo[nid]->zoneinfo[zid];
}
-struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
-{
- return &memcg->css;
-}
-
/**
* mem_cgroup_css_from_page - css of the memcg associated with a page
* @page: page of interest
@@ -631,6 +441,34 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
return &memcg->css;
}
+/**
+ * page_cgroup_ino - return inode number of the memcg a page is charged to
+ * @page: the page
+ *
+ * Look up the closest online ancestor of the memory cgroup @page is charged to
+ * and return its inode number or 0 if @page is not charged to any cgroup. It
+ * is safe to call this function without holding a reference to @page.
+ *
+ * Note, this function is inherently racy, because there is nothing to prevent
+ * the cgroup inode from getting torn down and potentially reallocated a moment
+ * after page_cgroup_ino() returns, so it should only be used by callers that
+ * do not care (such as procfs interfaces).
+ */
+ino_t page_cgroup_ino(struct page *page)
+{
+ struct mem_cgroup *memcg;
+ unsigned long ino = 0;
+
+ rcu_read_lock();
+ memcg = READ_ONCE(page->mem_cgroup);
+ while (memcg && !(memcg->css.flags & CSS_ONLINE))
+ memcg = parent_mem_cgroup(memcg);
+ if (memcg)
+ ino = cgroup_ino(memcg->css.cgroup);
+ rcu_read_unlock();
+ return ino;
+}
+
static struct mem_cgroup_per_zone *
mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
{
@@ -876,14 +714,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
}
-unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
-{
- struct mem_cgroup_per_zone *mz;
-
- mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
- return mz->lru_size[lru];
-}
-
static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
int nid,
unsigned int lru_mask)
@@ -986,6 +816,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
+EXPORT_SYMBOL(mem_cgroup_from_task);
static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
@@ -1031,7 +862,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
struct mem_cgroup_reclaim_cookie *reclaim)
{
- struct reclaim_iter *uninitialized_var(iter);
+ struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
struct cgroup_subsys_state *css = NULL;
struct mem_cgroup *memcg = NULL;
struct mem_cgroup *pos = NULL;
@@ -1173,30 +1004,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
-void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
-{
- struct mem_cgroup *memcg;
-
- rcu_read_lock();
- memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
- if (unlikely(!memcg))
- goto out;
-
- switch (idx) {
- case PGFAULT:
- this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
- break;
- case PGMAJFAULT:
- this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
- break;
- default:
- BUG();
- }
-out:
- rcu_read_unlock();
-}
-EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
-
/**
* mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
* @zone: zone of the wanted lruvec
@@ -1295,15 +1102,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
VM_BUG_ON((long)(*lru_size) < 0);
}
-bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root)
-{
- if (root == memcg)
- return true;
- if (!root->use_hierarchy)
- return false;
- return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
-}
-
bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
{
struct mem_cgroup *task_memcg;
@@ -1330,39 +1128,6 @@ bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
return ret;
}
-int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
-{
- unsigned long inactive_ratio;
- unsigned long inactive;
- unsigned long active;
- unsigned long gb;
-
- inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
- active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
-
- gb = (inactive + active) >> (30 - PAGE_SHIFT);
- if (gb)
- inactive_ratio = int_sqrt(10 * gb);
- else
- inactive_ratio = 1;
-
- return inactive * inactive_ratio < active;
-}
-
-bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
-{
- struct mem_cgroup_per_zone *mz;
- struct mem_cgroup *memcg;
-
- if (mem_cgroup_disabled())
- return true;
-
- mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
- memcg = mz->memcg;
-
- return !!(memcg->css.flags & CSS_ONLINE);
-}
-
#define mem_cgroup_from_counter(counter, member) \
container_of(counter, struct mem_cgroup, member)
@@ -1394,15 +1159,6 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
return margin;
}
-int mem_cgroup_swappiness(struct mem_cgroup *memcg)
-{
- /* root ? */
- if (mem_cgroup_disabled() || !memcg->css.parent)
- return vm_swappiness;
-
- return memcg->swappiness;
-}
-
/*
* A routine for checking "mem" is under move_account() or not.
*
@@ -1545,6 +1301,12 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
int order)
{
+ struct oom_control oc = {
+ .zonelist = NULL,
+ .nodemask = NULL,
+ .gfp_mask = gfp_mask,
+ .order = order,
+ };
struct mem_cgroup *iter;
unsigned long chosen_points = 0;
unsigned long totalpages;
@@ -1563,7 +1325,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
goto unlock;
}
- check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
+ check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg);
totalpages = mem_cgroup_get_limit(memcg) ? : 1;
for_each_mem_cgroup_tree(iter, memcg) {
struct css_task_iter it;
@@ -1571,8 +1333,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
css_task_iter_start(&iter->css, &it);
while ((task = css_task_iter_next(&it))) {
- switch (oom_scan_process_thread(task, totalpages, NULL,
- false)) {
+ switch (oom_scan_process_thread(&oc, task, totalpages)) {
case OOM_SCAN_SELECT:
if (chosen)
put_task_struct(chosen);
@@ -1610,8 +1371,8 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
if (chosen) {
points = chosen_points * 1000 / totalpages;
- oom_kill_process(chosen, gfp_mask, order, points, totalpages,
- memcg, NULL, "Memory cgroup out of memory");
+ oom_kill_process(&oc, chosen, points, totalpages, memcg,
+ "Memory cgroup out of memory");
}
unlock:
mutex_unlock(&oom_lock);
@@ -2062,23 +1823,6 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
}
EXPORT_SYMBOL(mem_cgroup_end_page_stat);
-/**
- * mem_cgroup_update_page_stat - update page state statistics
- * @memcg: memcg to account against
- * @idx: page state item to account
- * @val: number of pages (positive or negative)
- *
- * See mem_cgroup_begin_page_stat() for locking requirements.
- */
-void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
- enum mem_cgroup_stat_index idx, int val)
-{
- VM_BUG_ON(!rcu_read_lock_held());
-
- if (memcg)
- this_cpu_add(memcg->stat->count[idx], val);
-}
-
/*
* size of first charge trial. "32" comes from vmscan.c's magic value.
* TODO: maybe necessary to use big numbers in big irons.
@@ -2355,40 +2099,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
css_put_many(&memcg->css, nr_pages);
}
-/*
- * try_get_mem_cgroup_from_page - look up page's memcg association
- * @page: the page
- *
- * Look up, get a css reference, and return the memcg that owns @page.
- *
- * The page must be locked to prevent racing with swap-in and page
- * cache charges. If coming from an unlocked page table, the caller
- * must ensure the page is on the LRU or this can race with charging.
- */
-struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
-{
- struct mem_cgroup *memcg;
- unsigned short id;
- swp_entry_t ent;
-
- VM_BUG_ON_PAGE(!PageLocked(page), page);
-
- memcg = page->mem_cgroup;
- if (memcg) {
- if (!css_tryget_online(&memcg->css))
- memcg = NULL;
- } else if (PageSwapCache(page)) {
- ent.val = page_private(page);
- id = lookup_swap_cgroup_id(ent);
- rcu_read_lock();
- memcg = mem_cgroup_from_id(id);
- if (memcg && !css_tryget_online(&memcg->css))
- memcg = NULL;
- rcu_read_unlock();
- }
- return memcg;
-}
-
static void lock_page_lru(struct page *page, int *isolated)
{
struct zone *zone = page_zone(page);
@@ -2504,16 +2214,6 @@ void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
css_put_many(&memcg->css, nr_pages);
}
-/*
- * helper for acessing a memcg's index. It will be used as an index in the
- * child cache array in kmem_cache, and also to derive its name. This function
- * will return -1 when this is not a kmem-limited memcg.
- */
-int memcg_cache_id(struct mem_cgroup *memcg)
-{
- return memcg ? memcg->kmemcg_id : -1;
-}
-
static int memcg_alloc_cache_id(void)
{
int id, size;
@@ -4194,20 +3894,23 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
struct fd efile;
struct fd cfile;
const char *name;
- char *endp;
int ret;
buf = strstrip(buf);
- efd = simple_strtoul(buf, &endp, 10);
- if (*endp != ' ')
+ ret = parse_integer(buf, 10, &efd);
+ if (ret < 0)
+ return ret;
+ buf += ret;
+ if (*buf++ != ' ')
return -EINVAL;
- buf = endp + 1;
-
- cfd = simple_strtoul(buf, &endp, 10);
- if ((*endp != ' ') && (*endp != '\0'))
+ ret = parse_integer(buf, 10, &cfd);
+ if (ret < 0)
+ return ret;
+ buf += ret;
+ if (*buf != ' ' && *buf != '\0')
return -EINVAL;
- buf = endp + 1;
+ buf++;
event = kzalloc(sizeof(*event), GFP_KERNEL);
if (!event)
@@ -5127,10 +4830,12 @@ static void mem_cgroup_clear_mc(void)
static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
{
- struct task_struct *p = cgroup_taskset_first(tset);
- int ret = 0;
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *from;
+ struct task_struct *p;
+ struct mm_struct *mm;
unsigned long move_flags;
+ int ret = 0;
/*
* We are now commited to this value whatever it is. Changes in this
@@ -5138,36 +4843,37 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
* So we need to save it, and keep it going.
*/
move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
- if (move_flags) {
- struct mm_struct *mm;
- struct mem_cgroup *from = mem_cgroup_from_task(p);
+ if (!move_flags)
+ return 0;
- VM_BUG_ON(from == memcg);
+ p = cgroup_taskset_first(tset);
+ from = mem_cgroup_from_task(p);
- mm = get_task_mm(p);
- if (!mm)
- return 0;
- /* We move charges only when we move a owner of the mm */
- if (mm->owner == p) {
- VM_BUG_ON(mc.from);
- VM_BUG_ON(mc.to);
- VM_BUG_ON(mc.precharge);
- VM_BUG_ON(mc.moved_charge);
- VM_BUG_ON(mc.moved_swap);
-
- spin_lock(&mc.lock);
- mc.from = from;
- mc.to = memcg;
- mc.flags = move_flags;
- spin_unlock(&mc.lock);
- /* We set mc.moving_task later */
-
- ret = mem_cgroup_precharge_mc(mm);
- if (ret)
- mem_cgroup_clear_mc();
- }
- mmput(mm);
+ VM_BUG_ON(from == memcg);
+
+ mm = get_task_mm(p);
+ if (!mm)
+ return 0;
+	/* We move charges only when we move an owner of the mm */
+ if (mm->owner == p) {
+ VM_BUG_ON(mc.from);
+ VM_BUG_ON(mc.to);
+ VM_BUG_ON(mc.precharge);
+ VM_BUG_ON(mc.moved_charge);
+ VM_BUG_ON(mc.moved_swap);
+
+ spin_lock(&mc.lock);
+ mc.from = from;
+ mc.to = memcg;
+ mc.flags = move_flags;
+ spin_unlock(&mc.lock);
+ /* We set mc.moving_task later */
+
+ ret = mem_cgroup_precharge_mc(mm);
+ if (ret)
+ mem_cgroup_clear_mc();
}
+ mmput(mm);
return ret;
}
@@ -5521,19 +5227,6 @@ struct cgroup_subsys memory_cgrp_subsys = {
};
/**
- * mem_cgroup_events - count memory events against a cgroup
- * @memcg: the memory cgroup
- * @idx: the event index
- * @nr: the number of events to account for
- */
-void mem_cgroup_events(struct mem_cgroup *memcg,
- enum mem_cgroup_events_index idx,
- unsigned int nr)
-{
- this_cpu_add(memcg->stat->events[idx], nr);
-}
-
-/**
* mem_cgroup_low - check if memory consumption is below the normal range
* @root: the highest ancestor to consider
* @memcg: the memory cgroup to check
@@ -5605,8 +5298,20 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
* the page lock, which serializes swap cache removal, which
* in turn serializes uncharging.
*/
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
if (page->mem_cgroup)
goto out;
+
+ if (do_swap_account) {
+ swp_entry_t ent = { .val = page_private(page), };
+ unsigned short id = lookup_swap_cgroup_id(ent);
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_id(id);
+ if (memcg && !css_tryget_online(&memcg->css))
+ memcg = NULL;
+ rcu_read_unlock();
+ }
}
if (PageTransHuge(page)) {
@@ -5614,8 +5319,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
}
- if (do_swap_account && PageSwapCache(page))
- memcg = try_get_mem_cgroup_from_page(page);
if (!memcg)
memcg = get_mem_cgroup_from_mm(mm);
@@ -5965,7 +5668,13 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
if (!mem_cgroup_is_root(memcg))
page_counter_uncharge(&memcg->memory, 1);
- /* Caller disabled preemption with mapping->tree_lock */
+ /*
+	 * Interrupts should be disabled here because the caller holds
+	 * mapping->tree_lock, which is taken with interrupts off. That
+	 * matters because disabled interrupts are the only
+	 * synchronisation we have for updating the per-CPU variables.
+	 */
+ VM_BUG_ON(!irqs_disabled());
mem_cgroup_charge_statistics(memcg, page, -1);
memcg_check_events(memcg, page);
}
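
Among the mm/memcontrol.c changes above, page_cgroup_ino() is the piece aimed at external consumers: it resolves the closest online ancestor cgroup of a page without requiring a page reference, at the cost of being racy. A hedged kernel-style sketch of a read-only caller (names hypothetical, not from this patch):

    /* Hypothetical procfs-style reader: report which cgroup inode a page
     * was charged to. The answer may already be stale when printed,
     * which is acceptable for diagnostic interfaces. */
    static void report_page_cgroup(struct page *page)
    {
            ino_t ino = page_cgroup_ino(page);

            if (ino)
                    pr_debug("page charged to cgroup inode %lu\n",
                             (unsigned long)ino);
            else
                    pr_debug("page not charged to any cgroup\n");
    }
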
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 1f4446a90cef..613389e9e5a8 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -130,27 +130,15 @@ static int hwpoison_filter_flags(struct page *p)
* can only guarantee that the page either belongs to the memcg tasks, or is
* a freed page.
*/
-#ifdef CONFIG_MEMCG_SWAP
+#ifdef CONFIG_MEMCG
u64 hwpoison_filter_memcg;
EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
static int hwpoison_filter_task(struct page *p)
{
- struct mem_cgroup *mem;
- struct cgroup_subsys_state *css;
- unsigned long ino;
-
if (!hwpoison_filter_memcg)
return 0;
- mem = try_get_mem_cgroup_from_page(p);
- if (!mem)
- return -EINVAL;
-
- css = mem_cgroup_css(mem);
- ino = cgroup_ino(css->cgroup);
- css_put(css);
-
- if (ino != hwpoison_filter_memcg)
+ if (page_cgroup_ino(p) != hwpoison_filter_memcg)
return -EINVAL;
return 0;
@@ -934,6 +922,27 @@ int get_hwpoison_page(struct page *page)
}
EXPORT_SYMBOL_GPL(get_hwpoison_page);
+/**
+ * put_hwpoison_page() - Drop the refcount taken for memory error handling
+ * @page: raw error page (hit by memory error)
+ */
+void put_hwpoison_page(struct page *page)
+{
+ struct page *head = compound_head(page);
+
+ if (PageHuge(head)) {
+ put_page(head);
+ return;
+ }
+
+ if (PageTransHuge(head))
+ if (page != head)
+ put_page(head);
+
+ put_page(page);
+}
+EXPORT_SYMBOL_GPL(put_hwpoison_page);
+
/*
* Do all that is necessary to remove user space mappings. Unmap
* the pages and send SIGBUS to the processes if the data was dirty.
@@ -1100,7 +1109,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
nr_pages = 1 << compound_order(hpage);
else /* normal page or thp */
nr_pages = 1;
- atomic_long_add(nr_pages, &num_poisoned_pages);
+ num_poisoned_pages_add(nr_pages);
/*
* We need/can do nothing about count=0 pages.
@@ -1128,7 +1137,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
if (PageHWPoison(hpage)) {
if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
|| (p != hpage && TestSetPageHWPoison(hpage))) {
- atomic_long_sub(nr_pages, &num_poisoned_pages);
+ num_poisoned_pages_sub(nr_pages);
unlock_page(hpage);
return 0;
}
@@ -1152,10 +1161,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
else
pr_err("MCE: %#lx: thp split failed\n", pfn);
if (TestClearPageHWPoison(p))
- atomic_long_sub(nr_pages, &num_poisoned_pages);
- put_page(p);
- if (p != hpage)
- put_page(hpage);
+ num_poisoned_pages_sub(nr_pages);
+ put_hwpoison_page(p);
return -EBUSY;
}
VM_BUG_ON_PAGE(!page_count(p), p);
@@ -1165,7 +1172,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
/*
* We ignore non-LRU pages for good reasons.
* - PG_locked is only well defined for LRU pages and a few others
- * - to avoid races with __set_page_locked()
+ * - to avoid races with __SetPageLocked()
* - to avoid races with __SetPageSlab*() (and more non-atomic ops)
* The check (unnecessarily) ignores LRU pages being isolated and
* walked by the page reclaim code, however that's not a big loss.
@@ -1214,16 +1221,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
*/
if (!PageHWPoison(p)) {
printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
- atomic_long_sub(nr_pages, &num_poisoned_pages);
+ num_poisoned_pages_sub(nr_pages);
unlock_page(hpage);
- put_page(hpage);
+ put_hwpoison_page(hpage);
return 0;
}
if (hwpoison_filter(p)) {
if (TestClearPageHWPoison(p))
- atomic_long_sub(nr_pages, &num_poisoned_pages);
+ num_poisoned_pages_sub(nr_pages);
unlock_page(hpage);
- put_page(hpage);
+ put_hwpoison_page(hpage);
return 0;
}
@@ -1237,7 +1244,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED);
unlock_page(hpage);
- put_page(hpage);
+ put_hwpoison_page(hpage);
return 0;
}
/*
@@ -1426,6 +1433,22 @@ int unpoison_memory(unsigned long pfn)
return 0;
}
+ if (page_count(page) > 1) {
+ pr_info("MCE: Someone grabs the hwpoison page %#lx\n", pfn);
+ return 0;
+ }
+
+ if (page_mapped(page)) {
+ pr_info("MCE: Someone maps the hwpoison page %#lx\n", pfn);
+ return 0;
+ }
+
+ if (page_mapping(page)) {
+ pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n",
+ pfn);
+ return 0;
+ }
+
/*
* unpoison_memory() can encounter thp only when the thp is being
* worked by memory_failure() and the page lock is not held yet.
@@ -1450,7 +1473,7 @@ int unpoison_memory(unsigned long pfn)
return 0;
}
if (TestClearPageHWPoison(p))
- atomic_long_dec(&num_poisoned_pages);
+ num_poisoned_pages_dec();
pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
return 0;
}
@@ -1464,16 +1487,16 @@ int unpoison_memory(unsigned long pfn)
*/
if (TestClearPageHWPoison(page)) {
pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
- atomic_long_sub(nr_pages, &num_poisoned_pages);
+ num_poisoned_pages_sub(nr_pages);
freeit = 1;
if (PageHuge(page))
clear_page_hwpoison_huge_page(page);
}
unlock_page(page);
- put_page(page);
+ put_hwpoison_page(page);
if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
- put_page(page);
+ put_hwpoison_page(page);
return 0;
}
@@ -1533,7 +1556,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
/*
* Try to free it.
*/
- put_page(page);
+ put_hwpoison_page(page);
shake_page(page, 1);
/*
@@ -1542,7 +1565,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
ret = __get_any_page(page, pfn, 0);
if (!PageLRU(page)) {
/* Drop page reference which is from __get_any_page() */
- put_page(page);
+ put_hwpoison_page(page);
pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
pfn, page->flags);
return -EIO;
@@ -1565,7 +1588,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
lock_page(hpage);
if (PageHWPoison(hpage)) {
unlock_page(hpage);
- put_page(hpage);
+ put_hwpoison_page(hpage);
pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
return -EBUSY;
}
@@ -1576,7 +1599,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
* get_any_page() and isolate_huge_page() takes a refcount each,
* so need to drop one here.
*/
- put_page(hpage);
+ put_hwpoison_page(hpage);
if (!ret) {
pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
return -EBUSY;
@@ -1600,11 +1623,10 @@ static int soft_offline_huge_page(struct page *page, int flags)
if (PageHuge(page)) {
set_page_hwpoison_huge_page(hpage);
dequeue_hwpoisoned_huge_page(hpage);
- atomic_long_add(1 << compound_order(hpage),
- &num_poisoned_pages);
+ num_poisoned_pages_add(1 << compound_order(hpage));
} else {
SetPageHWPoison(page);
- atomic_long_inc(&num_poisoned_pages);
+ num_poisoned_pages_inc();
}
}
return ret;
@@ -1625,7 +1647,7 @@ static int __soft_offline_page(struct page *page, int flags)
wait_on_page_writeback(page);
if (PageHWPoison(page)) {
unlock_page(page);
- put_page(page);
+ put_hwpoison_page(page);
pr_info("soft offline: %#lx page already poisoned\n", pfn);
return -EBUSY;
}
@@ -1640,10 +1662,10 @@ static int __soft_offline_page(struct page *page, int flags)
* would need to fix isolation locking first.
*/
if (ret == 1) {
- put_page(page);
+ put_hwpoison_page(page);
pr_info("soft_offline: %#lx: invalidated\n", pfn);
SetPageHWPoison(page);
- atomic_long_inc(&num_poisoned_pages);
+ num_poisoned_pages_inc();
return 0;
}
@@ -1657,14 +1679,12 @@ static int __soft_offline_page(struct page *page, int flags)
* Drop page reference which is came from get_any_page()
* successful isolate_lru_page() already took another one.
*/
- put_page(page);
+ put_hwpoison_page(page);
if (!ret) {
LIST_HEAD(pagelist);
inc_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
list_add(&page->lru, &pagelist);
- if (!TestSetPageHWPoison(page))
- atomic_long_inc(&num_poisoned_pages);
ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
MIGRATE_SYNC, MR_MEMORY_FAILURE);
if (ret) {
@@ -1679,8 +1699,6 @@ static int __soft_offline_page(struct page *page, int flags)
pfn, ret, page->flags);
if (ret > 0)
ret = -EIO;
- if (TestClearPageHWPoison(page))
- atomic_long_dec(&num_poisoned_pages);
}
} else {
pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
@@ -1719,12 +1737,16 @@ int soft_offline_page(struct page *page, int flags)
if (PageHWPoison(page)) {
pr_info("soft offline: %#lx page already poisoned\n", pfn);
+ if (flags & MF_COUNT_INCREASED)
+ put_hwpoison_page(page);
return -EBUSY;
}
if (!PageHuge(page) && PageTransHuge(hpage)) {
if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
pr_info("soft offline: %#lx: failed to split THP\n",
pfn);
+ if (flags & MF_COUNT_INCREASED)
+ put_hwpoison_page(page);
return -EBUSY;
}
}
@@ -1742,11 +1764,10 @@ int soft_offline_page(struct page *page, int flags)
if (PageHuge(page)) {
set_page_hwpoison_huge_page(hpage);
if (!dequeue_hwpoisoned_huge_page(hpage))
- atomic_long_add(1 << compound_order(hpage),
- &num_poisoned_pages);
+ num_poisoned_pages_add(1 << compound_order(hpage));
} else {
if (!TestSetPageHWPoison(page))
- atomic_long_inc(&num_poisoned_pages);
+ num_poisoned_pages_inc();
}
}
return ret;
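
put_hwpoison_page() above centralises the head/tail-page bookkeeping that the memory-failure paths previously open-coded around put_page(). A hedged sketch of the get/put pairing the conversion establishes (not a literal excerpt from the file):

    /* Illustration of the reference pattern used in memory-failure.c
     * after this change. */
    static int inspect_possible_poison(struct page *p)
    {
            if (!get_hwpoison_page(p))
                    return -EBUSY;          /* page could not be pinned */

            /* ... examine or isolate the page while holding the ref ... */

            put_hwpoison_page(p);           /* drops the right reference for
                                               huge, THP and normal pages */
            return 0;
    }
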
diff --git a/mm/memory.c b/mm/memory.c
index 388dcf9aa283..558ee16167d9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -61,6 +61,7 @@
#include <linux/string.h>
#include <linux/dma-debug.h>
#include <linux/debugfs.h>
+#include <linux/userfaultfd_k.h>
#include <asm/io.h>
#include <asm/pgalloc.h>
@@ -180,22 +181,22 @@ static void check_sync_rss_stat(struct task_struct *task)
#ifdef HAVE_GENERIC_MMU_GATHER
-static int tlb_next_batch(struct mmu_gather *tlb)
+static bool tlb_next_batch(struct mmu_gather *tlb)
{
struct mmu_gather_batch *batch;
batch = tlb->active;
if (batch->next) {
tlb->active = batch->next;
- return 1;
+ return true;
}
if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
- return 0;
+ return false;
batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
if (!batch)
- return 0;
+ return false;
tlb->batch_count++;
batch->next = NULL;
@@ -205,7 +206,7 @@ static int tlb_next_batch(struct mmu_gather *tlb)
tlb->active->next = batch;
tlb->active = batch;
- return 1;
+ return true;
}
/* tlb_gather_mmu
@@ -2425,8 +2426,6 @@ void unmap_mapping_range(struct address_space *mapping,
if (details.last_index < details.first_index)
details.last_index = ULONG_MAX;
-
- /* DAX uses i_mmap_lock to serialise file truncate vs page fault */
i_mmap_lock_write(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
unmap_mapping_range_tree(&mapping->i_mmap, &details);
@@ -2685,6 +2684,12 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!pte_none(*page_table))
goto unlock;
+ /* Deliver the page fault to userland, check inside PT lock */
+ if (userfaultfd_missing(vma)) {
+ pte_unmap_unlock(page_table, ptl);
+ return handle_userfault(vma, address, flags,
+ VM_UFFD_MISSING);
+ }
goto setpte;
}
@@ -2713,6 +2718,15 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (!pte_none(*page_table))
goto release;
+ /* Deliver the page fault to userland, check inside PT lock */
+ if (userfaultfd_missing(vma)) {
+ pte_unmap_unlock(page_table, ptl);
+ mem_cgroup_cancel_charge(page, memcg);
+ page_cache_release(page);
+ return handle_userfault(vma, address, flags,
+ VM_UFFD_MISSING);
+ }
+
inc_mm_counter_fast(mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, address);
mem_cgroup_commit_charge(page, memcg, false);
@@ -2999,9 +3013,9 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
} else {
/*
* The fault handler has no page to lock, so it holds
- * i_mmap_lock for read to protect against truncate.
+ * i_mmap_lock for write to protect against truncate.
*/
- i_mmap_unlock_read(vma->vm_file->f_mapping);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
}
goto uncharge_out;
}
@@ -3015,9 +3029,9 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
} else {
/*
* The fault handler has no page to lock, so it holds
- * i_mmap_lock for read to protect against truncate.
+ * i_mmap_lock for write to protect against truncate.
*/
- i_mmap_unlock_read(vma->vm_file->f_mapping);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
}
return ret;
uncharge_out:
@@ -3073,7 +3087,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* pinned by vma->vm_file's reference. We rely on unlock_page()'s
* release semantics to prevent the compiler from undoing this copying.
*/
- mapping = fault_page->mapping;
+ mapping = page_rmapping(fault_page);
unlock_page(fault_page);
if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
/*
@@ -3216,6 +3230,27 @@ out:
return 0;
}
+static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd, unsigned int flags)
+{
+ if (!vma->vm_ops)
+ return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags);
+ if (vma->vm_ops->pmd_fault)
+ return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
+ return VM_FAULT_FALLBACK;
+}
+
+static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd, pmd_t orig_pmd,
+ unsigned int flags)
+{
+ if (!vma->vm_ops)
+ return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd);
+ if (vma->vm_ops->pmd_fault)
+ return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
+ return VM_FAULT_FALLBACK;
+}
+
/*
* These routines also need to handle stuff like marking pages dirty
* and/or accessed for architectures that don't do it in hardware (most
@@ -3251,12 +3286,12 @@ static int handle_pte_fault(struct mm_struct *mm,
barrier();
if (!pte_present(entry)) {
if (pte_none(entry)) {
- if (vma->vm_ops)
+ if (vma_is_anonymous(vma))
+ return do_anonymous_page(mm, vma, address,
+ pte, pmd, flags);
+ else
return do_fault(mm, vma, address, pte, pmd,
flags, entry);
-
- return do_anonymous_page(mm, vma, address, pte, pmd,
- flags);
}
return do_swap_page(mm, vma, address,
pte, pmd, flags, entry);
@@ -3318,10 +3353,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (!pmd)
return VM_FAULT_OOM;
if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
- int ret = VM_FAULT_FALLBACK;
- if (!vma->vm_ops)
- ret = do_huge_pmd_anonymous_page(mm, vma, address,
- pmd, flags);
+ int ret = create_huge_pmd(mm, vma, address, pmd, flags);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
@@ -3345,8 +3377,8 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
orig_pmd, pmd);
if (dirty && !pmd_write(orig_pmd)) {
- ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
- orig_pmd);
+ ret = wp_huge_pmd(mm, vma, address, pmd,
+ orig_pmd, flags);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
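
create_huge_pmd() and wp_huge_pmd() above route huge-PMD faults either to the anonymous THP handlers or to a driver/filesystem ->pmd_fault() hook. A hedged sketch of a hypothetical user of that hook (the names below are illustrative, not from this patch):

    /* Hypothetical vm_operations showing the hook that create_huge_pmd()
     * and wp_huge_pmd() dispatch to; regular 4K faults would still go
     * through a separate ->fault handler. */
    static int mydrv_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
                               pmd_t *pmd, unsigned int flags)
    {
            /* Either install a PMD-sized mapping here, or decline and let
             * the core fall back to PTE-sized faults. */
            return VM_FAULT_FALLBACK;
    }

    static const struct vm_operations_struct mydrv_vm_ops = {
            .pmd_fault = mydrv_pmd_fault,
    };
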
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6da82bcb0a8b..d8ba10f658e9 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1073,16 +1073,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
-static void reset_node_present_pages(pg_data_t *pgdat)
-{
- struct zone *z;
-
- for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
- z->present_pages = 0;
-
- pgdat->node_present_pages = 0;
-}
-
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
{
@@ -1117,21 +1107,6 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
build_all_zonelists(pgdat, NULL);
mutex_unlock(&zonelists_mutex);
- /*
- * zone->managed_pages is set to an approximate value in
- * free_area_init_core(), which will cause
- * /sys/device/system/node/nodeX/meminfo has wrong data.
- * So reset it to 0 before any memory is onlined.
- */
- reset_node_managed_pages(pgdat);
-
- /*
- * When memory is hot-added, all the memory is in offline state. So
- * clear all zones' present_pages because they will be updated in
- * online_pages() and offline_pages().
- */
- reset_node_present_pages(pgdat);
-
return pgdat;
}
@@ -1342,7 +1317,7 @@ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
}
/*
- * Confirm all pages in a range [start, end) is belongs to the same zone.
+ * Confirm all pages in a range [start, end) belong to the same zone.
*/
int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
{
@@ -1353,10 +1328,11 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
for (pfn = start_pfn;
pfn < end_pfn;
pfn += MAX_ORDER_NR_PAGES) {
- i = 0;
- /* This is just a CONFIG_HOLES_IN_ZONE check.*/
- while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i))
- i++;
+ /* Find the first valid pfn in this pageblock */
+ for (i = 0; i < MAX_ORDER_NR_PAGES; i++) {
+ if (pfn_valid(pfn + i))
+ break;
+ }
if (i == MAX_ORDER_NR_PAGES)
continue;
page = pfn_to_page(pfn + i);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 99d4c1d0b858..d6f2caee28c0 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -608,9 +608,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
qp->prev = vma;
- if (vma->vm_flags & VM_PFNMAP)
- return 1;
-
if (flags & MPOL_MF_LAZY) {
/* Similar to task_numa_work, skip inaccessible VMAs */
if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
@@ -722,8 +719,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
pgoff = vma->vm_pgoff +
((vmstart - vma->vm_start) >> PAGE_SHIFT);
prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
- vma->anon_vma, vma->vm_file, pgoff,
- new_pol);
+ vma->anon_vma, vma->vm_file, pgoff,
+ new_pol, vma->vm_userfaultfd_ctx);
if (prev) {
vma = prev;
next = vma->vm_next;
diff --git a/mm/mempool.c b/mm/mempool.c
index 2cc08de8b1db..4c533bc51d73 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -150,6 +150,9 @@ static void *remove_element(mempool_t *pool)
*/
void mempool_destroy(mempool_t *pool)
{
+ if (unlikely(!pool))
+ return;
+
while (pool->curr_nr) {
void *element = remove_element(pool);
pool->free(element, pool->pool_data);
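
mempool_destroy() now returns early on a NULL pool, so teardown paths no longer need their own guard. A brief before/after illustration:

    /* Before: callers had to check for NULL themselves. */
    if (pool)
            mempool_destroy(pool);

    /* After this change the guard is redundant: */
    mempool_destroy(pool);          /* safe even when pool == NULL */
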
diff --git a/mm/memtest.c b/mm/memtest.c
index 0a1cc133f6d7..4b4f36b46371 100644
--- a/mm/memtest.c
+++ b/mm/memtest.c
@@ -1,11 +1,6 @@
#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
#include <linux/init.h>
-#include <linux/pfn.h>
#include <linux/memblock.h>
static u64 patterns[] __initdata = {
@@ -31,10 +26,8 @@ static u64 patterns[] __initdata = {
static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr_t end_bad)
{
- printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n",
- (unsigned long long) pattern,
- (unsigned long long) start_bad,
- (unsigned long long) end_bad);
+ pr_info(" %016llx bad mem addr %pa - %pa reserved\n",
+ cpu_to_be64(pattern), &start_bad, &end_bad);
memblock_reserve(start_bad, end_bad - start_bad);
}
@@ -79,22 +72,20 @@ static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end)
this_start = clamp(this_start, start, end);
this_end = clamp(this_end, start, end);
if (this_start < this_end) {
- printk(KERN_INFO " %010llx - %010llx pattern %016llx\n",
- (unsigned long long)this_start,
- (unsigned long long)this_end,
- (unsigned long long)cpu_to_be64(pattern));
+ pr_info(" %pa - %pa pattern %016llx\n",
+ &this_start, &this_end, cpu_to_be64(pattern));
memtest(pattern, this_start, this_end - this_start);
}
}
}
/* default is disabled */
-static int memtest_pattern __initdata;
+static unsigned int memtest_pattern __initdata;
static int __init parse_memtest(char *arg)
{
if (arg)
- memtest_pattern = simple_strtoul(arg, NULL, 0);
+ parse_integer(arg, 0, (unsigned int *)&memtest_pattern);
else
memtest_pattern = ARRAY_SIZE(patterns);
@@ -111,7 +102,7 @@ void __init early_memtest(phys_addr_t start, phys_addr_t end)
if (!memtest_pattern)
return;
- printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern);
+ pr_info("early_memtest: # of tests: %u\n", memtest_pattern);
for (i = memtest_pattern-1; i < UINT_MAX; --i) {
idx = i % ARRAY_SIZE(patterns);
do_one_pass(patterns[idx], start, end);
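
The mm/memtest.c cleanup keeps the existing memtest= boot parameter semantics while switching to parse_integer() and %pa printing. For reference, the parameter is used as follows (behaviour unchanged by this patch):

    memtest=4        run 4 passes over free memory during early boot
    memtest          no value: one pass per entry in patterns[]
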
diff --git a/mm/migrate.c b/mm/migrate.c
index eb4267107d1f..fbf17988ab5f 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -37,6 +37,7 @@
#include <linux/gfp.h>
#include <linux/balloon_compaction.h>
#include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
#include <asm/tlbflush.h>
@@ -524,6 +525,11 @@ void migrate_page_copy(struct page *newpage, struct page *page)
__set_page_dirty_nobuffers(newpage);
}
+ if (page_is_young(page))
+ set_page_young(newpage);
+ if (page_is_idle(page))
+ set_page_idle(newpage);
+
/*
* Copy NUMA information to the new page, to prevent over-eager
* future migrations of this same page.
@@ -880,8 +886,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
/* Establish migration ptes or remove ptes */
if (page_mapped(page)) {
try_to_unmap(page,
- TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS|
- TTU_IGNORE_HWPOISON);
+ TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
page_was_mapped = 1;
}
@@ -952,9 +957,11 @@ out:
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
/* Soft-offlined page shouldn't go through lru cache list */
- if (reason == MR_MEMORY_FAILURE)
+ if (reason == MR_MEMORY_FAILURE) {
put_page(page);
- else
+ if (!test_set_page_hwpoison(page))
+ num_poisoned_pages_inc();
+ } else
putback_lru_page(page);
}
@@ -1226,7 +1233,9 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
goto set_status;
- page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT);
+ /* FOLL_DUMP to ignore special (like zero) pages */
+ page = follow_page(vma, pp->addr,
+ FOLL_GET | FOLL_SPLIT | FOLL_DUMP);
err = PTR_ERR(page);
if (IS_ERR(page))
@@ -1236,10 +1245,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
if (!page)
goto set_status;
- /* Use PageReserved to check for zero page */
- if (PageReserved(page))
- goto put_and_set;
-
pp->page = page;
err = page_to_nid(page);
@@ -1396,18 +1401,14 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
if (!vma || addr < vma->vm_start)
goto set_status;
- page = follow_page(vma, addr, 0);
+ /* FOLL_DUMP to ignore special (like zero) pages */
+ page = follow_page(vma, addr, FOLL_DUMP);
err = PTR_ERR(page);
if (IS_ERR(page))
goto set_status;
- err = -ENOENT;
- /* Use PageReserved to check for zero page */
- if (!page || PageReserved(page))
- goto set_status;
-
- err = page_to_nid(page);
+ err = page ? page_to_nid(page) : -ENOENT;
set_status:
*status = err;
@@ -1753,7 +1754,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
flush_tlb_range(vma, mmun_start, mmun_end);
/* Prepare a page as a migration target */
- __set_page_locked(new_page);
+ __SetPageLocked(new_page);
SetPageSwapBacked(new_page);
/* anon mapping, we can simply copy page->mapping to the new page: */
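
In the mm/migrate.c hunks above, follow_page() is now called with FOLL_DUMP so that special mappings such as the shared zero page show up as -ENOENT in move_pages()'s status array instead of being filtered via PageReserved. A minimal userspace sketch (assumes libnuma's <numaif.h> for the move_pages() wrapper; link with -lnuma, error handling trimmed):

    #include <numaif.h>
    #include <sys/mman.h>
    #include <stdio.h>

    int main(void)
    {
            long psz = 4096;
            char *p = mmap(NULL, psz, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            void *pages[1] = { p };
            int nodes[1] = { 0 };
            int status[1];

            if (p == MAP_FAILED)
                    return 1;
            p[0] = 1;       /* fault the page in so it has a node */

            /* Move (or just query) the page; status[0] is the node id on
             * success, or a negative errno such as -ENOENT. */
            if (move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE) == 0)
                    printf("status[0] = %d\n", status[0]);

            munmap(p, psz);
            return 0;
    }
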
diff --git a/mm/mlock.c b/mm/mlock.c
index 6fd2cf15e868..c0ac9f58b139 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -422,7 +422,7 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
void munlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
- vma->vm_flags &= ~VM_LOCKED;
+ vma->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
while (start < end) {
struct page *page = NULL;
@@ -506,11 +506,13 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
- goto out; /* don't set VM_LOCKED, don't count */
+ /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
+ goto out;
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
- vma->vm_file, pgoff, vma_policy(vma));
+ vma->vm_file, pgoff, vma_policy(vma),
+ vma->vm_userfaultfd_ctx);
if (*prev) {
vma = *prev;
goto success;
@@ -553,7 +555,8 @@ out:
return ret;
}
-static int do_mlock(unsigned long start, size_t len, int on)
+static int apply_vma_lock_flags(unsigned long start, size_t len,
+ vm_flags_t flags)
{
unsigned long nstart, end, tmp;
struct vm_area_struct * vma, * prev;
@@ -575,14 +578,12 @@ static int do_mlock(unsigned long start, size_t len, int on)
prev = vma;
for (nstart = start ; ; ) {
- vm_flags_t newflags;
-
- /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
+ vm_flags_t newflags =
+ vma->vm_flags & ~(VM_LOCKED | VM_LOCKONFAULT);
- newflags = vma->vm_flags & ~VM_LOCKED;
- if (on)
- newflags |= VM_LOCKED;
+ newflags |= flags;
+ /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
tmp = vma->vm_end;
if (tmp > end)
tmp = end;
@@ -604,7 +605,7 @@ static int do_mlock(unsigned long start, size_t len, int on)
return error;
}
-SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
+static int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
{
unsigned long locked;
unsigned long lock_limit;
@@ -628,7 +629,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
/* check against resource limits */
if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
- error = do_mlock(start, len, 1);
+ error = apply_vma_lock_flags(start, len, flags);
up_write(&current->mm->mmap_sem);
if (error)
@@ -640,6 +641,24 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
return 0;
}
+SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
+{
+ return do_mlock(start, len, VM_LOCKED);
+}
+
+SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags)
+{
+ vm_flags_t vm_flags = VM_LOCKED;
+
+ if (flags & ~MLOCK_ONFAULT)
+ return -EINVAL;
+
+ if (flags & MLOCK_ONFAULT)
+ vm_flags |= VM_LOCKONFAULT;
+
+ return do_mlock(start, len, vm_flags);
+}
+
SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
{
int ret;
@@ -648,29 +667,49 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
start &= PAGE_MASK;
down_write(&current->mm->mmap_sem);
- ret = do_mlock(start, len, 0);
+ ret = apply_vma_lock_flags(start, len, 0);
up_write(&current->mm->mmap_sem);
return ret;
}
-static int do_mlockall(int flags)
+/*
+ * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall)
+ * and translate into the appropriate modifications to mm->def_flags and/or the
+ * flags for all current VMAs.
+ *
+ * There are a couple of subtleties with this. If mlockall() is called multiple
+ * times with different flags, the values do not necessarily stack. If mlockall
+ * is called once including the MCL_FUTURE flag and then a second time without
+ * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags.
+ */
+static int apply_mlockall_flags(int flags)
{
struct vm_area_struct * vma, * prev = NULL;
+ vm_flags_t to_add = 0;
- if (flags & MCL_FUTURE)
+ current->mm->def_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
+ if (flags & MCL_FUTURE) {
current->mm->def_flags |= VM_LOCKED;
- else
- current->mm->def_flags &= ~VM_LOCKED;
- if (flags == MCL_FUTURE)
- goto out;
+
+ if (flags & MCL_ONFAULT)
+ current->mm->def_flags |= VM_LOCKONFAULT;
+
+ if (!(flags & MCL_CURRENT))
+ goto out;
+ }
+
+ if (flags & MCL_CURRENT) {
+ to_add |= VM_LOCKED;
+ if (flags & MCL_ONFAULT)
+ to_add |= VM_LOCKONFAULT;
+ }
for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
vm_flags_t newflags;
- newflags = vma->vm_flags & ~VM_LOCKED;
- if (flags & MCL_CURRENT)
- newflags |= VM_LOCKED;
+ newflags = vma->vm_flags & ~(VM_LOCKED | VM_LOCKONFAULT);
+ newflags |= to_add;
/* Ignore errors */
mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
@@ -685,7 +724,8 @@ SYSCALL_DEFINE1(mlockall, int, flags)
unsigned long lock_limit;
int ret = -EINVAL;
- if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE)))
+ if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) ||
+ flags == MCL_ONFAULT)
goto out;
ret = -EPERM;
@@ -703,7 +743,7 @@ SYSCALL_DEFINE1(mlockall, int, flags)
if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
capable(CAP_IPC_LOCK))
- ret = do_mlockall(flags);
+ ret = apply_mlockall_flags(flags);
up_write(&current->mm->mmap_sem);
if (!ret && (flags & MCL_CURRENT))
mm_populate(0, TASK_SIZE);
@@ -716,7 +756,7 @@ SYSCALL_DEFINE0(munlockall)
int ret;
down_write(&current->mm->mmap_sem);
- ret = do_mlockall(0);
+ ret = apply_mlockall_flags(0);
up_write(&current->mm->mmap_sem);
return ret;
}
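
The mm/mlock.c changes add the mlock2() syscall with MLOCK_ONFAULT, plus a matching MCL_ONFAULT flag for mlockall(). A minimal userspace sketch, assuming there is no glibc wrapper yet (so the call goes through syscall()); the MLOCK_ONFAULT value 0x01 and the syscall number are taken from this series and may differ on your tree:

    #define _GNU_SOURCE
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <stdio.h>

    #ifndef MLOCK_ONFAULT
    #define MLOCK_ONFAULT 0x01      /* assumed value from this series */
    #endif

    int main(void)
    {
            size_t len = 16 << 20;  /* large region we expect to touch sparsely */
            void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            if (p == MAP_FAILED)
                    return 1;

            /* Lock pages only as they are faulted in, instead of
             * populating and locking the whole range up front. */
    #ifdef __NR_mlock2
            if (syscall(__NR_mlock2, p, len, MLOCK_ONFAULT) != 0)
                    perror("mlock2(MLOCK_ONFAULT)");
    #endif
            munmap(p, len);
            return 0;
    }

The same behaviour for an entire address space would be requested with mlockall(MCL_CURRENT | MCL_ONFAULT), which apply_mlockall_flags() above now accepts.
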
diff --git a/mm/mmap.c b/mm/mmap.c
index f126923ce683..76ada5dd968b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -41,6 +41,7 @@
#include <linux/notifier.h>
#include <linux/memory.h>
#include <linux/printk.h>
+#include <linux/userfaultfd_k.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -919,7 +920,8 @@ again: remove_next = 1 + (end > next->vm_end);
* per-vma resources, so we don't attempt to merge those.
*/
static inline int is_mergeable_vma(struct vm_area_struct *vma,
- struct file *file, unsigned long vm_flags)
+ struct file *file, unsigned long vm_flags,
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
{
/*
* VM_SOFTDIRTY should not prevent from VMA merging, if we
@@ -935,6 +937,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
return 0;
if (vma->vm_ops && vma->vm_ops->close)
return 0;
+ if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
+ return 0;
return 1;
}
@@ -965,9 +969,11 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
*/
static int
can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
- struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+ struct anon_vma *anon_vma, struct file *file,
+ pgoff_t vm_pgoff,
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
{
- if (is_mergeable_vma(vma, file, vm_flags) &&
+ if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
if (vma->vm_pgoff == vm_pgoff)
return 1;
@@ -984,9 +990,11 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
*/
static int
can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
- struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+ struct anon_vma *anon_vma, struct file *file,
+ pgoff_t vm_pgoff,
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
{
- if (is_mergeable_vma(vma, file, vm_flags) &&
+ if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
pgoff_t vm_pglen;
vm_pglen = vma_pages(vma);
@@ -1029,7 +1037,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
struct vm_area_struct *prev, unsigned long addr,
unsigned long end, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
- pgoff_t pgoff, struct mempolicy *policy)
+ pgoff_t pgoff, struct mempolicy *policy,
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
struct vm_area_struct *area, *next;
@@ -1056,14 +1065,17 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
if (prev && prev->vm_end == addr &&
mpol_equal(vma_policy(prev), policy) &&
can_vma_merge_after(prev, vm_flags,
- anon_vma, file, pgoff)) {
+ anon_vma, file, pgoff,
+ vm_userfaultfd_ctx)) {
/*
* OK, it can. Can we now merge in the successor as well?
*/
if (next && end == next->vm_start &&
mpol_equal(policy, vma_policy(next)) &&
can_vma_merge_before(next, vm_flags,
- anon_vma, file, pgoff+pglen) &&
+ anon_vma, file,
+ pgoff+pglen,
+ vm_userfaultfd_ctx) &&
is_mergeable_anon_vma(prev->anon_vma,
next->anon_vma, NULL)) {
/* cases 1, 6 */
@@ -1084,7 +1096,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
if (next && end == next->vm_start &&
mpol_equal(policy, vma_policy(next)) &&
can_vma_merge_before(next, vm_flags,
- anon_vma, file, pgoff+pglen)) {
+ anon_vma, file, pgoff+pglen,
+ vm_userfaultfd_ctx)) {
if (prev && addr < prev->vm_end) /* case 4 */
err = vma_adjust(prev, prev->vm_start,
addr, prev->vm_pgoff, NULL);
@@ -1570,8 +1583,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
/*
* Can we just expand an old mapping?
*/
- vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff,
- NULL);
+ vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
+ NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
if (vma)
goto out;
@@ -1651,7 +1664,7 @@ out:
vma == get_gate_vma(current->mm)))
mm->locked_vm += (len >> PAGE_SHIFT);
else
- vma->vm_flags &= ~VM_LOCKED;
+ vma->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
}
if (file)
@@ -2757,7 +2770,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
/* Can we just expand an old private anonymous mapping? */
vma = vma_merge(mm, prev, addr, addr + len, flags,
- NULL, NULL, pgoff, NULL);
+ NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
if (vma)
goto out;
@@ -2871,7 +2884,7 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
* using the existing file pgoff checks and manipulations.
* Similarly in do_mmap_pgoff and in do_brk.
*/
- if (!vma->vm_file) {
+ if (vma_is_anonymous(vma)) {
BUG_ON(vma->anon_vma);
vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
}
@@ -2905,7 +2918,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
* If anonymous vma has not yet been faulted, update new pgoff
* to match new location, to increase its chance of merging.
*/
- if (unlikely(!vma->vm_file && !vma->anon_vma)) {
+ if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
pgoff = addr >> PAGE_SHIFT;
faulted_in_anon_vma = false;
}
@@ -2913,7 +2926,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
return NULL; /* should never get here */
new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
- vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
+ vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
+ vma->vm_userfaultfd_ctx);
if (new_vma) {
/*
* Source vma may have been merged into new_vma
@@ -3013,21 +3027,13 @@ static int special_mapping_fault(struct vm_area_struct *vma,
pgoff_t pgoff;
struct page **pages;
- /*
- * special mappings have no vm_file, and in that case, the mm
- * uses vm_pgoff internally. So we have to subtract it from here.
- * We are allowed to do this because we are the mm; do not copy
- * this code into drivers!
- */
- pgoff = vmf->pgoff - vma->vm_pgoff;
-
if (vma->vm_ops == &legacy_special_mapping_vmops)
pages = vma->vm_private_data;
else
pages = ((struct vm_special_mapping *)vma->vm_private_data)->
pages;
- for (; pgoff && *pages; ++pages)
+ for (pgoff = vmf->pgoff; pgoff && *pages; ++pages)
pgoff--;
if (*pages) {
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 3b9b3d0741b2..5fbdd367bbed 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -123,6 +123,23 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
return young;
}
+int __mmu_notifier_clear_young(struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ struct mmu_notifier *mn;
+ int young = 0, id;
+
+ id = srcu_read_lock(&srcu);
+ hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+ if (mn->ops->clear_young)
+ young |= mn->ops->clear_young(mn, mm, start, end);
+ }
+ srcu_read_unlock(&srcu, id);
+
+ return young;
+}
+
int __mmu_notifier_test_young(struct mm_struct *mm,
unsigned long address)
{
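
__mmu_notifier_clear_young() above gives secondary MMUs (for example KVM) a non-flushing "clear accessed bit" hook, used by idle page tracking. A hedged sketch of how a notifier might wire it up (names hypothetical, handler body omitted):

    /* Illustration: a clear_young callback is expected to clear the
     * accessed state for [start, end) in the secondary MMU and return
     * non-zero if any page in the range was young. */
    static int my_mmu_clear_young(struct mmu_notifier *mn, struct mm_struct *mm,
                                  unsigned long start, unsigned long end)
    {
            return 0;       /* nothing tracked in this sketch */
    }

    static const struct mmu_notifier_ops my_mmu_ops = {
            .clear_young = my_mmu_clear_young,
    };
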
diff --git a/mm/mprotect.c b/mm/mprotect.c
index e7d6f1171ecb..ef5be8eaab00 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -292,7 +292,8 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
*/
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*pprev = vma_merge(mm, *pprev, start, end, newflags,
- vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
+ vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
+ vma->vm_userfaultfd_ctx);
if (*pprev) {
vma = *pprev;
goto success;
diff --git a/mm/mremap.c b/mm/mremap.c
index a7c93eceb1c8..5a71cce8c6ea 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -276,6 +276,12 @@ static unsigned long move_vma(struct vm_area_struct *vma,
moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
need_rmap_locks);
if (moved_len < old_len) {
+ err = -ENOMEM;
+ } else if (vma->vm_ops && vma->vm_ops->mremap) {
+ err = vma->vm_ops->mremap(new_vma);
+ }
+
+ if (unlikely(err)) {
/*
* On error, move entries back from new area to old,
* which will succeed since page tables still there,
@@ -286,16 +292,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
vma = new_vma;
old_len = new_len;
old_addr = new_addr;
- new_addr = -ENOMEM;
+ new_addr = err;
} else {
- if (vma->vm_file && vma->vm_file->f_op->mremap) {
- err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma);
- if (err < 0) {
- move_page_tables(new_vma, new_addr, vma,
- old_addr, moved_len, true);
- return err;
- }
- }
arch_remap(mm, old_addr, old_addr + old_len,
new_addr, new_addr + new_len);
}
@@ -348,6 +346,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = find_vma(mm, addr);
+ unsigned long pgoff;
if (!vma || vma->vm_start > addr)
return ERR_PTR(-EFAULT);
@@ -359,17 +358,17 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
if (old_len > vma->vm_end - addr)
return ERR_PTR(-EFAULT);
+ if (new_len == old_len)
+ return vma;
+
/* Need to be careful about a growing mapping */
- if (new_len > old_len) {
- unsigned long pgoff;
-
- if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
- return ERR_PTR(-EFAULT);
- pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
- pgoff += vma->vm_pgoff;
- if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
- return ERR_PTR(-EINVAL);
- }
+ pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
+ pgoff += vma->vm_pgoff;
+ if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
+ return ERR_PTR(-EINVAL);
+
+ if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
+ return ERR_PTR(-EFAULT);
if (vma->vm_flags & VM_LOCKED) {
unsigned long locked, lock_limit;
@@ -408,13 +407,8 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
goto out;
- /* Check if the location we're moving into overlaps the
- * old location at all, and fail if it does.
- */
- if ((new_addr <= addr) && (new_addr+new_len) > addr)
- goto out;
-
- if ((addr <= new_addr) && (addr+old_len) > new_addr)
+ /* Ensure the old/new locations do not overlap */
+ if (addr + old_len > new_addr && new_addr + new_len > addr)
goto out;
ret = do_munmap(mm, new_addr, new_len);
@@ -580,8 +574,10 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked);
}
out:
- if (ret & ~PAGE_MASK)
+ if (ret & ~PAGE_MASK) {
vm_unacct_memory(charged);
+ locked = 0;
+ }
up_write(&current->mm->mmap_sem);
if (locked && new_len > old_len)
mm_populate(new_addr + old_len, new_len - old_len);
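With move_vma() now calling vma->vm_ops->mremap() on the new VMA, a special mapping can track or veto a move. A hedged sketch of such a hook is shown below; the context structure and its field are invented for illustration, only the callback shape matches the call site above.

/* Hypothetical per-mapping state. */
struct my_mapping_ctx {
	unsigned long user_addr;
};

/* Matches the call site above: err = vma->vm_ops->mremap(new_vma).
 * A non-zero return makes move_vma() move the page tables back and
 * fail the mremap() system call. */
static int my_mapping_mremap(struct vm_area_struct *new_vma)
{
	struct my_mapping_ctx *ctx = new_vma->vm_private_data;

	ctx->user_addr = new_vma->vm_start;	/* remember the new location */
	return 0;
}

static const struct vm_operations_struct my_mapping_vm_ops = {
	.mremap	= my_mapping_mremap,
	/* .fault and friends as before */
};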
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index dff991e0681e..1ecc0bcaecc5 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -196,27 +196,26 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
* Determine the type of allocation constraint.
*/
#ifdef CONFIG_NUMA
-static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
- gfp_t gfp_mask, nodemask_t *nodemask,
- unsigned long *totalpages)
+static enum oom_constraint constrained_alloc(struct oom_control *oc,
+ unsigned long *totalpages)
{
struct zone *zone;
struct zoneref *z;
- enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+ enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask);
bool cpuset_limited = false;
int nid;
/* Default to all available memory */
*totalpages = totalram_pages + total_swap_pages;
- if (!zonelist)
+ if (!oc->zonelist)
return CONSTRAINT_NONE;
/*
* Reach here only when __GFP_NOFAIL is used. So, we should avoid
* to kill current.We have to random task kill in this case.
* Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
*/
- if (gfp_mask & __GFP_THISNODE)
+ if (oc->gfp_mask & __GFP_THISNODE)
return CONSTRAINT_NONE;
/*
@@ -224,17 +223,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
* the page allocator means a mempolicy is in effect. Cpuset policy
* is enforced in get_page_from_freelist().
*/
- if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) {
+ if (oc->nodemask &&
+ !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
*totalpages = total_swap_pages;
- for_each_node_mask(nid, *nodemask)
+ for_each_node_mask(nid, *oc->nodemask)
*totalpages += node_spanned_pages(nid);
return CONSTRAINT_MEMORY_POLICY;
}
/* Check this allocation failure is caused by cpuset's wall function */
- for_each_zone_zonelist_nodemask(zone, z, zonelist,
- high_zoneidx, nodemask)
- if (!cpuset_zone_allowed(zone, gfp_mask))
+ for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
+ high_zoneidx, oc->nodemask)
+ if (!cpuset_zone_allowed(zone, oc->gfp_mask))
cpuset_limited = true;
if (cpuset_limited) {
@@ -246,20 +246,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
return CONSTRAINT_NONE;
}
#else
-static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
- gfp_t gfp_mask, nodemask_t *nodemask,
- unsigned long *totalpages)
+static enum oom_constraint constrained_alloc(struct oom_control *oc,
+ unsigned long *totalpages)
{
*totalpages = totalram_pages + total_swap_pages;
return CONSTRAINT_NONE;
}
#endif
-enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
- unsigned long totalpages, const nodemask_t *nodemask,
- bool force_kill)
+enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
+ struct task_struct *task, unsigned long totalpages)
{
- if (oom_unkillable_task(task, NULL, nodemask))
+ if (oom_unkillable_task(task, NULL, oc->nodemask))
return OOM_SCAN_CONTINUE;
/*
@@ -267,7 +265,7 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
* Don't allow any other task to have access to the reserves.
*/
if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
- if (!force_kill)
+ if (oc->order != -1)
return OOM_SCAN_ABORT;
}
if (!task->mm)
@@ -280,7 +278,7 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
if (oom_task_origin(task))
return OOM_SCAN_SELECT;
- if (task_will_free_mem(task) && !force_kill)
+ if (task_will_free_mem(task) && oc->order != -1)
return OOM_SCAN_ABORT;
return OOM_SCAN_OK;
@@ -289,12 +287,9 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
/*
* Simple selection loop. We chose the process with the highest
* number of 'points'. Returns -1 on scan abort.
- *
- * (not docbooked, we don't want this one cluttering up the manual)
*/
-static struct task_struct *select_bad_process(unsigned int *ppoints,
- unsigned long totalpages, const nodemask_t *nodemask,
- bool force_kill)
+static struct task_struct *select_bad_process(struct oom_control *oc,
+ unsigned int *ppoints, unsigned long totalpages)
{
struct task_struct *g, *p;
struct task_struct *chosen = NULL;
@@ -304,8 +299,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
for_each_process_thread(g, p) {
unsigned int points;
- switch (oom_scan_process_thread(p, totalpages, nodemask,
- force_kill)) {
+ switch (oom_scan_process_thread(oc, p, totalpages)) {
case OOM_SCAN_SELECT:
chosen = p;
chosen_points = ULONG_MAX;
@@ -318,7 +312,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
case OOM_SCAN_OK:
break;
};
- points = oom_badness(p, NULL, nodemask, totalpages);
+ points = oom_badness(p, NULL, oc->nodemask, totalpages);
if (!points || points < chosen_points)
continue;
/* Prefer thread group leaders for display purposes */
@@ -380,13 +374,13 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
rcu_read_unlock();
}
-static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
- struct mem_cgroup *memcg, const nodemask_t *nodemask)
+static void dump_header(struct oom_control *oc, struct task_struct *p,
+ struct mem_cgroup *memcg)
{
task_lock(current);
pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
"oom_score_adj=%hd\n",
- current->comm, gfp_mask, order,
+ current->comm, oc->gfp_mask, oc->order,
current->signal->oom_score_adj);
cpuset_print_task_mems_allowed(current);
task_unlock(current);
@@ -396,7 +390,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
else
show_mem(SHOW_MEM_FILTER_NODES);
if (sysctl_oom_dump_tasks)
- dump_tasks(memcg, nodemask);
+ dump_tasks(memcg, oc->nodemask);
}
/*
@@ -487,10 +481,9 @@ void oom_killer_enable(void)
* Must be called while holding a reference to p, which will be released upon
* returning.
*/
-void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
+void oom_kill_process(struct oom_control *oc, struct task_struct *p,
unsigned int points, unsigned long totalpages,
- struct mem_cgroup *memcg, nodemask_t *nodemask,
- const char *message)
+ struct mem_cgroup *memcg, const char *message)
{
struct task_struct *victim = p;
struct task_struct *child;
@@ -514,7 +507,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
task_unlock(p);
if (__ratelimit(&oom_rs))
- dump_header(p, gfp_mask, order, memcg, nodemask);
+ dump_header(oc, p, memcg);
task_lock(p);
pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
@@ -537,7 +530,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
/*
* oom_badness() returns 0 if the thread is unkillable
*/
- child_points = oom_badness(child, memcg, nodemask,
+ child_points = oom_badness(child, memcg, oc->nodemask,
totalpages);
if (child_points > victim_points) {
put_task_struct(victim);
@@ -600,8 +593,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
/*
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
*/
-void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
- int order, const nodemask_t *nodemask,
+void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
struct mem_cgroup *memcg)
{
if (likely(!sysctl_panic_on_oom))
@@ -615,7 +607,10 @@ void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
if (constraint != CONSTRAINT_NONE)
return;
}
- dump_header(NULL, gfp_mask, order, memcg, nodemask);
+ /* Do not panic for oom kills triggered by sysrq */
+ if (oc->order == -1)
+ return;
+ dump_header(oc, NULL, memcg);
panic("Out of memory: %s panic_on_oom is enabled\n",
sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
}
@@ -635,28 +630,21 @@ int unregister_oom_notifier(struct notifier_block *nb)
EXPORT_SYMBOL_GPL(unregister_oom_notifier);
/**
- * __out_of_memory - kill the "best" process when we run out of memory
- * @zonelist: zonelist pointer
- * @gfp_mask: memory allocation flags
- * @order: amount of memory being requested as a power of 2
- * @nodemask: nodemask passed to page allocator
- * @force_kill: true if a task must be killed, even if others are exiting
+ * out_of_memory - kill the "best" process when we run out of memory
+ * @oc: pointer to struct oom_control
*
* If we run out of memory, we have the choice between either
* killing a random task (bad), letting the system crash (worse)
* OR try to be smart about which process to kill. Note that we
* don't have to be perfect here, we just have to be good.
*/
-bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
- int order, nodemask_t *nodemask, bool force_kill)
+bool out_of_memory(struct oom_control *oc)
{
- const nodemask_t *mpol_mask;
struct task_struct *p;
unsigned long totalpages;
unsigned long freed = 0;
unsigned int uninitialized_var(points);
enum oom_constraint constraint = CONSTRAINT_NONE;
- int killed = 0;
if (oom_killer_disabled)
return false;
@@ -664,7 +652,7 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
if (freed > 0)
/* Got some memory back in the last second. */
- goto out;
+ return true;
/*
* If current has a pending SIGKILL or is exiting, then automatically
@@ -677,47 +665,42 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
if (current->mm &&
(fatal_signal_pending(current) || task_will_free_mem(current))) {
mark_oom_victim(current);
- goto out;
+ return true;
}
/*
* Check if there were limitations on the allocation (only relevant for
* NUMA) that may require different handling.
*/
- constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
- &totalpages);
- mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
- check_panic_on_oom(constraint, gfp_mask, order, mpol_mask, NULL);
+ constraint = constrained_alloc(oc, &totalpages);
+ if (constraint != CONSTRAINT_MEMORY_POLICY)
+ oc->nodemask = NULL;
+ check_panic_on_oom(oc, constraint, NULL);
if (sysctl_oom_kill_allocating_task && current->mm &&
- !oom_unkillable_task(current, NULL, nodemask) &&
+ !oom_unkillable_task(current, NULL, oc->nodemask) &&
current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
get_task_struct(current);
- oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
- nodemask,
+ oom_kill_process(oc, current, 0, totalpages, NULL,
"Out of memory (oom_kill_allocating_task)");
- goto out;
+ return true;
}
- p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
+ p = select_bad_process(oc, &points, totalpages);
/* Found nothing?!?! Either we hang forever, or we panic. */
- if (!p) {
- dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
+ if (!p && oc->order != -1) {
+ dump_header(oc, NULL, NULL);
panic("Out of memory and no killable processes...\n");
}
- if (p != (void *)-1UL) {
- oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
- nodemask, "Out of memory");
- killed = 1;
- }
-out:
- /*
- * Give the killed threads a good chance of exiting before trying to
- * allocate memory again.
- */
- if (killed)
+ if (p && p != (void *)-1UL) {
+ oom_kill_process(oc, p, points, totalpages, NULL,
+ "Out of memory");
+ /*
+ * Give the killed process a good chance to exit before trying
+ * to allocate memory again.
+ */
schedule_timeout_killable(1);
-
+ }
return true;
}
@@ -728,13 +711,20 @@ out:
*/
void pagefault_out_of_memory(void)
{
+ struct oom_control oc = {
+ .zonelist = NULL,
+ .nodemask = NULL,
+ .gfp_mask = 0,
+ .order = 0,
+ };
+
if (mem_cgroup_oom_synchronize(true))
return;
if (!mutex_trylock(&oom_lock))
return;
- if (!out_of_memory(NULL, 0, 0, NULL, false)) {
+ if (!out_of_memory(&oc)) {
/*
* There shouldn't be any user tasks runnable while the
* OOM killer is disabled, so the current task has to
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5b5240b7f642..c1024db4ac5f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -125,6 +125,24 @@ unsigned long dirty_balance_reserve __read_mostly;
int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
+/*
+ * A cached value of the page's pageblock's migratetype, used when the page is
+ * put on a pcplist. Used to avoid the pageblock migratetype lookup when
+ * freeing from pcplists in most cases, at the cost of possibly becoming stale.
+ * Also the migratetype set in the page does not necessarily match the pcplist
+ * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
+ * other index - this ensures that it will be put on the correct CMA freelist.
+ */
+static inline int get_pcppage_migratetype(struct page *page)
+{
+ return page->index;
+}
+
+static inline void set_pcppage_migratetype(struct page *page, int migratetype)
+{
+ page->index = migratetype;
+}
+
#ifdef CONFIG_PM_SLEEP
/*
* The following functions are used by the suspend/hibernate code to temporarily
@@ -443,6 +461,7 @@ void prep_compound_page(struct page *page, unsigned long order)
for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
set_page_count(p, 0);
+ p->mapping = TAIL_MAPPING;
p->first_page = page;
/* Make sure p->first_page is always valid for PageTail() */
smp_wmb();
@@ -788,7 +807,11 @@ static void free_pcppages_bulk(struct zone *zone, int count,
page = list_entry(list->prev, struct page, lru);
/* must delete as __free_one_page list manipulates */
list_del(&page->lru);
- mt = get_freepage_migratetype(page);
+
+ mt = get_pcppage_migratetype(page);
+ /* MIGRATE_ISOLATE page should not go to pcplists */
+ VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
+ /* Pageblock could have been isolated meanwhile */
if (unlikely(has_isolate_pageblock(zone)))
mt = get_pageblock_migratetype(page);
@@ -821,6 +844,12 @@ static void free_one_page(struct zone *zone,
static int free_tail_pages_check(struct page *head_page, struct page *page)
{
+ if (page->mapping != TAIL_MAPPING) {
+ bad_page(page, "corrupted mapping in tail page", 0);
+ page->mapping = NULL;
+ return 1;
+ }
+ page->mapping = NULL;
if (!IS_ENABLED(CONFIG_DEBUG_VM))
return 0;
if (unlikely(!PageTail(page))) {
@@ -952,7 +981,6 @@ static void __free_pages_ok(struct page *page, unsigned int order)
migratetype = get_pfnblock_migratetype(page, pfn);
local_irq_save(flags);
__count_vm_events(PGFREE, 1 << order);
- set_freepage_migratetype(page, migratetype);
free_one_page(page_zone(page), page, pfn, order, migratetype);
local_irq_restore(flags);
}
@@ -1380,7 +1408,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
rmv_page_order(page);
area->nr_free--;
expand(zone, page, order, current_order, area, migratetype);
- set_freepage_migratetype(page, migratetype);
+ set_pcppage_migratetype(page, migratetype);
return page;
}
@@ -1457,7 +1485,6 @@ int move_freepages(struct zone *zone,
order = page_order(page);
list_move(&page->lru,
&zone->free_area[order].free_list[migratetype]);
- set_freepage_migratetype(page, migratetype);
page += 1 << order;
pages_moved += 1 << order;
}
@@ -1627,14 +1654,13 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
expand(zone, page, order, current_order, area,
start_migratetype);
/*
- * The freepage_migratetype may differ from pageblock's
+ * The pcppage_migratetype may differ from pageblock's
* migratetype depending on the decisions in
- * try_to_steal_freepages(). This is OK as long as it
- * does not differ for MIGRATE_CMA pageblocks. For CMA
- * we need to make sure unallocated pages flushed from
- * pcp lists are returned to the correct freelist.
+ * find_suitable_fallback(). This is OK as long as it does not
+ * differ for MIGRATE_CMA pageblocks. Those can be used as
+ * fallback only via special __rmqueue_cma_fallback() function
*/
- set_freepage_migratetype(page, start_migratetype);
+ set_pcppage_migratetype(page, start_migratetype);
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, fallback_mt);
@@ -1710,7 +1736,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
else
list_add_tail(&page->lru, list);
list = &page->lru;
- if (is_migrate_cma(get_freepage_migratetype(page)))
+ if (is_migrate_cma(get_pcppage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
-(1 << order));
}
@@ -1907,7 +1933,7 @@ void free_hot_cold_page(struct page *page, bool cold)
return;
migratetype = get_pfnblock_migratetype(page, pfn);
- set_freepage_migratetype(page, migratetype);
+ set_pcppage_migratetype(page, migratetype);
local_irq_save(flags);
__count_vm_event(PGFREE);
@@ -2112,7 +2138,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
if (!page)
goto failed;
__mod_zone_freepage_state(zone, -(1 << order),
- get_freepage_migratetype(page));
+ get_pcppage_migratetype(page));
}
__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
@@ -2693,6 +2719,12 @@ static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
const struct alloc_context *ac, unsigned long *did_some_progress)
{
+ struct oom_control oc = {
+ .zonelist = ac->zonelist,
+ .nodemask = ac->nodemask,
+ .gfp_mask = gfp_mask,
+ .order = order,
+ };
struct page *page;
*did_some_progress = 0;
@@ -2744,8 +2776,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
goto out;
}
/* Exhausted what can be done so it's blamo time */
- if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)
- || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
+ if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
*did_some_progress = 1;
out:
mutex_unlock(&oom_lock);
@@ -5303,8 +5334,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
*
* NOTE: pgdat should get zeroed by caller.
*/
-static void __paginginit free_area_init_core(struct pglist_data *pgdat,
- unsigned long node_start_pfn, unsigned long node_end_pfn)
+static void __paginginit free_area_init_core(struct pglist_data *pgdat)
{
enum zone_type j;
int nid = pgdat->node_id;
@@ -5467,7 +5497,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
(unsigned long)pgdat->node_mem_map);
#endif
- free_area_init_core(pgdat, start_pfn, end_pfn);
+ free_area_init_core(pgdat);
}
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -5478,11 +5508,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
*/
void __init setup_nr_node_ids(void)
{
- unsigned int node;
- unsigned int highest = 0;
+ unsigned int highest;
- for_each_node_mask(node, node_possible_map)
- highest = node;
+ highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
nr_node_ids = highest + 1;
}
#endif
@@ -6423,7 +6451,7 @@ static int __init set_hashdist(char *str)
{
if (!str)
return 0;
- hashdist = simple_strtoul(str, &str, 0);
+ parse_integer(str, 0, (unsigned int *)&hashdist);
return 1;
}
__setup("hashdist=", set_hashdist);
diff --git a/mm/page_ext.c b/mm/page_ext.c
index d86fd2f5353f..292ca7b8debd 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -6,6 +6,7 @@
#include <linux/vmalloc.h>
#include <linux/kmemleak.h>
#include <linux/page_owner.h>
+#include <linux/page_idle.h>
/*
* struct page extension
@@ -59,6 +60,9 @@ static struct page_ext_operations *page_ext_ops[] = {
#ifdef CONFIG_PAGE_OWNER
&page_owner_ops,
#endif
+#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
+ &page_idle_ops,
+#endif
};
static unsigned long total_usage;
diff --git a/mm/page_idle.c b/mm/page_idle.c
new file mode 100644
index 000000000000..d5dd79041484
--- /dev/null
+++ b/mm/page_idle.c
@@ -0,0 +1,232 @@
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/fs.h>
+#include <linux/sysfs.h>
+#include <linux/kobject.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
+#include <linux/page_ext.h>
+#include <linux/page_idle.h>
+
+#define BITMAP_CHUNK_SIZE sizeof(u64)
+#define BITMAP_CHUNK_BITS (BITMAP_CHUNK_SIZE * BITS_PER_BYTE)
+
+/*
+ * Idle page tracking only considers user memory pages, for other types of
+ * pages the idle flag is always unset and an attempt to set it is silently
+ * ignored.
+ *
+ * We treat a page as a user memory page if it is on an LRU list, because it is
+ * always safe to pass such a page to rmap_walk(), which is essential for idle
+ * page tracking. With such an indicator of user pages we can skip isolated
+ * pages, but since there are not usually many of them, it will hardly affect
+ * the overall result.
+ *
+ * This function tries to get a user memory page by pfn as described above.
+ */
+static struct page *page_idle_get_page(unsigned long pfn)
+{
+ struct page *page;
+ struct zone *zone;
+
+ if (!pfn_valid(pfn))
+ return NULL;
+
+ page = pfn_to_page(pfn);
+ if (!page || !PageLRU(page) ||
+ !get_page_unless_zero(page))
+ return NULL;
+
+ zone = page_zone(page);
+ spin_lock_irq(&zone->lru_lock);
+ if (unlikely(!PageLRU(page))) {
+ put_page(page);
+ page = NULL;
+ }
+ spin_unlock_irq(&zone->lru_lock);
+ return page;
+}
+
+static int page_idle_clear_pte_refs_one(struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long addr, void *arg)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ spinlock_t *ptl;
+ pmd_t *pmd;
+ pte_t *pte;
+ bool referenced = false;
+
+ if (unlikely(PageTransHuge(page))) {
+ pmd = page_check_address_pmd(page, mm, addr,
+ PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
+ if (pmd) {
+ referenced = pmdp_clear_young_notify(vma, addr, pmd);
+ spin_unlock(ptl);
+ }
+ } else {
+ pte = page_check_address(page, mm, addr, &ptl, 0);
+ if (pte) {
+ referenced = ptep_clear_young_notify(vma, addr, pte);
+ pte_unmap_unlock(pte, ptl);
+ }
+ }
+ if (referenced) {
+ clear_page_idle(page);
+ /*
+ * We cleared the referenced bit in a mapping to this page. To
+ * avoid interference with page reclaim, mark it young so that
+ * page_referenced() will return > 0.
+ */
+ set_page_young(page);
+ }
+ return SWAP_AGAIN;
+}
+
+static void page_idle_clear_pte_refs(struct page *page)
+{
+ /*
+ * Since rwc.arg is unused, rwc is effectively immutable, so we
+ * can make it static const to save some cycles and stack.
+ */
+ static const struct rmap_walk_control rwc = {
+ .rmap_one = page_idle_clear_pte_refs_one,
+ .anon_lock = page_lock_anon_vma_read,
+ };
+ bool need_lock;
+
+ if (!page_mapped(page) ||
+ !page_rmapping(page))
+ return;
+
+ need_lock = !PageAnon(page) || PageKsm(page);
+ if (need_lock && !trylock_page(page))
+ return;
+
+ rmap_walk(page, (struct rmap_walk_control *)&rwc);
+
+ if (need_lock)
+ unlock_page(page);
+}
+
+static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
+ struct bin_attribute *attr, char *buf,
+ loff_t pos, size_t count)
+{
+ u64 *out = (u64 *)buf;
+ struct page *page;
+ unsigned long pfn, end_pfn;
+ int bit;
+
+ if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
+ return -EINVAL;
+
+ pfn = pos * BITS_PER_BYTE;
+ if (pfn >= max_pfn)
+ return 0;
+
+ end_pfn = pfn + count * BITS_PER_BYTE;
+ if (end_pfn > max_pfn)
+ end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS);
+
+ for (; pfn < end_pfn; pfn++) {
+ bit = pfn % BITMAP_CHUNK_BITS;
+ if (!bit)
+ *out = 0ULL;
+ page = page_idle_get_page(pfn);
+ if (page) {
+ if (page_is_idle(page)) {
+ /*
+ * The page might have been referenced via a
+ * pte, in which case it is not idle. Clear
+ * refs and recheck.
+ */
+ page_idle_clear_pte_refs(page);
+ if (page_is_idle(page))
+ *out |= 1ULL << bit;
+ }
+ put_page(page);
+ }
+ if (bit == BITMAP_CHUNK_BITS - 1)
+ out++;
+ cond_resched();
+ }
+ return (char *)out - buf;
+}
+
+static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
+ struct bin_attribute *attr, char *buf,
+ loff_t pos, size_t count)
+{
+ const u64 *in = (u64 *)buf;
+ struct page *page;
+ unsigned long pfn, end_pfn;
+ int bit;
+
+ if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
+ return -EINVAL;
+
+ pfn = pos * BITS_PER_BYTE;
+ if (pfn >= max_pfn)
+ return -ENXIO;
+
+ end_pfn = pfn + count * BITS_PER_BYTE;
+ if (end_pfn > max_pfn)
+ end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS);
+
+ for (; pfn < end_pfn; pfn++) {
+ bit = pfn % BITMAP_CHUNK_BITS;
+ if ((*in >> bit) & 1) {
+ page = page_idle_get_page(pfn);
+ if (page) {
+ page_idle_clear_pte_refs(page);
+ set_page_idle(page);
+ put_page(page);
+ }
+ }
+ if (bit == BITMAP_CHUNK_BITS - 1)
+ in++;
+ cond_resched();
+ }
+ return (char *)in - buf;
+}
+
+static struct bin_attribute page_idle_bitmap_attr =
+ __BIN_ATTR(bitmap, S_IRUSR | S_IWUSR,
+ page_idle_bitmap_read, page_idle_bitmap_write, 0);
+
+static struct bin_attribute *page_idle_bin_attrs[] = {
+ &page_idle_bitmap_attr,
+ NULL,
+};
+
+static struct attribute_group page_idle_attr_group = {
+ .bin_attrs = page_idle_bin_attrs,
+ .name = "page_idle",
+};
+
+#ifndef CONFIG_64BIT
+static bool need_page_idle(void)
+{
+ return true;
+}
+struct page_ext_operations page_idle_ops = {
+ .need = need_page_idle,
+};
+#endif
+
+static int __init page_idle_init(void)
+{
+ int err;
+
+ err = sysfs_create_group(mm_kobj, &page_idle_attr_group);
+ if (err) {
+ pr_err("page_idle: register sysfs failed\n");
+ return err;
+ }
+ return 0;
+}
+subsys_initcall(page_idle_init);
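The bitmap exposed above is addressed in 8-byte chunks, one bit per PFN, and is meant to be driven from user space. A rough user-space sketch follows (not part of the patch; the file path and chunk size come from the code above, everything else is assumed): mark a PFN range idle, let the workload run, then re-read the bitmap to see which frames were never touched.

/* Hypothetical user-space tool; needs root and CONFIG_IDLE_PAGE_TRACKING=y. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define BITMAP_CHUNK_BITS 64

int main(int argc, char **argv)
{
	unsigned long start, nr, chunk, nchunks, i;
	uint64_t *buf;
	int fd;

	if (argc < 3)
		return 1;
	start = strtoul(argv[1], NULL, 0);	/* first PFN */
	nr = strtoul(argv[2], NULL, 0);		/* number of PFNs */
	chunk = start / BITMAP_CHUNK_BITS;
	nchunks = (start + nr + BITMAP_CHUNK_BITS - 1) / BITMAP_CHUNK_BITS - chunk;

	buf = calloc(nchunks, sizeof(*buf));
	fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR);
	if (!buf || fd < 0)
		return 1;

	/* Set one bit per PFN; offsets and sizes are multiples of 8 bytes. */
	for (i = start; i < start + nr; i++)
		buf[i / BITMAP_CHUNK_BITS - chunk] |= 1ULL << (i % BITMAP_CHUNK_BITS);
	pwrite(fd, buf, nchunks * sizeof(*buf), chunk * sizeof(*buf));

	/* ... run the workload, then see which frames stayed idle ... */
	pread(fd, buf, nchunks * sizeof(*buf), chunk * sizeof(*buf));
	for (i = start; i < start + nr; i++)
		if (buf[i / BITMAP_CHUNK_BITS - chunk] & (1ULL << (i % BITMAP_CHUNK_BITS)))
			printf("pfn %lu is still idle\n", i);

	close(fd);
	free(buf);
	return 0;
}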
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 303c908790ef..47dec0508c6e 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -9,7 +9,8 @@
#include <linux/hugetlb.h>
#include "internal.h"
-int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages)
+static int set_migratetype_isolate(struct page *page,
+ bool skip_hwpoisoned_pages)
{
struct zone *zone;
unsigned long flags, pfn;
@@ -72,7 +73,7 @@ out:
return ret;
}
-void unset_migratetype_isolate(struct page *page, unsigned migratetype)
+static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
{
struct zone *zone;
unsigned long flags, nr_pages;
@@ -178,8 +179,11 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
undo:
for (pfn = start_pfn;
pfn < undo_pfn;
- pfn += pageblock_nr_pages)
- unset_migratetype_isolate(pfn_to_page(pfn), migratetype);
+ pfn += pageblock_nr_pages) {
+ page = __first_valid_page(pfn, pageblock_nr_pages);
+ if (page)
+ unset_migratetype_isolate(page, migratetype);
+ }
return -EBUSY;
}
@@ -223,34 +227,16 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
continue;
}
page = pfn_to_page(pfn);
- if (PageBuddy(page)) {
+ if (PageBuddy(page))
/*
- * If race between isolatation and allocation happens,
- * some free pages could be in MIGRATE_MOVABLE list
- * although pageblock's migratation type of the page
- * is MIGRATE_ISOLATE. Catch it and move the page into
- * MIGRATE_ISOLATE list.
+ * If the page is on a free list, it has to be on
+ * the correct MIGRATE_ISOLATE freelist. There is no
+ * simple way to verify that as VM_BUG_ON(), though.
*/
- if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) {
- struct page *end_page;
-
- end_page = page + (1 << page_order(page)) - 1;
- move_freepages(page_zone(page), page, end_page,
- MIGRATE_ISOLATE);
- }
pfn += 1 << page_order(page);
- }
- else if (page_count(page) == 0 &&
- get_freepage_migratetype(page) == MIGRATE_ISOLATE)
- pfn += 1;
- else if (skip_hwpoisoned_pages && PageHWPoison(page)) {
- /*
- * The HWPoisoned page may be not in buddy
- * system, and page_count() is not 0.
- */
+ else if (skip_hwpoisoned_pages && PageHWPoison(page))
+			/* A HWPoisoned page cannot also be PageBuddy */
pfn++;
- continue;
- }
else
break;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index 171b68768df1..dbda882920a0 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -59,9 +59,12 @@
#include <linux/migrate.h>
#include <linux/hugetlb.h>
#include <linux/backing-dev.h>
+#include <linux/page_idle.h>
#include <asm/tlbflush.h>
+#include <trace/events/tlb.h>
+
#include "internal.h"
static struct kmem_cache *anon_vma_cachep;
@@ -583,6 +586,107 @@ vma_address(struct page *page, struct vm_area_struct *vma)
return address;
}
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+static void percpu_flush_tlb_batch_pages(void *data)
+{
+ /*
+ * All TLB entries are flushed on the assumption that it is
+ * cheaper to flush all TLBs and let them be refilled than
+ * flushing individual PFNs. Note that we do not track mm's
+ * to flush as that might simply be multiple full TLB flushes
+ * for no gain.
+ */
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+ flush_tlb_local();
+}
+
+/*
+ * Flush TLB entries for recently unmapped pages from remote CPUs. It is
+ * important if a PTE was dirty when it was unmapped that it's flushed
+ * before any IO is initiated on the page to prevent lost writes. Similarly,
+ * it must be flushed before freeing to prevent data leakage.
+ */
+void try_to_unmap_flush(void)
+{
+ struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+ int cpu;
+
+ if (!tlb_ubc->flush_required)
+ return;
+
+ cpu = get_cpu();
+
+ trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, -1UL);
+
+ if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask))
+ percpu_flush_tlb_batch_pages(&tlb_ubc->cpumask);
+
+ if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids) {
+ smp_call_function_many(&tlb_ubc->cpumask,
+ percpu_flush_tlb_batch_pages, (void *)tlb_ubc, true);
+ }
+ cpumask_clear(&tlb_ubc->cpumask);
+ tlb_ubc->flush_required = false;
+ tlb_ubc->writable = false;
+ put_cpu();
+}
+
+/* Flush iff there are potentially writable TLB entries that can race with IO */
+void try_to_unmap_flush_dirty(void)
+{
+ struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+
+ if (tlb_ubc->writable)
+ try_to_unmap_flush();
+}
+
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
+ struct page *page, bool writable)
+{
+ struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+
+ cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm));
+ tlb_ubc->flush_required = true;
+
+ /*
+ * If the PTE was dirty then it's best to assume it's writable. The
+ * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
+ * before the page is queued for IO.
+ */
+ if (writable)
+ tlb_ubc->writable = true;
+}
+
+/*
+ * Returns true if the TLB flush should be deferred to the end of a batch of
+ * unmap operations to reduce IPIs.
+ */
+static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
+{
+ bool should_defer = false;
+
+ if (!(flags & TTU_BATCH_FLUSH))
+ return false;
+
+	/* If remote CPUs need to be flushed then defer the flush as a batch */
+ if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
+ should_defer = true;
+ put_cpu();
+
+ return should_defer;
+}
+#else
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
+ struct page *page, bool writable)
+{
+}
+
+static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
+{
+ return false;
+}
+#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
+
/*
* At what user virtual address is page expected in vma?
* Caller should check the page is actually part of the vma.
@@ -714,6 +818,7 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
}
struct page_referenced_arg {
+ int dirtied;
int mapcount;
int referenced;
unsigned long vm_flags;
@@ -728,6 +833,7 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
int referenced = 0;
+ int dirty = 0;
struct page_referenced_arg *pra = arg;
if (unlikely(PageTransHuge(page))) {
@@ -744,13 +850,23 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
if (vma->vm_flags & VM_LOCKED) {
spin_unlock(ptl);
- pra->vm_flags |= VM_LOCKED;
+ pra->vm_flags |=
+ (vma->vm_flags & (VM_LOCKED | VM_LOCKONFAULT));
return SWAP_FAIL; /* To break the loop */
}
/* go ahead even if the pmd is pmd_trans_splitting() */
if (pmdp_clear_flush_young_notify(vma, address, pmd))
referenced++;
+
+ /*
+		 * Use pmd_freeable instead of raw pmd_dirty because on some
+		 * architectures pmd_dirty is not defined unless
+		 * CONFIG_TRANSPARENT_HUGEPAGE is enabled.
+ */
+ if (!pmd_freeable(*pmd))
+ dirty++;
+
spin_unlock(ptl);
} else {
pte_t *pte;
@@ -765,7 +881,8 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
if (vma->vm_flags & VM_LOCKED) {
pte_unmap_unlock(pte, ptl);
- pra->vm_flags |= VM_LOCKED;
+ pra->vm_flags |=
+ (vma->vm_flags & (VM_LOCKED | VM_LOCKONFAULT));
return SWAP_FAIL; /* To break the loop */
}
@@ -780,14 +897,26 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
if (likely(!(vma->vm_flags & VM_SEQ_READ)))
referenced++;
}
+
+ if (pte_dirty(*pte))
+ dirty++;
+
pte_unmap_unlock(pte, ptl);
}
+ if (referenced)
+ clear_page_idle(page);
+ if (test_and_clear_page_young(page))
+ referenced++;
+
if (referenced) {
pra->referenced++;
pra->vm_flags |= vma->vm_flags;
}
+ if (dirty)
+ pra->dirtied++;
+
pra->mapcount--;
if (!pra->mapcount)
return SWAP_SUCCESS; /* To break the loop */
@@ -812,6 +941,7 @@ static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
* @is_locked: caller holds lock on the page
* @memcg: target memory cgroup
* @vm_flags: collect encountered vma->vm_flags who actually referenced the page
+ * @is_pte_dirty: ptes which have marked dirty bit - used for lazyfree page
*
* Quick test_and_clear_referenced for all mappings to a page,
* returns the number of ptes which referenced the page.
@@ -819,7 +949,8 @@ static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
int page_referenced(struct page *page,
int is_locked,
struct mem_cgroup *memcg,
- unsigned long *vm_flags)
+ unsigned long *vm_flags,
+ int *is_pte_dirty)
{
int ret;
int we_locked = 0;
@@ -834,6 +965,9 @@ int page_referenced(struct page *page,
};
*vm_flags = 0;
+ if (is_pte_dirty)
+ *is_pte_dirty = 0;
+
if (!page_mapped(page))
return 0;
@@ -861,6 +995,9 @@ int page_referenced(struct page *page,
if (we_locked)
unlock_page(page);
+ if (is_pte_dirty)
+ *is_pte_dirty = pra.dirtied;
+
return pra.referenced;
}
@@ -1194,6 +1331,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
spinlock_t *ptl;
int ret = SWAP_AGAIN;
enum ttu_flags flags = (enum ttu_flags)arg;
+ int dirty = 0;
pte = page_check_address(page, mm, address, &ptl, 0);
if (!pte)
@@ -1220,17 +1358,33 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
/* Nuke the page table entry. */
flush_cache_page(vma, address, page_to_pfn(page));
- pteval = ptep_clear_flush(vma, address, pte);
+ if (should_defer_flush(mm, flags)) {
+ /*
+ * We clear the PTE but do not flush so potentially a remote
+ * CPU could still be writing to the page. If the entry was
+ * previously clean then the architecture must guarantee that
+ * a clear->dirty transition on a cached TLB entry is written
+ * through and traps if the PTE is unmapped.
+ */
+ pteval = ptep_get_and_clear(mm, address, pte);
+
+ set_tlb_ubc_flush_pending(mm, page, pte_dirty(pteval));
+ } else {
+ pteval = ptep_clear_flush(vma, address, pte);
+ }
/* Move the dirty bit to the physical page now the pte is gone. */
- if (pte_dirty(pteval))
+ dirty = pte_dirty(pteval);
+ if (dirty)
set_page_dirty(page);
/* Update high watermark before we lower rss */
update_hiwater_rss(mm);
if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
- if (!PageHuge(page)) {
+ if (PageHuge(page)) {
+ dec_hugetlb_count(mm, page_hstate(page));
+ } else {
if (PageAnon(page))
dec_mm_counter(mm, MM_ANONPAGES);
else
@@ -1252,6 +1406,19 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
swp_entry_t entry = { .val = page_private(page) };
pte_t swp_pte;
+ if (flags & TTU_FREE) {
+ VM_BUG_ON_PAGE(PageSwapCache(page), page);
+ if (!dirty && !PageDirty(page)) {
+ /* It's a freeable page by MADV_FREE */
+ dec_mm_counter(mm, MM_ANONPAGES);
+ goto discard;
+ } else {
+ set_pte_at(mm, address, pte, pteval);
+ ret = SWAP_FAIL;
+ goto out_unmap;
+ }
+ }
+
if (PageSwapCache(page)) {
/*
* Store the swap location in the pte.
@@ -1292,6 +1459,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
} else
dec_mm_counter(mm, MM_FILEPAGES);
+discard:
page_remove_rmap(page);
page_cache_release(page);
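The extra is_pte_dirty out-parameter added to page_referenced() lets a reclaim-side caller notice pages whose ptes were re-dirtied after an MADV_FREE hint. A simplified, hypothetical caller might use it as below; only the page_referenced() signature comes from the hunk above.

/* Hedged sketch: decide whether a lazyfree page may be discarded. */
static bool page_can_be_lazyfreed(struct page *page, struct mem_cgroup *memcg)
{
	unsigned long vm_flags;
	int pte_dirty;
	int referenced;

	referenced = page_referenced(page, 0, memcg, &vm_flags, &pte_dirty);

	/* Discard only if nothing touched it and no pte (re)dirtied it. */
	return !referenced && !pte_dirty && !PageDirty(page);
}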
diff --git a/mm/shmem.c b/mm/shmem.c
index dbe0c1e8349c..aa9c82a6b406 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -981,7 +981,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
copy_highpage(newpage, oldpage);
flush_dcache_page(newpage);
- __set_page_locked(newpage);
+ __SetPageLocked(newpage);
SetPageUptodate(newpage);
SetPageSwapBacked(newpage);
set_page_private(newpage, swap_index);
@@ -1173,7 +1173,7 @@ repeat:
}
__SetPageSwapBacked(page);
- __set_page_locked(page);
+ __SetPageLocked(page);
if (sgp == SGP_WRITE)
__SetPageReferenced(page);
@@ -2736,6 +2736,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
struct mempolicy *mpol = NULL;
uid_t uid;
gid_t gid;
+ int rv;
while (options != NULL) {
this_char = options;
@@ -2789,14 +2790,15 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
} else if (!strcmp(this_char,"mode")) {
if (remount)
continue;
- sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777;
- if (*rest)
+ rv = parse_integer(value, 8, &sbinfo->mode);
+ if (rv < 0 || value[rv])
goto bad_val;
+ sbinfo->mode &= 07777;
} else if (!strcmp(this_char,"uid")) {
if (remount)
continue;
- uid = simple_strtoul(value, &rest, 0);
- if (*rest)
+ rv = parse_integer(value, 0, &uid);
+ if (rv < 0 || value[rv])
goto bad_val;
sbinfo->uid = make_kuid(current_user_ns(), uid);
if (!uid_valid(sbinfo->uid))
@@ -2804,8 +2806,8 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
} else if (!strcmp(this_char,"gid")) {
if (remount)
continue;
- gid = simple_strtoul(value, &rest, 0);
- if (*rest)
+ rv = parse_integer(value, 0, &gid);
+ if (rv < 0 || value[rv])
goto bad_val;
sbinfo->gid = make_kgid(current_user_ns(), gid);
if (!gid_valid(sbinfo->gid))
diff --git a/mm/slab.c b/mm/slab.c
index bbd0b47dc6a9..bf7169c9882f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2190,7 +2190,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
size += BYTES_PER_WORD;
}
#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
- if (size >= kmalloc_size(INDEX_NODE + 1)
+ if (size >= kmalloc_size(INDEX_NODE) * 2
&& cachep->object_size > cache_line_size()
&& ALIGN(size, cachep->align) < PAGE_SIZE) {
cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align);
@@ -3416,6 +3416,19 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
}
EXPORT_SYMBOL(kmem_cache_alloc);
+void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
+{
+ __kmem_cache_free_bulk(s, size, p);
+}
+EXPORT_SYMBOL(kmem_cache_free_bulk);
+
+bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
+ void **p)
+{
+ return __kmem_cache_alloc_bulk(s, flags, size, p);
+}
+EXPORT_SYMBOL(kmem_cache_alloc_bulk);
+
#ifdef CONFIG_TRACING
void *
kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
diff --git a/mm/slab.h b/mm/slab.h
index 8da63e4e470f..88b55497738c 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -163,6 +163,15 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s);
ssize_t slabinfo_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos);
+/*
+ * Generic implementation of bulk operations
+ * These are useful for situations in which the allocator cannot
+ * perform optimizations. In that case segments of the object listed
+ * may be allocated or freed using these operations.
+ */
+void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
+bool __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
+
#ifdef CONFIG_MEMCG_KMEM
/*
* Iterate over all memcg caches of the given root cache. The caller must hold
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 86831105a09f..5ce4faeb16fb 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -104,6 +104,29 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size)
}
#endif
+void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p)
+{
+ size_t i;
+
+ for (i = 0; i < nr; i++)
+ kmem_cache_free(s, p[i]);
+}
+
+bool __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
+ void **p)
+{
+ size_t i;
+
+ for (i = 0; i < nr; i++) {
+ void *x = p[i] = kmem_cache_alloc(s, flags);
+ if (!x) {
+ __kmem_cache_free_bulk(s, i, p);
+ return false;
+ }
+ }
+ return true;
+}
+
#ifdef CONFIG_MEMCG_KMEM
void slab_init_memcg_params(struct kmem_cache *s)
{
@@ -477,7 +500,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
struct kmem_cache *root_cache)
{
static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */
- struct cgroup_subsys_state *css = mem_cgroup_css(memcg);
+ struct cgroup_subsys_state *css = &memcg->css;
struct memcg_cache_array *arr;
struct kmem_cache *s = NULL;
char *cache_name;
@@ -617,6 +640,9 @@ void kmem_cache_destroy(struct kmem_cache *s)
bool need_rcu_barrier = false;
bool busy = false;
+ if (unlikely(!s))
+ return;
+
BUG_ON(!is_root_cache(s));
get_online_cpus();
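The generic fallbacks above define the contract for the new bulk API: the alloc side either fills the whole array or returns false after freeing any partial allocation. A small, hypothetical caller (the cache and consumer names are stand-ins, and a GFP_KERNEL context with interrupts enabled is assumed) could batch like this:

#include <linux/slab.h>

#define BURST	16

/* Hedged usage sketch, not part of the patch. */
static int process_burst(struct kmem_cache *obj_cache)
{
	void *objs[BURST];
	int i;

	if (!kmem_cache_alloc_bulk(obj_cache, GFP_KERNEL, BURST, objs))
		return -ENOMEM;		/* nothing to free on failure */

	for (i = 0; i < BURST; i++)
		consume_object(objs[i]);	/* hypothetical consumer */

	kmem_cache_free_bulk(obj_cache, BURST, objs);
	return 0;
}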
diff --git a/mm/slob.c b/mm/slob.c
index 4765f65019c7..165bbd3cd606 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -611,6 +611,19 @@ void kmem_cache_free(struct kmem_cache *c, void *b)
}
EXPORT_SYMBOL(kmem_cache_free);
+void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
+{
+ __kmem_cache_free_bulk(s, size, p);
+}
+EXPORT_SYMBOL(kmem_cache_free_bulk);
+
+bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
+ void **p)
+{
+ return __kmem_cache_alloc_bulk(s, flags, size, p);
+}
+EXPORT_SYMBOL(kmem_cache_alloc_bulk);
+
int __kmem_cache_shutdown(struct kmem_cache *c)
{
/* No way to check for remaining objects */
diff --git a/mm/slub.c b/mm/slub.c
index f68c0e50f3c0..8987bd5cb90f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -338,11 +338,13 @@ static inline int oo_objects(struct kmem_cache_order_objects x)
*/
static __always_inline void slab_lock(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
bit_spin_lock(PG_locked, &page->flags);
}
static __always_inline void slab_unlock(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
__bit_spin_unlock(PG_locked, &page->flags);
}
@@ -1306,6 +1308,17 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
kasan_slab_free(s, x);
}
+static void setup_object(struct kmem_cache *s, struct page *page,
+ void *object)
+{
+ setup_object_debug(s, page, object);
+ if (unlikely(s->ctor)) {
+ kasan_unpoison_object_data(s, object);
+ s->ctor(object);
+ kasan_poison_object_data(s, object);
+ }
+}
+
/*
* Slab allocation and freeing
*/
@@ -1336,6 +1349,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
struct page *page;
struct kmem_cache_order_objects oo = s->oo;
gfp_t alloc_gfp;
+ void *start, *p;
+ int idx, order;
flags &= gfp_allowed_mask;
@@ -1349,6 +1364,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
* so we fall-back to the minimum order allocation.
*/
alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
+ if ((alloc_gfp & __GFP_WAIT) && oo_order(oo) > oo_order(s->min))
+ alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_WAIT;
page = alloc_slab_page(s, alloc_gfp, node, oo);
if (unlikely(!page)) {
@@ -1359,13 +1376,13 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
* Try a lower order alloc if possible
*/
page = alloc_slab_page(s, alloc_gfp, node, oo);
-
- if (page)
- stat(s, ORDER_FALLBACK);
+ if (unlikely(!page))
+ goto out;
+ stat(s, ORDER_FALLBACK);
}
- if (kmemcheck_enabled && page
- && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
+ if (kmemcheck_enabled &&
+ !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
int pages = 1 << oo_order(oo);
kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node);
@@ -1380,51 +1397,9 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
kmemcheck_mark_unallocated_pages(page, pages);
}
- if (flags & __GFP_WAIT)
- local_irq_disable();
- if (!page)
- return NULL;
-
page->objects = oo_objects(oo);
- mod_zone_page_state(page_zone(page),
- (s->flags & SLAB_RECLAIM_ACCOUNT) ?
- NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
- 1 << oo_order(oo));
-
- return page;
-}
-
-static void setup_object(struct kmem_cache *s, struct page *page,
- void *object)
-{
- setup_object_debug(s, page, object);
- if (unlikely(s->ctor)) {
- kasan_unpoison_object_data(s, object);
- s->ctor(object);
- kasan_poison_object_data(s, object);
- }
-}
-
-static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
-{
- struct page *page;
- void *start;
- void *p;
- int order;
- int idx;
-
- if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
- pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK);
- BUG();
- }
-
- page = allocate_slab(s,
- flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
- if (!page)
- goto out;
order = compound_order(page);
- inc_slabs_node(s, page_to_nid(page), page->objects);
page->slab_cache = s;
__SetPageSlab(page);
if (page_is_pfmemalloc(page))
@@ -1448,10 +1423,34 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
page->freelist = start;
page->inuse = page->objects;
page->frozen = 1;
+
out:
+ if (flags & __GFP_WAIT)
+ local_irq_disable();
+ if (!page)
+ return NULL;
+
+ mod_zone_page_state(page_zone(page),
+ (s->flags & SLAB_RECLAIM_ACCOUNT) ?
+ NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
+ 1 << oo_order(oo));
+
+ inc_slabs_node(s, page_to_nid(page), page->objects);
+
return page;
}
+static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
+{
+ if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
+ pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK);
+ BUG();
+ }
+
+ return allocate_slab(s,
+ flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
+}
+
static void __free_slab(struct kmem_cache *s, struct page *page)
{
int order = compound_order(page);
@@ -2712,7 +2711,7 @@ redo:
* Determine the currently cpus per cpu slab.
* The cpu may change afterward. However that does not matter since
* data is retrieved via this pointer. If we are on the same cpu
- * during the cmpxchg then the free will succedd.
+ * during the cmpxchg then the free will succeed.
*/
do {
tid = this_cpu_read(s->cpu_slab->tid);
@@ -2750,6 +2749,113 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
}
EXPORT_SYMBOL(kmem_cache_free);
+/* Note that interrupts must be enabled when calling this function. */
+void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
+{
+ struct kmem_cache_cpu *c;
+ struct page *page;
+ int i;
+
+ local_irq_disable();
+ c = this_cpu_ptr(s->cpu_slab);
+
+ for (i = 0; i < size; i++) {
+ void *object = p[i];
+
+ BUG_ON(!object);
+ /* kmem cache debug support */
+ s = cache_from_obj(s, object);
+ if (unlikely(!s))
+ goto exit;
+ slab_free_hook(s, object);
+
+ page = virt_to_head_page(object);
+
+ if (c->page == page) {
+ /* Fastpath: local CPU free */
+ set_freepointer(s, object, c->freelist);
+ c->freelist = object;
+ } else {
+ c->tid = next_tid(c->tid);
+ local_irq_enable();
+ /* Slowpath: overhead locked cmpxchg_double_slab */
+ __slab_free(s, page, object, _RET_IP_);
+ local_irq_disable();
+ c = this_cpu_ptr(s->cpu_slab);
+ }
+ }
+exit:
+ c->tid = next_tid(c->tid);
+ local_irq_enable();
+}
+EXPORT_SYMBOL(kmem_cache_free_bulk);
+
+/* Note that interrupts must be enabled when calling this function. */
+bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
+ void **p)
+{
+ struct kmem_cache_cpu *c;
+ int i;
+
+ /*
+ * Drain objects in the per cpu slab, while disabling local
+	 * IRQs, which protects against PREEMPT and interrupt
+	 * handlers invoking the normal fastpath.
+ */
+ local_irq_disable();
+ c = this_cpu_ptr(s->cpu_slab);
+
+ for (i = 0; i < size; i++) {
+ void *object = c->freelist;
+
+ if (unlikely(!object)) {
+ local_irq_enable();
+ /*
+			 * Invoking the slow path likely has the side effect
+			 * of re-populating the per-CPU c->freelist
+ */
+ p[i] = __slab_alloc(s, flags, NUMA_NO_NODE,
+ _RET_IP_, c);
+ if (unlikely(!p[i])) {
+ __kmem_cache_free_bulk(s, i, p);
+ return false;
+ }
+ local_irq_disable();
+ c = this_cpu_ptr(s->cpu_slab);
+ continue; /* goto for-loop */
+ }
+
+ /* kmem_cache debug support */
+ s = slab_pre_alloc_hook(s, flags);
+ if (unlikely(!s)) {
+ __kmem_cache_free_bulk(s, i, p);
+ c->tid = next_tid(c->tid);
+ local_irq_enable();
+ return false;
+ }
+
+ c->freelist = get_freepointer(s, object);
+ p[i] = object;
+
+ /* kmem_cache debug support */
+ slab_post_alloc_hook(s, flags, object);
+ }
+ c->tid = next_tid(c->tid);
+ local_irq_enable();
+
+ /* Clear memory outside IRQ disabled fastpath loop */
+ if (unlikely(flags & __GFP_ZERO)) {
+ int j;
+
+ for (j = 0; j < i; j++)
+ memset(p[j], 0, s->object_size);
+ }
+
+ return true;
+}
+EXPORT_SYMBOL(kmem_cache_alloc_bulk);
+
+
/*
* Object placement in a slab is made very easy because we always start at
* offset 0. If we tune the size of the object to the alignment then we can
@@ -5181,7 +5287,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
s->kobj.kset = cache_kset(s);
err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
if (err)
- goto out_put_kobj;
+ goto out;
err = sysfs_create_group(&s->kobj, &slab_attr_group);
if (err)
@@ -5208,8 +5314,6 @@ out:
return err;
out_del_kobj:
kobject_del(&s->kobj);
-out_put_kobj:
- kobject_put(&s->kobj);
goto out;
}
diff --git a/mm/swap.c b/mm/swap.c
index a3a0a2f1f7c3..4a6aec976ab1 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -32,6 +32,7 @@
#include <linux/gfp.h>
#include <linux/uio.h>
#include <linux/hugetlb.h>
+#include <linux/page_idle.h>
#include "internal.h"
@@ -44,6 +45,7 @@ int page_cluster;
static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
/*
* This path almost never happens for VM activity - pages are normally
@@ -622,6 +624,8 @@ void mark_page_accessed(struct page *page)
} else if (!PageReferenced(page)) {
SetPageReferenced(page);
}
+ if (page_is_idle(page))
+ clear_page_idle(page);
}
EXPORT_SYMBOL(mark_page_accessed);
@@ -796,6 +800,24 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
update_page_reclaim_stat(lruvec, file, 0);
}
+
+static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+ void *arg)
+{
+ if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+ int file = page_is_file_cache(page);
+ int lru = page_lru_base_type(page);
+
+ del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
+ ClearPageActive(page);
+ ClearPageReferenced(page);
+ add_page_to_lru_list(page, lruvec, lru);
+
+ __count_vm_event(PGDEACTIVATE);
+ update_page_reclaim_stat(lruvec, file, 0);
+ }
+}
+
/*
* Drain pages out of the cpu's pagevecs.
* Either "cpu" is the current CPU, and preemption has already been
@@ -822,6 +844,10 @@ void lru_add_drain_cpu(int cpu)
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
+ pvec = &per_cpu(lru_deactivate_pvecs, cpu);
+ if (pagevec_count(pvec))
+ pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+
activate_page_drain(cpu);
}
@@ -851,6 +877,26 @@ void deactivate_file_page(struct page *page)
}
}
+/**
+ * deactivate_page - deactivate a page
+ * @page: page to deactivate
+ *
+ * deactivate_page() moves @page to the inactive list if @page was on the active
+ * list and was not an unevictable page. This is done to accelerate the reclaim
+ * of @page.
+ */
+void deactivate_page(struct page *page)
+{
+ if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+ struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+
+ page_cache_get(page);
+ if (!pagevec_add(pvec, page))
+ pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+ put_cpu_var(lru_deactivate_pvecs);
+ }
+}
+
void lru_add_drain(void)
{
lru_add_drain_cpu(get_cpu());
@@ -880,6 +926,7 @@ void lru_add_drain_all(void)
if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
+ pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
need_activate_page_drain(cpu)) {
INIT_WORK(work, lru_add_drain_per_cpu);
schedule_work_on(cpu, work);
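deactivate_page() gives hinting code (an madvise()-style path, for instance) a way to push a still-mapped but expendable page to the inactive list. A hedged sketch of such a caller, with the eligibility check invented for illustration:

/* Hypothetical hinting helper; only deactivate_page() itself comes
 * from the hunk above. */
static void hint_page_reclaimable(struct page *page)
{
	/* Clean, non-swapcache pages can be dropped cheaply, so move
	 * them to the inactive LRU ahead of the next reclaim pass. */
	if (!PageDirty(page) && !PageSwapCache(page)) {
		ClearPageReferenced(page);
		deactivate_page(page);
	}
}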
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 8bc8e66138da..d783872d746c 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -288,17 +288,14 @@ struct page * lookup_swap_cache(swp_entry_t entry)
return page;
}
-/*
- * Locate a page of swap in physical memory, reserving swap cache space
- * and reading the disk if it is not already cached.
- * A failure return means that either the page allocation failed or that
- * the swap entry is no longer in use.
- */
-struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
- struct vm_area_struct *vma, unsigned long addr)
+struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
+ struct vm_area_struct *vma, unsigned long addr,
+ bool *new_page_allocated)
{
struct page *found_page, *new_page = NULL;
+ struct address_space *swapper_space = swap_address_space(entry);
int err;
+ *new_page_allocated = false;
do {
/*
@@ -306,8 +303,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* called after lookup_swap_cache() failed, re-calling
* that would confuse statistics.
*/
- found_page = find_get_page(swap_address_space(entry),
- entry.val);
+ found_page = find_get_page(swapper_space, entry.val);
if (found_page)
break;
@@ -357,7 +353,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
}
/* May fail (-ENOMEM) if radix-tree node allocation failed. */
- __set_page_locked(new_page);
+ __SetPageLocked(new_page);
SetPageSwapBacked(new_page);
err = __add_to_swap_cache(new_page, entry);
if (likely(!err)) {
@@ -366,12 +362,12 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* Initiate read into locked page and return.
*/
lru_cache_add_anon(new_page);
- swap_readpage(new_page);
+ *new_page_allocated = true;
return new_page;
}
radix_tree_preload_end();
ClearPageSwapBacked(new_page);
- __clear_page_locked(new_page);
+ __ClearPageLocked(new_page);
/*
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
* clear SWAP_HAS_CACHE flag.
@@ -384,6 +380,25 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
return found_page;
}
+/*
+ * Locate a page of swap in physical memory, reserving swap cache space
+ * and reading the disk if it is not already cached.
+ * A failure return means that either the page allocation failed or that
+ * the swap entry is no longer in use.
+ */
+struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ bool page_was_allocated;
+ struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
+ vma, addr, &page_was_allocated);
+
+ if (page_was_allocated)
+ swap_readpage(retpage);
+
+ return retpage;
+}
+
static unsigned long swapin_nr_pages(unsigned long offset)
{
static unsigned long prev_offset;
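Splitting swap_readpage() out of read_swap_cache_async() lets a caller that already holds the data elsewhere (a compressed copy, say) allocate the swap-cache page without issuing disk I/O. A hedged sketch of such a caller is below; the decompression helper is hypothetical, only __read_swap_cache_async() and its out-parameter come from the hunk above.

/* Hedged sketch, not part of the patch: populate a swap-cache page
 * from a compressed copy instead of reading the disk. */
static struct page *fill_swapcache_page(swp_entry_t entry)
{
	bool page_was_allocated;
	struct page *page;

	page = __read_swap_cache_async(entry, GFP_KERNEL, NULL, 0,
				       &page_was_allocated);
	if (!page)
		return NULL;

	if (page_was_allocated) {
		/* Freshly added to the swap cache and still locked:
		 * fill it ourselves, then publish it. */
		my_decompress_into(entry, page);	/* hypothetical */
		SetPageUptodate(page);
		unlock_page(page);
	}
	return page;
}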
diff --git a/mm/swapfile.c b/mm/swapfile.c
index aebc2dd6e649..58877312cf6b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -875,6 +875,48 @@ int page_swapcount(struct page *page)
}
/*
+ * How many references to @entry are currently swapped out?
+ * This considers COUNT_CONTINUED so it returns exact answer.
+ */
+int swp_swapcount(swp_entry_t entry)
+{
+ int count, tmp_count, n;
+ struct swap_info_struct *p;
+ struct page *page;
+ pgoff_t offset;
+ unsigned char *map;
+
+ p = swap_info_get(entry);
+ if (!p)
+ return 0;
+
+ count = swap_count(p->swap_map[swp_offset(entry)]);
+ if (!(count & COUNT_CONTINUED))
+ goto out;
+
+ count &= ~COUNT_CONTINUED;
+ n = SWAP_MAP_MAX + 1;
+
+ offset = swp_offset(entry);
+ page = vmalloc_to_page(p->swap_map + offset);
+ offset &= ~PAGE_MASK;
+ VM_BUG_ON(page_private(page) != SWP_CONTINUED);
+
+ do {
+ page = list_entry(page->lru.next, struct page, lru);
+ map = kmap_atomic(page);
+ tmp_count = map[offset];
+ kunmap_atomic(map);
+
+ count += (tmp_count & ~COUNT_CONTINUED) * n;
+ n *= (SWAP_CONT_MAX + 1);
+ } while (tmp_count & COUNT_CONTINUED);
+out:
+ spin_unlock(&p->lock);
+ return count;
+}
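
For reference, a minimal user-space sketch of the positional arithmetic swp_swapcount() performs; the byte values are invented and the constants are assumed from include/linux/swap.h (SWAP_MAP_MAX == 0x3e, SWAP_CONT_MAX == 0x7f), so treat it as an illustration rather than kernel code.

#include <stdio.h>

int main(void)
{
	unsigned int base = 5;      /* swap_map count with COUNT_CONTINUED cleared */
	unsigned int cont = 3;      /* first continuation byte; chain ends here */
	unsigned int n = 0x3e + 1;  /* SWAP_MAP_MAX + 1: weight of the first continuation */
	unsigned int count = base + cont * n;

	/* a further continuation byte would be weighted by n * (SWAP_CONT_MAX + 1) */
	printf("swap count = %u\n", count);   /* prints 194 */
	return 0;
}
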
+
+/*
* We can write to an anon page without COW if there are no other references
* to it. And as a side-effect, free up its swap: because the old content
* on disk will never be read, and seeking back there to write new content
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
new file mode 100644
index 000000000000..77fee9325a57
--- /dev/null
+++ b/mm/userfaultfd.c
@@ -0,0 +1,308 @@
+/*
+ * mm/userfaultfd.c
+ *
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/userfaultfd_k.h>
+#include <linux/mmu_notifier.h>
+#include <asm/tlbflush.h>
+#include "internal.h"
+
+static int mcopy_atomic_pte(struct mm_struct *dst_mm,
+ pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ struct page **pagep)
+{
+ struct mem_cgroup *memcg;
+ pte_t _dst_pte, *dst_pte;
+ spinlock_t *ptl;
+ void *page_kaddr;
+ int ret;
+ struct page *page;
+
+ if (!*pagep) {
+ ret = -ENOMEM;
+ page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr);
+ if (!page)
+ goto out;
+
+ page_kaddr = kmap_atomic(page);
+ ret = copy_from_user(page_kaddr,
+ (const void __user *) src_addr,
+ PAGE_SIZE);
+ kunmap_atomic(page_kaddr);
+
+ /* fallback to copy_from_user outside mmap_sem */
+ if (unlikely(ret)) {
+ ret = -EFAULT;
+ *pagep = page;
+ /* don't free the page */
+ goto out;
+ }
+ } else {
+ page = *pagep;
+ *pagep = NULL;
+ }
+
+ /*
+ * The memory barrier inside __SetPageUptodate makes sure that
+ * preceding stores to the page contents become visible before
+ * the set_pte_at() write.
+ */
+ __SetPageUptodate(page);
+
+ ret = -ENOMEM;
+ if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg))
+ goto out_release;
+
+ _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
+ if (dst_vma->vm_flags & VM_WRITE)
+ _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
+
+ ret = -EEXIST;
+ dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+ if (!pte_none(*dst_pte))
+ goto out_release_uncharge_unlock;
+
+ inc_mm_counter(dst_mm, MM_ANONPAGES);
+ page_add_new_anon_rmap(page, dst_vma, dst_addr);
+ mem_cgroup_commit_charge(page, memcg, false);
+ lru_cache_add_active_or_unevictable(page, dst_vma);
+
+ set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(dst_vma, dst_addr, dst_pte);
+
+ pte_unmap_unlock(dst_pte, ptl);
+ ret = 0;
+out:
+ return ret;
+out_release_uncharge_unlock:
+ pte_unmap_unlock(dst_pte, ptl);
+ mem_cgroup_cancel_charge(page, memcg);
+out_release:
+ page_cache_release(page);
+ goto out;
+}
+
+static int mfill_zeropage_pte(struct mm_struct *dst_mm,
+ pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr)
+{
+ pte_t _dst_pte, *dst_pte;
+ spinlock_t *ptl;
+ int ret;
+
+ _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
+ dst_vma->vm_page_prot));
+ ret = -EEXIST;
+ dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+ if (!pte_none(*dst_pte))
+ goto out_unlock;
+ set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(dst_vma, dst_addr, dst_pte);
+ ret = 0;
+out_unlock:
+ pte_unmap_unlock(dst_pte, ptl);
+ return ret;
+}
+
+static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd = NULL;
+
+ pgd = pgd_offset(mm, address);
+ pud = pud_alloc(mm, pgd, address);
+ if (pud)
+ /*
+ * Note that pmd_alloc() runs whenever the pud exists,
+ * not only when the pmd was missing: *pmd may already be
+ * established, and it may even be a trans_huge_pmd.
+ */
+ pmd = pmd_alloc(mm, pud, address);
+ return pmd;
+}
+
+static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
+ unsigned long dst_start,
+ unsigned long src_start,
+ unsigned long len,
+ bool zeropage)
+{
+ struct vm_area_struct *dst_vma;
+ ssize_t err;
+ pmd_t *dst_pmd;
+ unsigned long src_addr, dst_addr;
+ long copied;
+ struct page *page;
+
+ /*
+ * Sanitize the command parameters:
+ */
+ BUG_ON(dst_start & ~PAGE_MASK);
+ BUG_ON(len & ~PAGE_MASK);
+
+ /* Does the address range wrap, or is the span zero-sized? */
+ BUG_ON(src_start + len <= src_start);
+ BUG_ON(dst_start + len <= dst_start);
+
+ src_addr = src_start;
+ dst_addr = dst_start;
+ copied = 0;
+ page = NULL;
+retry:
+ down_read(&dst_mm->mmap_sem);
+
+ /*
+ * Make sure the vma is not shared, and that the dst range
+ * is both valid and fully within a single existing vma.
+ */
+ err = -EINVAL;
+ dst_vma = find_vma(dst_mm, dst_start);
+ if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
+ goto out_unlock;
+ if (dst_start < dst_vma->vm_start ||
+ dst_start + len > dst_vma->vm_end)
+ goto out_unlock;
+
+ /*
+ * Be strict and only allow __mcopy_atomic on userfaultfd
+ * registered ranges to prevent userland errors going
+ * unnoticed. As far as the VM consistency is concerned, it
+ * would be perfectly safe to remove this check, but there's
+ * no useful usage for __mcopy_atomic outside of userfaultfd
+ * registered ranges. This is after all why these are ioctls
+ * belonging to the userfaultfd and not syscalls.
+ */
+ if (!dst_vma->vm_userfaultfd_ctx.ctx)
+ goto out_unlock;
+
+ /*
+ * FIXME: only allow copying on anonymous vmas, tmpfs should
+ * be added.
+ */
+ if (dst_vma->vm_ops)
+ goto out_unlock;
+
+ /*
+ * Ensure the dst_vma has an anon_vma or this page
+ * would get a NULL anon_vma when moved in the
+ * dst_vma.
+ */
+ err = -ENOMEM;
+ if (unlikely(anon_vma_prepare(dst_vma)))
+ goto out_unlock;
+
+ while (src_addr < src_start + len) {
+ pmd_t dst_pmdval;
+
+ BUG_ON(dst_addr >= dst_start + len);
+
+ dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
+ if (unlikely(!dst_pmd)) {
+ err = -ENOMEM;
+ break;
+ }
+
+ dst_pmdval = pmd_read_atomic(dst_pmd);
+ /*
+ * If the dst_pmd is mapped as THP don't
+ * override it and just be strict.
+ */
+ if (unlikely(pmd_trans_huge(dst_pmdval))) {
+ err = -EEXIST;
+ break;
+ }
+ if (unlikely(pmd_none(dst_pmdval)) &&
+ unlikely(__pte_alloc(dst_mm, dst_vma, dst_pmd,
+ dst_addr))) {
+ err = -ENOMEM;
+ break;
+ }
+ /* If a huge pmd materialized from under us, fail */
+ if (unlikely(pmd_trans_huge(*dst_pmd))) {
+ err = -EFAULT;
+ break;
+ }
+
+ BUG_ON(pmd_none(*dst_pmd));
+ BUG_ON(pmd_trans_huge(*dst_pmd));
+
+ if (!zeropage)
+ err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
+ dst_addr, src_addr, &page);
+ else
+ err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma,
+ dst_addr);
+
+ cond_resched();
+
+ if (unlikely(err == -EFAULT)) {
+ void *page_kaddr;
+
+ up_read(&dst_mm->mmap_sem);
+ BUG_ON(!page);
+
+ page_kaddr = kmap(page);
+ err = copy_from_user(page_kaddr,
+ (const void __user *) src_addr,
+ PAGE_SIZE);
+ kunmap(page);
+ if (unlikely(err)) {
+ err = -EFAULT;
+ goto out;
+ }
+ goto retry;
+ } else
+ BUG_ON(page);
+
+ if (!err) {
+ dst_addr += PAGE_SIZE;
+ src_addr += PAGE_SIZE;
+ copied += PAGE_SIZE;
+
+ if (fatal_signal_pending(current))
+ err = -EINTR;
+ }
+ if (err)
+ break;
+ }
+
+out_unlock:
+ up_read(&dst_mm->mmap_sem);
+out:
+ if (page)
+ page_cache_release(page);
+ BUG_ON(copied < 0);
+ BUG_ON(err > 0);
+ BUG_ON(!copied && !err);
+ return copied ? copied : err;
+}
+
+ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
+ unsigned long src_start, unsigned long len)
+{
+ return __mcopy_atomic(dst_mm, dst_start, src_start, len, false);
+}
+
+ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
+ unsigned long len)
+{
+ return __mcopy_atomic(dst_mm, start, 0, len, true);
+}
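
mcopy_atomic() above is the kernel backend of the UFFDIO_COPY ioctl added by this series. A hedged user-space sketch of the caller side follows; it assumes `uffd` is a userfaultfd descriptor whose owner has already registered the destination range with UFFDIO_REGISTER, and uffd_copy() is an illustrative helper name.

#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <string.h>

/* Resolve a fault at dst by copying len bytes of page-aligned data from src. */
static int uffd_copy(int uffd, unsigned long dst, void *src, unsigned long len)
{
	struct uffdio_copy copy;

	memset(&copy, 0, sizeof(copy));
	copy.dst = dst;                  /* page-aligned faulting address */
	copy.src = (unsigned long)src;   /* page-aligned source buffer */
	copy.len = len;                  /* multiple of the page size */
	copy.mode = 0;

	/* on success the kernel reports the copied byte count in copy.copy */
	return ioctl(uffd, UFFDIO_COPY, &copy);
}
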
diff --git a/mm/util.c b/mm/util.c
index 68ff8a5361e7..c7434060039b 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -3,6 +3,7 @@
#include <linux/string.h>
#include <linux/compiler.h>
#include <linux/export.h>
+#include <linux/ctype.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/security.h>
@@ -100,6 +101,35 @@ char *kstrndup(const char *s, size_t max, gfp_t gfp)
EXPORT_SYMBOL(kstrndup);
/**
+ * kstrimdup - Trim and copy a %NUL terminated string.
+ * @s: the string to trim and duplicate
+ * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Returns an address, which the caller must kfree, containing
+ * a duplicate of the passed string with leading and/or trailing
+ * whitespace (as defined by isspace) removed.
+ */
+char *kstrimdup(const char *s, gfp_t gfp)
+{
+ char *buf;
+ char *begin = skip_spaces(s);
+ size_t len = strlen(begin);
+
+ while (len && isspace(begin[len - 1]))
+ len--;
+
+ buf = kmalloc_track_caller(len + 1, gfp);
+ if (!buf)
+ return NULL;
+
+ memcpy(buf, begin, len);
+ buf[len] = '\0';
+
+ return buf;
+}
+EXPORT_SYMBOL(kstrimdup);
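
A hedged caller sketch for kstrimdup(); the helper name and the sysfs-style input are illustrative only.

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/string.h>

/* Hypothetical helper: store user-supplied text without surrounding whitespace. */
static char *dup_setting(const char *buf)
{
	char *val = kstrimdup(buf, GFP_KERNEL);   /* "  lzo\n" becomes "lzo" */

	if (!val)
		return ERR_PTR(-ENOMEM);
	return val;                               /* caller must kfree(val) */
}
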
+
+/**
* kmemdup - duplicate region of memory
*
* @src: memory region to duplicate
@@ -355,7 +385,9 @@ struct anon_vma *page_anon_vma(struct page *page)
struct address_space *page_mapping(struct page *page)
{
- unsigned long mapping;
+ struct address_space *mapping;
+
+ page = compound_head(page);
/* This happens if someone calls flush_dcache_page on slab page */
if (unlikely(PageSlab(page)))
@@ -368,10 +400,10 @@ struct address_space *page_mapping(struct page *page)
return swap_address_space(entry);
}
- mapping = (unsigned long)page->mapping;
- if (mapping & PAGE_MAPPING_FLAGS)
+ mapping = page->mapping;
+ if ((unsigned long)mapping & PAGE_MAPPING_FLAGS)
return NULL;
- return page->mapping;
+ return mapping;
}
int overcommit_ratio_handler(struct ctl_table *table, int write,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8286938c70de..110733a715f6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -36,7 +36,7 @@
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
-#include <linux/rwsem.h>
+#include <linux/srcu.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
@@ -146,8 +146,9 @@ int vm_swappiness = 60;
*/
unsigned long vm_total_pages;
+DEFINE_STATIC_SRCU(shrinker_srcu);
static LIST_HEAD(shrinker_list);
-static DECLARE_RWSEM(shrinker_rwsem);
+static DEFINE_SPINLOCK(shrinker_list_lock);
#ifdef CONFIG_MEMCG
static bool global_reclaim(struct scan_control *sc)
@@ -175,7 +176,7 @@ static bool sane_reclaim(struct scan_control *sc)
if (!memcg)
return true;
#ifdef CONFIG_CGROUP_WRITEBACK
- if (cgroup_on_dfl(mem_cgroup_css(memcg)->cgroup))
+ if (memcg->css.cgroup)
return true;
#endif
return false;
@@ -242,9 +243,9 @@ int register_shrinker(struct shrinker *shrinker)
if (!shrinker->nr_deferred)
return -ENOMEM;
- down_write(&shrinker_rwsem);
- list_add_tail(&shrinker->list, &shrinker_list);
- up_write(&shrinker_rwsem);
+ spin_lock(&shrinker_list_lock);
+ list_add_tail_rcu(&shrinker->list, &shrinker_list);
+ spin_unlock(&shrinker_list_lock);
return 0;
}
EXPORT_SYMBOL(register_shrinker);
@@ -254,9 +255,14 @@ EXPORT_SYMBOL(register_shrinker);
*/
void unregister_shrinker(struct shrinker *shrinker)
{
- down_write(&shrinker_rwsem);
- list_del(&shrinker->list);
- up_write(&shrinker_rwsem);
+ spin_lock(&shrinker_list_lock);
+ list_del_rcu(&shrinker->list);
+ spin_unlock(&shrinker_list_lock);
+ /*
+ * Before freeing nr_deferred, ensure all srcu
+ * readers are done with their critical region.
+ */
+ synchronize_srcu(&shrinker_srcu);
kfree(shrinker->nr_deferred);
}
EXPORT_SYMBOL(unregister_shrinker);
@@ -408,6 +414,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
unsigned long nr_scanned,
unsigned long nr_eligible)
{
+ int idx;
struct shrinker *shrinker;
unsigned long freed = 0;
@@ -417,18 +424,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
if (nr_scanned == 0)
nr_scanned = SWAP_CLUSTER_MAX;
- if (!down_read_trylock(&shrinker_rwsem)) {
- /*
- * If we would return 0, our callers would understand that we
- * have nothing else to shrink and give up trying. By returning
- * 1 we keep it going and assume we'll be able to shrink next
- * time.
- */
- freed = 1;
- goto out;
- }
+ idx = srcu_read_lock(&shrinker_srcu);
- list_for_each_entry(shrinker, &shrinker_list, list) {
+ list_for_each_entry_rcu(shrinker, &shrinker_list, list) {
struct shrink_control sc = {
.gfp_mask = gfp_mask,
.nid = nid,
@@ -444,8 +442,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible);
}
- up_read(&shrinker_rwsem);
-out:
+ srcu_read_unlock(&shrinker_srcu, idx);
cond_resched();
return freed;
}
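
The three vmscan hunks above replace shrinker_rwsem with an SRCU-protected list: writers take a spinlock and use the RCU list helpers, walkers run under srcu_read_lock(), and unregister waits with synchronize_srcu() before anything the walkers might still reference is freed. A minimal self-contained sketch of that pattern, with illustrative names only:

#include <linux/rculist.h>
#include <linux/spinlock.h>
#include <linux/srcu.h>

struct demo_item { struct list_head list; };

DEFINE_STATIC_SRCU(demo_srcu);
static LIST_HEAD(demo_list);
static DEFINE_SPINLOCK(demo_lock);

static void demo_add(struct demo_item *it)
{
	spin_lock(&demo_lock);
	list_add_tail_rcu(&it->list, &demo_list);
	spin_unlock(&demo_lock);
}

static void demo_del(struct demo_item *it)
{
	spin_lock(&demo_lock);
	list_del_rcu(&it->list);
	spin_unlock(&demo_lock);
	/* wait for in-flight readers before freeing what they may reference */
	synchronize_srcu(&demo_srcu);
}

static void demo_walk(void (*fn)(struct demo_item *))
{
	struct demo_item *it;
	int idx = srcu_read_lock(&demo_srcu);

	list_for_each_entry_rcu(it, &demo_list, list)
		fn(it);
	srcu_read_unlock(&demo_srcu, idx);
}
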
@@ -791,13 +788,17 @@ enum page_references {
};
static enum page_references page_check_references(struct page *page,
- struct scan_control *sc)
+ struct scan_control *sc,
+ bool *freeable)
{
int referenced_ptes, referenced_page;
unsigned long vm_flags;
+ int pte_dirty;
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
- &vm_flags);
+ &vm_flags, &pte_dirty);
referenced_page = TestClearPageReferenced(page);
/*
@@ -838,6 +839,10 @@ static enum page_references page_check_references(struct page *page,
return PAGEREF_KEEP;
}
+ if (PageAnon(page) && !pte_dirty && !PageSwapCache(page) &&
+ !PageDirty(page))
+ *freeable = true;
+
/* Reclaim if clean, defer dirty pages to writeback */
if (referenced_page && !PageSwapBacked(page))
return PAGEREF_RECLAIM_CLEAN;
@@ -906,6 +911,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
int may_enter_fs;
enum page_references references = PAGEREF_RECLAIM_CLEAN;
bool dirty, writeback;
+ bool freeable = false;
cond_resched();
@@ -1025,7 +1031,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
}
if (!force_reclaim)
- references = page_check_references(page, sc);
+ references = page_check_references(page, sc,
+ &freeable);
switch (references) {
case PAGEREF_ACTIVATE:
@@ -1042,22 +1049,31 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* Try to allocate it some swap space here.
*/
if (PageAnon(page) && !PageSwapCache(page)) {
- if (!(sc->gfp_mask & __GFP_IO))
- goto keep_locked;
- if (!add_to_swap(page, page_list))
- goto activate_locked;
- may_enter_fs = 1;
-
- /* Adding to swap updated mapping */
- mapping = page_mapping(page);
+ if (!freeable) {
+ if (!(sc->gfp_mask & __GFP_IO))
+ goto keep_locked;
+ if (!add_to_swap(page, page_list))
+ goto activate_locked;
+ may_enter_fs = 1;
+ /* Adding to swap updated mapping */
+ mapping = page_mapping(page);
+ } else {
+ if (likely(!PageTransHuge(page)))
+ goto unmap;
+ /* try_to_unmap() isn't aware of THP pages */
+ if (unlikely(split_huge_page_to_list(page,
+ page_list)))
+ goto keep_locked;
+ }
}
-
+unmap:
/*
* The page is mapped into the page tables of one or more
* processes. Try to unmap it here.
*/
- if (page_mapped(page) && mapping) {
- switch (try_to_unmap(page, ttu_flags)) {
+ if (page_mapped(page) && (mapping || freeable)) {
+ switch (try_to_unmap(page, freeable ?
+ TTU_FREE : ttu_flags|TTU_BATCH_FLUSH)) {
case SWAP_FAIL:
goto activate_locked;
case SWAP_AGAIN:
@@ -1065,7 +1081,20 @@ static unsigned long shrink_page_list(struct list_head *page_list,
case SWAP_MLOCK:
goto cull_mlocked;
case SWAP_SUCCESS:
- ; /* try to free the page below */
+ /* try to free the page below */
+ if (!freeable)
+ break;
+ /*
+ * A freeable anon page has no mapping because the
+ * swapcache was skipped, so free the page here rather
+ * than in __remove_mapping().
+ */
+ VM_BUG_ON_PAGE(PageSwapCache(page), page);
+ if (!page_freeze_refs(page, 1))
+ goto keep_locked;
+ __ClearPageLocked(page);
+ count_vm_event(PGLAZYFREED);
+ goto free_it;
}
}
@@ -1097,7 +1126,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
if (!sc->may_writepage)
goto keep_locked;
- /* Page is dirty, try to write it out here */
+ /*
+ * Page is dirty. Flush the TLB if a writable entry
+ * potentially exists, to avoid CPU writes after I/O
+ * starts, and then write the page out here.
+ */
+ try_to_unmap_flush_dirty();
switch (pageout(page, mapping, sc)) {
case PAGE_KEEP:
goto keep_locked;
@@ -1175,7 +1209,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* we obviously don't have to worry about waking up a process
* waiting on the page lock, because there are no references.
*/
- __clear_page_locked(page);
+ __ClearPageLocked(page);
free_it:
nr_reclaimed++;
@@ -1190,7 +1224,7 @@ cull_mlocked:
if (PageSwapCache(page))
try_to_free_swap(page);
unlock_page(page);
- putback_lru_page(page);
+ list_add(&page->lru, &ret_pages);
continue;
activate_locked:
@@ -1208,6 +1242,7 @@ keep:
}
mem_cgroup_uncharge_list(&free_pages);
+ try_to_unmap_flush();
free_hot_cold_page_list(&free_pages, true);
list_splice(&ret_pages, page_list);
@@ -1352,7 +1387,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
unsigned long nr_taken = 0;
unsigned long scan;
- for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
+ for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
+ !list_empty(src); scan++) {
struct page *page;
int nr_pages;
@@ -1434,6 +1470,32 @@ int isolate_lru_page(struct page *page)
return ret;
}
+static int __too_many_isolated(struct zone *zone, int file,
+ struct scan_control *sc, int safe)
+{
+ unsigned long inactive, isolated;
+
+ if (safe) {
+ inactive = zone_page_state_snapshot(zone,
+ NR_INACTIVE_ANON + 2 * file);
+ isolated = zone_page_state_snapshot(zone,
+ NR_ISOLATED_ANON + file);
+ } else {
+ inactive = zone_page_state(zone, NR_INACTIVE_ANON + 2 * file);
+ isolated = zone_page_state(zone, NR_ISOLATED_ANON + file);
+ }
+
+ /*
+ * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
+ * won't get blocked by normal direct-reclaimers, forming a circular
+ * deadlock.
+ */
+ if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
+ inactive >>= 3;
+
+ return isolated > inactive;
+}
+
/*
* A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
* then get rescheduled. When there are a massive number of tasks doing page
@@ -1442,33 +1504,24 @@ int isolate_lru_page(struct page *page)
* unnecessary swapping, thrashing and OOM.
*/
static int too_many_isolated(struct zone *zone, int file,
- struct scan_control *sc)
+ struct scan_control *sc)
{
- unsigned long inactive, isolated;
-
if (current_is_kswapd())
return 0;
if (!sane_reclaim(sc))
return 0;
- if (file) {
- inactive = zone_page_state(zone, NR_INACTIVE_FILE);
- isolated = zone_page_state(zone, NR_ISOLATED_FILE);
- } else {
- inactive = zone_page_state(zone, NR_INACTIVE_ANON);
- isolated = zone_page_state(zone, NR_ISOLATED_ANON);
- }
-
/*
- * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
- * won't get blocked by normal direct-reclaimers, forming a circular
- * deadlock.
+ * __too_many_isolated(safe=0) is fast but inaccurate, because it
+ * doesn't account for the vm_stat_diff[] counters. So if it looks
+ * like too_many_isolated() is about to return true, fall back to the
+ * slower, more accurate zone_page_state_snapshot().
*/
- if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
- inactive >>= 3;
+ if (unlikely(__too_many_isolated(zone, file, sc, 0)))
+ return __too_many_isolated(zone, file, sc, 1);
- return isolated > inactive;
+ return 0;
}
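
The comment above describes a two-pass check: a cheap, possibly stale estimate gates the expensive exact read. Stripped of the vmscan specifics, the shape is roughly the following (illustrative code, not part of the patch):

#include <stdbool.h>

static bool over_limit(unsigned long cheap_estimate, unsigned long limit,
		       unsigned long (*exact)(void))
{
	if (cheap_estimate <= limit)
		return false;          /* fast path: clearly under the limit */
	return exact() > limit;        /* slow path: confirm with exact counters */
}
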
static noinline_for_stack void
@@ -1805,7 +1858,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
}
if (page_referenced(page, 0, sc->target_mem_cgroup,
- &vm_flags)) {
+ &vm_flags, NULL)) {
nr_rotated += hpage_nr_pages(page);
/*
* Identify referenced, file-backed active pages and
@@ -2151,6 +2204,23 @@ out:
}
}
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+static void init_tlb_ubc(void)
+{
+ /*
+ * This deliberately does not clear the cpumask as it's expensive
+ * and unnecessary. If there happens to be data in there then the
+ * first SWAP_CLUSTER_MAX pages will send an unnecessary IPI and
+ * the cpumask will then be cleared.
+ */
+ current->tlb_ubc.flush_required = false;
+}
+#else
+static inline void init_tlb_ubc(void)
+{
+}
+#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
+
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
@@ -2185,6 +2255,8 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
sc->priority == DEF_PRIORITY);
+ init_tlb_ubc();
+
blk_start_plug(&plug);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
nr[LRU_INACTIVE_FILE]) {
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4f5cd974e11a..1fd0886a389f 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -759,6 +759,7 @@ const char * const vmstat_text[] = {
"pgfault",
"pgmajfault",
+ "pglazyfreed",
TEXTS_FOR_ZONES("pgrefill")
TEXTS_FOR_ZONES("pgsteal_kswapd")
diff --git a/mm/zbud.c b/mm/zbud.c
index f3bf6f7627d8..fa48bcdff9d5 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -96,10 +96,10 @@ struct zbud_pool {
struct list_head buddied;
struct list_head lru;
u64 pages_nr;
- struct zbud_ops *ops;
+ const struct zbud_ops *ops;
#ifdef CONFIG_ZPOOL
struct zpool *zpool;
- struct zpool_ops *zpool_ops;
+ const struct zpool_ops *zpool_ops;
#endif
};
@@ -133,12 +133,12 @@ static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle)
return -ENOENT;
}
-static struct zbud_ops zbud_zpool_ops = {
+static const struct zbud_ops zbud_zpool_ops = {
.evict = zbud_zpool_evict
};
static void *zbud_zpool_create(char *name, gfp_t gfp,
- struct zpool_ops *zpool_ops,
+ const struct zpool_ops *zpool_ops,
struct zpool *zpool)
{
struct zbud_pool *pool;
@@ -302,7 +302,7 @@ static int num_free_chunks(struct zbud_header *zhdr)
* Return: pointer to the new zbud pool or NULL if the metadata allocation
* failed.
*/
-struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops)
+struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops)
{
struct zbud_pool *pool;
int i;
diff --git a/mm/zpool.c b/mm/zpool.c
index 722a4f60e90b..d8cf7cdece9a 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -22,7 +22,7 @@ struct zpool {
struct zpool_driver *driver;
void *pool;
- struct zpool_ops *ops;
+ const struct zpool_ops *ops;
struct list_head list;
};
@@ -100,6 +100,39 @@ static void zpool_put_driver(struct zpool_driver *driver)
}
/**
+ * zpool_has_pool() - Check if the pool driver is available
+ * @type The type of the zpool to check (e.g. zbud, zsmalloc)
+ *
+ * This checks if the @type pool driver is available. This will try to load
+ * the requested module, if needed, but there is no guarantee the module will
+ * still be loaded and available immediately after calling. If this returns
+ * true, the caller should assume the pool is available, but must be prepared
+ * to handle @zpool_create_pool() returning failure. However, if this
+ * returns false, the caller should assume the requested pool type is not
+ * available; either the requested pool type module does not exist, or could
+ * not be loaded, and calling @zpool_create_pool() with the pool type will
+ * fail.
+ *
+ * Returns: true if @type pool is available, false if not
+ */
+bool zpool_has_pool(char *type)
+{
+ struct zpool_driver *driver = zpool_get_driver(type);
+
+ if (!driver) {
+ request_module("zpool-%s", type);
+ driver = zpool_get_driver(type);
+ }
+
+ if (!driver)
+ return false;
+
+ zpool_put_driver(driver);
+ return true;
+}
+EXPORT_SYMBOL(zpool_has_pool);
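
A hedged caller sketch for zpool_has_pool(); the ops stub and pool name are illustrative. As the kernel-doc above notes, a true result only means the driver is loadable now, so the subsequent create can still fail and must be checked.

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/zpool.h>

static int demo_evict(struct zpool *pool, unsigned long handle)
{
	return -EINVAL;                        /* illustrative no-op eviction */
}

static const struct zpool_ops demo_ops = { .evict = demo_evict };

static struct zpool *demo_make_pool(char *type)
{
	if (!zpool_has_pool(type))
		return NULL;                   /* driver definitely unavailable */
	/* may still fail, e.g. if the module went away in the meantime */
	return zpool_create_pool(type, "demo", GFP_KERNEL, &demo_ops);
}
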
+
+/**
* zpool_create_pool() - Create a new zpool
* @type The type of the zpool to create (e.g. zbud, zsmalloc)
* @name The name of the zpool (e.g. zram0, zswap)
@@ -115,7 +148,7 @@ static void zpool_put_driver(struct zpool_driver *driver)
* Returns: New zpool on success, NULL on failure.
*/
struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
- struct zpool_ops *ops)
+ const struct zpool_ops *ops)
{
struct zpool_driver *driver;
struct zpool *zpool;
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 0a7f81aa2249..f135b1b6fcdc 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -169,14 +169,12 @@ enum zs_stat_type {
NR_ZS_STAT_TYPE,
};
-#ifdef CONFIG_ZSMALLOC_STAT
-
-static struct dentry *zs_stat_root;
-
struct zs_size_stat {
unsigned long objs[NR_ZS_STAT_TYPE];
};
+#ifdef CONFIG_ZSMALLOC_STAT
+static struct dentry *zs_stat_root;
#endif
/*
@@ -201,6 +199,8 @@ static int zs_size_classes;
static const int fullness_threshold_frac = 4;
struct size_class {
+ spinlock_t lock;
+ struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
/*
* Size of objects stored in this class. Must be multiple
* of ZS_ALIGN.
@@ -210,16 +210,10 @@ struct size_class {
/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
int pages_per_zspage;
- /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
- bool huge;
-
-#ifdef CONFIG_ZSMALLOC_STAT
struct zs_size_stat stats;
-#endif
-
- spinlock_t lock;
- struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
+ /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
+ bool huge;
};
/*
@@ -251,6 +245,15 @@ struct zs_pool {
gfp_t flags; /* allocation flags used when growing pool */
atomic_long_t pages_allocated;
+ struct zs_pool_stats stats;
+
+ /* Compact classes */
+ struct shrinker shrinker;
+ /*
+ * To signify that register_shrinker() was successful
+ * and unregister_shrinker() will not Oops.
+ */
+ bool shrinker_enabled;
#ifdef CONFIG_ZSMALLOC_STAT
struct dentry *stat_dentry;
#endif
@@ -285,8 +288,7 @@ static int create_handle_cache(struct zs_pool *pool)
static void destroy_handle_cache(struct zs_pool *pool)
{
- if (pool->handle_cachep)
- kmem_cache_destroy(pool->handle_cachep);
+ kmem_cache_destroy(pool->handle_cachep);
}
static unsigned long alloc_handle(struct zs_pool *pool)
@@ -309,7 +311,8 @@ static void record_obj(unsigned long handle, unsigned long obj)
#ifdef CONFIG_ZPOOL
-static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops,
+static void *zs_zpool_create(char *name, gfp_t gfp,
+ const struct zpool_ops *zpool_ops,
struct zpool *zpool)
{
return zs_create_pool(name, gfp);
@@ -441,8 +444,6 @@ static int get_size_class_index(int size)
return min(zs_size_classes - 1, idx);
}
-#ifdef CONFIG_ZSMALLOC_STAT
-
static inline void zs_stat_inc(struct size_class *class,
enum zs_stat_type type, unsigned long cnt)
{
@@ -461,6 +462,8 @@ static inline unsigned long zs_stat_get(struct size_class *class,
return class->stats.objs[type];
}
+#ifdef CONFIG_ZSMALLOC_STAT
+
static int __init zs_stat_init(void)
{
if (!debugfs_initialized())
@@ -576,23 +579,6 @@ static void zs_pool_stat_destroy(struct zs_pool *pool)
}
#else /* CONFIG_ZSMALLOC_STAT */
-
-static inline void zs_stat_inc(struct size_class *class,
- enum zs_stat_type type, unsigned long cnt)
-{
-}
-
-static inline void zs_stat_dec(struct size_class *class,
- enum zs_stat_type type, unsigned long cnt)
-{
-}
-
-static inline unsigned long zs_stat_get(struct size_class *class,
- enum zs_stat_type type)
-{
- return 0;
-}
-
static int __init zs_stat_init(void)
{
return 0;
@@ -610,7 +596,6 @@ static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
static inline void zs_pool_stat_destroy(struct zs_pool *pool)
{
}
-
#endif
@@ -658,13 +643,22 @@ static void insert_zspage(struct page *page, struct size_class *class,
if (fullness >= _ZS_NR_FULLNESS_GROUPS)
return;
- head = &class->fullness_list[fullness];
- if (*head)
- list_add_tail(&page->lru, &(*head)->lru);
-
- *head = page;
zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ?
CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
+
+ head = &class->fullness_list[fullness];
+ if (!*head) {
+ *head = page;
+ return;
+ }
+
+ /*
+ * We want to see more ZS_FULL pages and fewer almost
+ * empty/full ones. Put pages with higher ->inuse first.
+ */
+ list_add_tail(&page->lru, &(*head)->lru);
+ if (page->inuse >= (*head)->inuse)
+ *head = page;
}
/*
@@ -1495,7 +1489,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
}
EXPORT_SYMBOL_GPL(zs_free);
-static void zs_object_copy(unsigned long src, unsigned long dst,
+static void zs_object_copy(unsigned long dst, unsigned long src,
struct size_class *class)
{
struct page *s_page, *d_page;
@@ -1602,8 +1596,6 @@ struct zs_compact_control {
/* Starting object index within @s_page which is used for live objects
* in the subpage. */
int index;
- /* how many of objects are migrated */
- int nr_migrated;
};
static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
@@ -1614,7 +1606,6 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
struct page *s_page = cc->s_page;
struct page *d_page = cc->d_page;
unsigned long index = cc->index;
- int nr_migrated = 0;
int ret = 0;
while (1) {
@@ -1636,23 +1627,21 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
used_obj = handle_to_obj(handle);
free_obj = obj_malloc(d_page, class, handle);
- zs_object_copy(used_obj, free_obj, class);
+ zs_object_copy(free_obj, used_obj, class);
index++;
record_obj(handle, free_obj);
unpin_tag(handle);
obj_free(pool, class, used_obj);
- nr_migrated++;
}
/* Remember last position in this iteration */
cc->s_page = s_page;
cc->index = index;
- cc->nr_migrated = nr_migrated;
return ret;
}
-static struct page *alloc_target_page(struct size_class *class)
+static struct page *isolate_target_page(struct size_class *class)
{
int i;
struct page *page;
@@ -1668,8 +1657,17 @@ static struct page *alloc_target_page(struct size_class *class)
return page;
}
-static void putback_zspage(struct zs_pool *pool, struct size_class *class,
- struct page *first_page)
+/*
+ * putback_zspage - add @first_page into the right class's fullness list
+ * @pool: target pool
+ * @class: destination class
+ * @first_page: target page
+ *
+ * Return @first_page's fullness_group
+ */
+static enum fullness_group putback_zspage(struct zs_pool *pool,
+ struct size_class *class,
+ struct page *first_page)
{
enum fullness_group fullness;
@@ -1687,50 +1685,72 @@ static void putback_zspage(struct zs_pool *pool, struct size_class *class,
free_zspage(first_page);
}
+
+ return fullness;
}
static struct page *isolate_source_page(struct size_class *class)
{
- struct page *page;
+ int i;
+ struct page *page = NULL;
- page = class->fullness_list[ZS_ALMOST_EMPTY];
- if (page)
- remove_zspage(page, class, ZS_ALMOST_EMPTY);
+ for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) {
+ page = class->fullness_list[i];
+ if (!page)
+ continue;
+
+ remove_zspage(page, class, i);
+ break;
+ }
return page;
}
-static unsigned long __zs_compact(struct zs_pool *pool,
- struct size_class *class)
+/*
+ * Based on the number of unused allocated objects, calculate
+ * and return the number of pages that we can free.
+ */
+static unsigned long zs_can_compact(struct size_class *class)
+{
+ unsigned long obj_wasted;
+
+ obj_wasted = zs_stat_get(class, OBJ_ALLOCATED) -
+ zs_stat_get(class, OBJ_USED);
+
+ obj_wasted /= get_maxobj_per_zspage(class->size,
+ class->pages_per_zspage);
+
+ return obj_wasted * class->pages_per_zspage;
+}
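
A hedged worked example of the calculation above, using made-up class numbers; note that the integer division rounds the wasted-object count down to whole zspages.

#include <stdio.h>

int main(void)
{
	unsigned long allocated = 400;        /* OBJ_ALLOCATED for the class */
	unsigned long used = 310;             /* OBJ_USED for the class */
	unsigned long maxobj_per_zspage = 4;  /* assumed objects per zspage */
	unsigned long pages_per_zspage = 3;   /* assumed pages per zspage */
	unsigned long obj_wasted = (allocated - used) / maxobj_per_zspage;

	/* 90 unused objects / 4 per zspage = 22 zspages -> 66 freeable pages */
	printf("%lu pages\n", obj_wasted * pages_per_zspage);
	return 0;
}
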
+
+static void __zs_compact(struct zs_pool *pool, struct size_class *class)
{
- int nr_to_migrate;
struct zs_compact_control cc;
struct page *src_page;
struct page *dst_page = NULL;
- unsigned long nr_total_migrated = 0;
spin_lock(&class->lock);
while ((src_page = isolate_source_page(class))) {
BUG_ON(!is_first_page(src_page));
- /* The goal is to migrate all live objects in source page */
- nr_to_migrate = src_page->inuse;
+ if (!zs_can_compact(class))
+ break;
+
cc.index = 0;
cc.s_page = src_page;
- while ((dst_page = alloc_target_page(class))) {
+ while ((dst_page = isolate_target_page(class))) {
cc.d_page = dst_page;
/*
- * If there is no more space in dst_page, try to
- * allocate another zspage.
+ * If there is no more space in dst_page, resched
+ * and see if anyone has allocated another zspage.
*/
if (!migrate_zspage(pool, class, &cc))
break;
putback_zspage(pool, class, dst_page);
- nr_total_migrated += cc.nr_migrated;
- nr_to_migrate -= cc.nr_migrated;
}
/* Stop if we couldn't find slot */
@@ -1738,9 +1758,9 @@ static unsigned long __zs_compact(struct zs_pool *pool,
break;
putback_zspage(pool, class, dst_page);
- putback_zspage(pool, class, src_page);
+ if (putback_zspage(pool, class, src_page) == ZS_EMPTY)
+ pool->stats.pages_compacted += class->pages_per_zspage;
spin_unlock(&class->lock);
- nr_total_migrated += cc.nr_migrated;
cond_resched();
spin_lock(&class->lock);
}
@@ -1749,14 +1769,11 @@ static unsigned long __zs_compact(struct zs_pool *pool,
putback_zspage(pool, class, src_page);
spin_unlock(&class->lock);
-
- return nr_total_migrated;
}
unsigned long zs_compact(struct zs_pool *pool)
{
int i;
- unsigned long nr_migrated = 0;
struct size_class *class;
for (i = zs_size_classes - 1; i >= 0; i--) {
@@ -1765,13 +1782,80 @@ unsigned long zs_compact(struct zs_pool *pool)
continue;
if (class->index != i)
continue;
- nr_migrated += __zs_compact(pool, class);
+ __zs_compact(pool, class);
}
- return nr_migrated;
+ return pool->stats.pages_compacted;
}
EXPORT_SYMBOL_GPL(zs_compact);
+void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats)
+{
+ memcpy(stats, &pool->stats, sizeof(struct zs_pool_stats));
+}
+EXPORT_SYMBOL_GPL(zs_pool_stats);
+
+static unsigned long zs_shrinker_scan(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ unsigned long pages_freed;
+ struct zs_pool *pool = container_of(shrinker, struct zs_pool,
+ shrinker);
+
+ pages_freed = pool->stats.pages_compacted;
+ /*
+ * Compact classes and calculate compaction delta.
+ * Can run concurrently with a manually triggered
+ * (by user) compaction.
+ */
+ pages_freed = zs_compact(pool) - pages_freed;
+
+ return pages_freed ? pages_freed : SHRINK_STOP;
+}
+
+static unsigned long zs_shrinker_count(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ int i;
+ struct size_class *class;
+ unsigned long pages_to_free = 0;
+ struct zs_pool *pool = container_of(shrinker, struct zs_pool,
+ shrinker);
+
+ if (!pool->shrinker_enabled)
+ return 0;
+
+ for (i = zs_size_classes - 1; i >= 0; i--) {
+ class = pool->size_class[i];
+ if (!class)
+ continue;
+ if (class->index != i)
+ continue;
+
+ pages_to_free += zs_can_compact(class);
+ }
+
+ return pages_to_free;
+}
+
+static void zs_unregister_shrinker(struct zs_pool *pool)
+{
+ if (pool->shrinker_enabled) {
+ unregister_shrinker(&pool->shrinker);
+ pool->shrinker_enabled = false;
+ }
+}
+
+static int zs_register_shrinker(struct zs_pool *pool)
+{
+ pool->shrinker.scan_objects = zs_shrinker_scan;
+ pool->shrinker.count_objects = zs_shrinker_count;
+ pool->shrinker.batch = 0;
+ pool->shrinker.seeks = DEFAULT_SEEKS;
+
+ return register_shrinker(&pool->shrinker);
+}
+
/**
* zs_create_pool - Creates an allocation pool to work from.
* @flags: allocation flags used to allocate pool metadata
@@ -1857,6 +1941,12 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags)
if (zs_pool_stat_create(name, pool))
goto err;
+ /*
+ * Not critical: we can still use the pool, and the
+ * user can trigger compaction manually.
+ */
+ if (zs_register_shrinker(pool) == 0)
+ pool->shrinker_enabled = true;
return pool;
err:
@@ -1869,6 +1959,7 @@ void zs_destroy_pool(struct zs_pool *pool)
{
int i;
+ zs_unregister_shrinker(pool);
zs_pool_stat_destroy(pool);
for (i = 0; i < zs_size_classes; i++) {
diff --git a/mm/zswap.c b/mm/zswap.c
index 2d5727baed59..b198081c2eed 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -80,85 +80,54 @@ static u64 zswap_duplicate_entry;
static bool zswap_enabled;
module_param_named(enabled, zswap_enabled, bool, 0644);
-/* Compressor to be used by zswap (fixed at boot for now) */
+/* Crypto compressor to use */
#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
-static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
-module_param_named(compressor, zswap_compressor, charp, 0444);
-
-/* The maximum percentage of memory that the compressed pool can occupy */
-static unsigned int zswap_max_pool_percent = 20;
-module_param_named(max_pool_percent,
- zswap_max_pool_percent, uint, 0644);
+static char zswap_compressor[CRYPTO_MAX_ALG_NAME] = ZSWAP_COMPRESSOR_DEFAULT;
+static struct kparam_string zswap_compressor_kparam = {
+ .string = zswap_compressor,
+ .maxlen = sizeof(zswap_compressor),
+};
+static int zswap_compressor_param_set(const char *,
+ const struct kernel_param *);
+static struct kernel_param_ops zswap_compressor_param_ops = {
+ .set = zswap_compressor_param_set,
+ .get = param_get_string,
+};
+module_param_cb(compressor, &zswap_compressor_param_ops,
+ &zswap_compressor_kparam, 0644);
-/* Compressed storage to use */
+/* Compressed storage zpool to use */
#define ZSWAP_ZPOOL_DEFAULT "zbud"
-static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
-module_param_named(zpool, zswap_zpool_type, charp, 0444);
+static char zswap_zpool_type[32 /* arbitrary */] = ZSWAP_ZPOOL_DEFAULT;
+static struct kparam_string zswap_zpool_kparam = {
+ .string = zswap_zpool_type,
+ .maxlen = sizeof(zswap_zpool_type),
+};
+static int zswap_zpool_param_set(const char *, const struct kernel_param *);
+static struct kernel_param_ops zswap_zpool_param_ops = {
+ .set = zswap_zpool_param_set,
+ .get = param_get_string,
+};
+module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_kparam, 0644);
-/* zpool is shared by all of zswap backend */
-static struct zpool *zswap_pool;
+/* The maximum percentage of memory that the compressed pool can occupy */
+static unsigned int zswap_max_pool_percent = 20;
+module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);
/*********************************
-* compression functions
+* data structures
**********************************/
-/* per-cpu compression transforms */
-static struct crypto_comp * __percpu *zswap_comp_pcpu_tfms;
-enum comp_op {
- ZSWAP_COMPOP_COMPRESS,
- ZSWAP_COMPOP_DECOMPRESS
+struct zswap_pool {
+ struct zpool *zpool;
+ struct kref kref;
+ struct list_head list;
+ struct rcu_head rcu_head;
+ struct notifier_block notifier;
+ char tfm_name[CRYPTO_MAX_ALG_NAME];
+ struct crypto_comp * __percpu *tfm;
};
-static int zswap_comp_op(enum comp_op op, const u8 *src, unsigned int slen,
- u8 *dst, unsigned int *dlen)
-{
- struct crypto_comp *tfm;
- int ret;
-
- tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, get_cpu());
- switch (op) {
- case ZSWAP_COMPOP_COMPRESS:
- ret = crypto_comp_compress(tfm, src, slen, dst, dlen);
- break;
- case ZSWAP_COMPOP_DECOMPRESS:
- ret = crypto_comp_decompress(tfm, src, slen, dst, dlen);
- break;
- default:
- ret = -EINVAL;
- }
-
- put_cpu();
- return ret;
-}
-
-static int __init zswap_comp_init(void)
-{
- if (!crypto_has_comp(zswap_compressor, 0, 0)) {
- pr_info("%s compressor not available\n", zswap_compressor);
- /* fall back to default compressor */
- zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
- if (!crypto_has_comp(zswap_compressor, 0, 0))
- /* can't even load the default compressor */
- return -ENODEV;
- }
- pr_info("using %s compressor\n", zswap_compressor);
-
- /* alloc percpu transforms */
- zswap_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *);
- if (!zswap_comp_pcpu_tfms)
- return -ENOMEM;
- return 0;
-}
-
-static void __init zswap_comp_exit(void)
-{
- /* free percpu transforms */
- free_percpu(zswap_comp_pcpu_tfms);
-}
-
-/*********************************
-* data structures
-**********************************/
/*
* struct zswap_entry
*
@@ -166,22 +135,24 @@ static void __init zswap_comp_exit(void)
* page within zswap.
*
* rbnode - links the entry into red-black tree for the appropriate swap type
+ * offset - the swap offset for the entry. Index into the red-black tree.
* refcount - the number of outstanding references to the entry. This is needed
* to protect against premature freeing of the entry by concurrent
* calls to load, invalidate, and writeback. The lock
* for the zswap_tree structure that contains the entry must
* be held while changing the refcount. Since the lock must
* be held, there is no reason to also make refcount atomic.
- * offset - the swap offset for the entry. Index into the red-black tree.
- * handle - zpool allocation handle that stores the compressed page data
* length - the length in bytes of the compressed page data. Needed during
* decompression
+ * pool - the zswap_pool the entry's data is in
+ * handle - zpool allocation handle that stores the compressed page data
*/
struct zswap_entry {
struct rb_node rbnode;
pgoff_t offset;
int refcount;
unsigned int length;
+ struct zswap_pool *pool;
unsigned long handle;
};
@@ -201,6 +172,51 @@ struct zswap_tree {
static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
+/* RCU-protected iteration */
+static LIST_HEAD(zswap_pools);
+/* protects zswap_pools list modification */
+static DEFINE_SPINLOCK(zswap_pools_lock);
+
+/* used by param callback function */
+static bool zswap_init_started;
+
+/*********************************
+* helpers and fwd declarations
+**********************************/
+
+#define zswap_pool_debug(msg, p) \
+ pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \
+ zpool_get_type((p)->zpool))
+
+static int zswap_writeback_entry(struct zpool *pool, unsigned long handle);
+static int zswap_pool_get(struct zswap_pool *pool);
+static void zswap_pool_put(struct zswap_pool *pool);
+
+static const struct zpool_ops zswap_zpool_ops = {
+ .evict = zswap_writeback_entry
+};
+
+static bool zswap_is_full(void)
+{
+ return totalram_pages * zswap_max_pool_percent / 100 <
+ DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
+}
+
+static void zswap_update_total_size(void)
+{
+ struct zswap_pool *pool;
+ u64 total = 0;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(pool, &zswap_pools, list)
+ total += zpool_get_total_size(pool->zpool);
+
+ rcu_read_unlock();
+
+ zswap_pool_total_size = total;
+}
+
/*********************************
* zswap entry functions
**********************************/
@@ -294,10 +310,11 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
*/
static void zswap_free_entry(struct zswap_entry *entry)
{
- zpool_free(zswap_pool, entry->handle);
+ zpool_free(entry->pool->zpool, entry->handle);
+ zswap_pool_put(entry->pool);
zswap_entry_cache_free(entry);
atomic_dec(&zswap_stored_pages);
- zswap_pool_total_size = zpool_get_total_size(zswap_pool);
+ zswap_update_total_size();
}
/* caller must hold the tree lock */
@@ -339,35 +356,21 @@ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
**********************************/
static DEFINE_PER_CPU(u8 *, zswap_dstmem);
-static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu)
+static int __zswap_cpu_dstmem_notifier(unsigned long action, unsigned long cpu)
{
- struct crypto_comp *tfm;
u8 *dst;
switch (action) {
case CPU_UP_PREPARE:
- tfm = crypto_alloc_comp(zswap_compressor, 0, 0);
- if (IS_ERR(tfm)) {
- pr_err("can't allocate compressor transform\n");
- return NOTIFY_BAD;
- }
- *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm;
dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
if (!dst) {
pr_err("can't allocate compressor buffer\n");
- crypto_free_comp(tfm);
- *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
return NOTIFY_BAD;
}
per_cpu(zswap_dstmem, cpu) = dst;
break;
case CPU_DEAD:
case CPU_UP_CANCELED:
- tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu);
- if (tfm) {
- crypto_free_comp(tfm);
- *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
- }
dst = per_cpu(zswap_dstmem, cpu);
kfree(dst);
per_cpu(zswap_dstmem, cpu) = NULL;
@@ -378,43 +381,398 @@ static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu)
return NOTIFY_OK;
}
-static int zswap_cpu_notifier(struct notifier_block *nb,
- unsigned long action, void *pcpu)
+static int zswap_cpu_dstmem_notifier(struct notifier_block *nb,
+ unsigned long action, void *pcpu)
{
- unsigned long cpu = (unsigned long)pcpu;
- return __zswap_cpu_notifier(action, cpu);
+ return __zswap_cpu_dstmem_notifier(action, (unsigned long)pcpu);
}
-static struct notifier_block zswap_cpu_notifier_block = {
- .notifier_call = zswap_cpu_notifier
+static struct notifier_block zswap_dstmem_notifier = {
+ .notifier_call = zswap_cpu_dstmem_notifier,
};
-static int __init zswap_cpu_init(void)
+static int __init zswap_cpu_dstmem_init(void)
{
unsigned long cpu;
cpu_notifier_register_begin();
for_each_online_cpu(cpu)
- if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK)
+ if (__zswap_cpu_dstmem_notifier(CPU_UP_PREPARE, cpu) ==
+ NOTIFY_BAD)
goto cleanup;
- __register_cpu_notifier(&zswap_cpu_notifier_block);
+ __register_cpu_notifier(&zswap_dstmem_notifier);
cpu_notifier_register_done();
return 0;
cleanup:
for_each_online_cpu(cpu)
- __zswap_cpu_notifier(CPU_UP_CANCELED, cpu);
+ __zswap_cpu_dstmem_notifier(CPU_UP_CANCELED, cpu);
cpu_notifier_register_done();
return -ENOMEM;
}
+static void zswap_cpu_dstmem_destroy(void)
+{
+ unsigned long cpu;
+
+ cpu_notifier_register_begin();
+ for_each_online_cpu(cpu)
+ __zswap_cpu_dstmem_notifier(CPU_UP_CANCELED, cpu);
+ __unregister_cpu_notifier(&zswap_dstmem_notifier);
+ cpu_notifier_register_done();
+}
+
+static int __zswap_cpu_comp_notifier(struct zswap_pool *pool,
+ unsigned long action, unsigned long cpu)
+{
+ struct crypto_comp *tfm;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu)))
+ break;
+ tfm = crypto_alloc_comp(pool->tfm_name, 0, 0);
+ if (IS_ERR_OR_NULL(tfm)) {
+ pr_err("could not alloc crypto comp %s : %ld\n",
+ pool->tfm_name, PTR_ERR(tfm));
+ return NOTIFY_BAD;
+ }
+ *per_cpu_ptr(pool->tfm, cpu) = tfm;
+ break;
+ case CPU_DEAD:
+ case CPU_UP_CANCELED:
+ tfm = *per_cpu_ptr(pool->tfm, cpu);
+ if (!IS_ERR_OR_NULL(tfm))
+ crypto_free_comp(tfm);
+ *per_cpu_ptr(pool->tfm, cpu) = NULL;
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static int zswap_cpu_comp_notifier(struct notifier_block *nb,
+ unsigned long action, void *pcpu)
+{
+ unsigned long cpu = (unsigned long)pcpu;
+ struct zswap_pool *pool = container_of(nb, typeof(*pool), notifier);
+
+ return __zswap_cpu_comp_notifier(pool, action, cpu);
+}
+
+static int zswap_cpu_comp_init(struct zswap_pool *pool)
+{
+ unsigned long cpu;
+
+ memset(&pool->notifier, 0, sizeof(pool->notifier));
+ pool->notifier.notifier_call = zswap_cpu_comp_notifier;
+
+ cpu_notifier_register_begin();
+ for_each_online_cpu(cpu)
+ if (__zswap_cpu_comp_notifier(pool, CPU_UP_PREPARE, cpu) ==
+ NOTIFY_BAD)
+ goto cleanup;
+ __register_cpu_notifier(&pool->notifier);
+ cpu_notifier_register_done();
+ return 0;
+
+cleanup:
+ for_each_online_cpu(cpu)
+ __zswap_cpu_comp_notifier(pool, CPU_UP_CANCELED, cpu);
+ cpu_notifier_register_done();
+ return -ENOMEM;
+}
+
+static void zswap_cpu_comp_destroy(struct zswap_pool *pool)
+{
+ unsigned long cpu;
+
+ cpu_notifier_register_begin();
+ for_each_online_cpu(cpu)
+ __zswap_cpu_comp_notifier(pool, CPU_UP_CANCELED, cpu);
+ __unregister_cpu_notifier(&pool->notifier);
+ cpu_notifier_register_done();
+}
+
/*********************************
-* helpers
+* pool functions
**********************************/
-static bool zswap_is_full(void)
+
+static struct zswap_pool *__zswap_pool_current(void)
{
- return totalram_pages * zswap_max_pool_percent / 100 <
- DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
+ struct zswap_pool *pool;
+
+ pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
+ WARN_ON(!pool);
+
+ return pool;
+}
+
+static struct zswap_pool *zswap_pool_current(void)
+{
+ assert_spin_locked(&zswap_pools_lock);
+
+ return __zswap_pool_current();
+}
+
+static struct zswap_pool *zswap_pool_current_get(void)
+{
+ struct zswap_pool *pool;
+
+ rcu_read_lock();
+
+ pool = __zswap_pool_current();
+ if (!pool || !zswap_pool_get(pool))
+ pool = NULL;
+
+ rcu_read_unlock();
+
+ return pool;
+}
+
+static struct zswap_pool *zswap_pool_last_get(void)
+{
+ struct zswap_pool *pool, *last = NULL;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(pool, &zswap_pools, list)
+ last = pool;
+ if (!WARN_ON(!last) && !zswap_pool_get(last))
+ last = NULL;
+
+ rcu_read_unlock();
+
+ return last;
+}
+
+static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
+{
+ struct zswap_pool *pool;
+
+ assert_spin_locked(&zswap_pools_lock);
+
+ list_for_each_entry_rcu(pool, &zswap_pools, list) {
+ if (strncmp(pool->tfm_name, compressor, sizeof(pool->tfm_name)))
+ continue;
+ if (strncmp(zpool_get_type(pool->zpool), type,
+ sizeof(zswap_zpool_type)))
+ continue;
+ /* if we can't get it, it's about to be destroyed */
+ if (!zswap_pool_get(pool))
+ continue;
+ return pool;
+ }
+
+ return NULL;
+}
+
+static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
+{
+ struct zswap_pool *pool;
+ gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN;
+
+ pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+ if (!pool) {
+ pr_err("pool alloc failed\n");
+ return NULL;
+ }
+
+ pool->zpool = zpool_create_pool(type, "zswap", gfp, &zswap_zpool_ops);
+ if (!pool->zpool) {
+ pr_err("%s zpool not available\n", type);
+ goto error;
+ }
+ pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
+
+ strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
+ pool->tfm = alloc_percpu(struct crypto_comp *);
+ if (!pool->tfm) {
+ pr_err("percpu alloc failed\n");
+ goto error;
+ }
+
+ if (zswap_cpu_comp_init(pool))
+ goto error;
+ pr_debug("using %s compressor\n", pool->tfm_name);
+
+ /* being the current pool takes 1 ref; this func expects the
+ * caller to always add the new pool as the current pool
+ */
+ kref_init(&pool->kref);
+ INIT_LIST_HEAD(&pool->list);
+
+ zswap_pool_debug("created", pool);
+
+ return pool;
+
+error:
+ free_percpu(pool->tfm);
+ if (pool->zpool)
+ zpool_destroy_pool(pool->zpool);
+ kfree(pool);
+ return NULL;
+}
+
+static struct zswap_pool *__zswap_pool_create_fallback(void)
+{
+ if (!crypto_has_comp(zswap_compressor, 0, 0)) {
+ pr_err("compressor %s not available, using default %s\n",
+ zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT);
+ strncpy(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT,
+ sizeof(zswap_compressor));
+ }
+ if (!zpool_has_pool(zswap_zpool_type)) {
+ pr_err("zpool %s not available, using default %s\n",
+ zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT);
+ strncpy(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT,
+ sizeof(zswap_zpool_type));
+ }
+
+ return zswap_pool_create(zswap_zpool_type, zswap_compressor);
+}
+
+static void zswap_pool_destroy(struct zswap_pool *pool)
+{
+ zswap_pool_debug("destroying", pool);
+
+ zswap_cpu_comp_destroy(pool);
+ free_percpu(pool->tfm);
+ zpool_destroy_pool(pool->zpool);
+ kfree(pool);
+}
+
+static int __must_check zswap_pool_get(struct zswap_pool *pool)
+{
+ return kref_get_unless_zero(&pool->kref);
+}
+
+static void __zswap_pool_release(struct rcu_head *head)
+{
+ struct zswap_pool *pool = container_of(head, typeof(*pool), rcu_head);
+
+ /* nobody should have been able to get a kref... */
+ WARN_ON(kref_get_unless_zero(&pool->kref));
+
+ /* pool is now off zswap_pools list and has no references. */
+ zswap_pool_destroy(pool);
+}
+
+static void __zswap_pool_empty(struct kref *kref)
+{
+ struct zswap_pool *pool;
+
+ pool = container_of(kref, typeof(*pool), kref);
+
+ spin_lock(&zswap_pools_lock);
+
+ WARN_ON(pool == zswap_pool_current());
+
+ list_del_rcu(&pool->list);
+ call_rcu(&pool->rcu_head, __zswap_pool_release);
+
+ spin_unlock(&zswap_pools_lock);
+}
+
+static void zswap_pool_put(struct zswap_pool *pool)
+{
+ kref_put(&pool->kref, __zswap_pool_empty);
+}
+
+/*********************************
+* param callbacks
+**********************************/
+
+static int __zswap_param_set(const char *val, const struct kernel_param *kp,
+ char *type, char *compressor)
+{
+ struct zswap_pool *pool, *put_pool = NULL;
+ char str[kp->str->maxlen], *s;
+ int ret;
+
+ /*
+ * kp is either zswap_zpool_kparam or zswap_compressor_kparam, defined
+ * at the top of this file, so maxlen is CRYPTO_MAX_ALG_NAME (64) or
+ * 32 (arbitrary).
+ */
+ strlcpy(str, val, kp->str->maxlen);
+ s = strim(str);
+
+ /* if this is load-time (pre-init) param setting,
+ * don't create a pool; that's done during init.
+ */
+ if (!zswap_init_started)
+ return param_set_copystring(s, kp);
+
+ /* no change required */
+ if (!strncmp(kp->str->string, s, kp->str->maxlen))
+ return 0;
+
+ if (!type) {
+ type = s;
+ if (!zpool_has_pool(type)) {
+ pr_err("zpool %s not available\n", type);
+ return -ENOENT;
+ }
+ } else if (!compressor) {
+ compressor = s;
+ if (!crypto_has_comp(compressor, 0, 0)) {
+ pr_err("compressor %s not available\n", compressor);
+ return -ENOENT;
+ }
+ }
+
+ spin_lock(&zswap_pools_lock);
+
+ pool = zswap_pool_find_get(type, compressor);
+ if (pool) {
+ zswap_pool_debug("using existing", pool);
+ list_del_rcu(&pool->list);
+ } else {
+ spin_unlock(&zswap_pools_lock);
+ pool = zswap_pool_create(type, compressor);
+ spin_lock(&zswap_pools_lock);
+ }
+
+ if (pool)
+ ret = param_set_copystring(s, kp);
+ else
+ ret = -EINVAL;
+
+ if (!ret) {
+ put_pool = zswap_pool_current();
+ list_add_rcu(&pool->list, &zswap_pools);
+ } else if (pool) {
+ /* add the possibly pre-existing pool to the end of the pools
+ * list; if it's new (and empty) then it'll be removed and
+ * destroyed by the put after we drop the lock
+ */
+ list_add_tail_rcu(&pool->list, &zswap_pools);
+ put_pool = pool;
+ }
+
+ spin_unlock(&zswap_pools_lock);
+
+ /* drop the ref from either the old current pool,
+ * or the new pool we failed to add
+ */
+ if (put_pool)
+ zswap_pool_put(put_pool);
+
+ return ret;
+}
+
+static int zswap_compressor_param_set(const char *val,
+ const struct kernel_param *kp)
+{
+ return __zswap_param_set(val, kp, zswap_zpool_type, NULL);
+}
+
+static int zswap_zpool_param_set(const char *val,
+ const struct kernel_param *kp)
+{
+ return __zswap_param_set(val, kp, NULL, zswap_compressor);
}
/*********************************
@@ -446,75 +804,14 @@ enum zswap_get_swap_ret {
static int zswap_get_swap_cache_page(swp_entry_t entry,
struct page **retpage)
{
- struct page *found_page, *new_page = NULL;
- struct address_space *swapper_space = swap_address_space(entry);
- int err;
-
- *retpage = NULL;
- do {
- /*
- * First check the swap cache. Since this is normally
- * called after lookup_swap_cache() failed, re-calling
- * that would confuse statistics.
- */
- found_page = find_get_page(swapper_space, entry.val);
- if (found_page)
- break;
-
- /*
- * Get a new page to read into from swap.
- */
- if (!new_page) {
- new_page = alloc_page(GFP_KERNEL);
- if (!new_page)
- break; /* Out of memory */
- }
-
- /*
- * call radix_tree_preload() while we can wait.
- */
- err = radix_tree_preload(GFP_KERNEL);
- if (err)
- break;
-
- /*
- * Swap entry may have been freed since our caller observed it.
- */
- err = swapcache_prepare(entry);
- if (err == -EEXIST) { /* seems racy */
- radix_tree_preload_end();
- continue;
- }
- if (err) { /* swp entry is obsolete ? */
- radix_tree_preload_end();
- break;
- }
+ bool page_was_allocated;
- /* May fail (-ENOMEM) if radix-tree node allocation failed. */
- __set_page_locked(new_page);
- SetPageSwapBacked(new_page);
- err = __add_to_swap_cache(new_page, entry);
- if (likely(!err)) {
- radix_tree_preload_end();
- lru_cache_add_anon(new_page);
- *retpage = new_page;
- return ZSWAP_SWAPCACHE_NEW;
- }
- radix_tree_preload_end();
- ClearPageSwapBacked(new_page);
- __clear_page_locked(new_page);
- /*
- * add_to_swap_cache() doesn't return -EEXIST, so we can safely
- * clear SWAP_HAS_CACHE flag.
- */
- swapcache_free(entry);
- } while (err != -ENOMEM);
-
- if (new_page)
- page_cache_release(new_page);
- if (!found_page)
+ *retpage = __read_swap_cache_async(entry, GFP_KERNEL,
+ NULL, 0, &page_was_allocated);
+ if (page_was_allocated)
+ return ZSWAP_SWAPCACHE_NEW;
+ if (!*retpage)
return ZSWAP_SWAPCACHE_FAIL;
- *retpage = found_page;
return ZSWAP_SWAPCACHE_EXIST;
}
@@ -538,6 +835,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
pgoff_t offset;
struct zswap_entry *entry;
struct page *page;
+ struct crypto_comp *tfm;
u8 *src, *dst;
unsigned int dlen;
int ret;
@@ -578,13 +876,15 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
case ZSWAP_SWAPCACHE_NEW: /* page is locked */
/* decompress */
dlen = PAGE_SIZE;
- src = (u8 *)zpool_map_handle(zswap_pool, entry->handle,
+ src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
ZPOOL_MM_RO) + sizeof(struct zswap_header);
dst = kmap_atomic(page);
- ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src,
- entry->length, dst, &dlen);
+ tfm = *get_cpu_ptr(entry->pool->tfm);
+ ret = crypto_comp_decompress(tfm, src, entry->length,
+ dst, &dlen);
+ put_cpu_ptr(entry->pool->tfm);
kunmap_atomic(dst);
- zpool_unmap_handle(zswap_pool, entry->handle);
+ zpool_unmap_handle(entry->pool->zpool, entry->handle);
BUG_ON(ret);
BUG_ON(dlen != PAGE_SIZE);
@@ -633,6 +933,22 @@ end:
return ret;
}
+static int zswap_shrink(void)
+{
+ struct zswap_pool *pool;
+ int ret;
+
+ pool = zswap_pool_last_get();
+ if (!pool)
+ return -ENOENT;
+
+ ret = zpool_shrink(pool->zpool, 1, NULL);
+
+ zswap_pool_put(pool);
+
+ return ret;
+}
+
/*********************************
* frontswap hooks
**********************************/
@@ -642,6 +958,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
{
struct zswap_tree *tree = zswap_trees[type];
struct zswap_entry *entry, *dupentry;
+ struct crypto_comp *tfm;
int ret;
unsigned int dlen = PAGE_SIZE, len;
unsigned long handle;
@@ -657,7 +974,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
/* reclaim space if needed */
if (zswap_is_full()) {
zswap_pool_limit_hit++;
- if (zpool_shrink(zswap_pool, 1, NULL)) {
+ if (zswap_shrink()) {
zswap_reject_reclaim_fail++;
ret = -ENOMEM;
goto reject;
@@ -672,33 +989,42 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
goto reject;
}
+ /* if entry is successfully added, it keeps the reference */
+ entry->pool = zswap_pool_current_get();
+ if (!entry->pool) {
+ ret = -EINVAL;
+ goto freepage;
+ }
+
/* compress */
dst = get_cpu_var(zswap_dstmem);
+ tfm = *get_cpu_ptr(entry->pool->tfm);
src = kmap_atomic(page);
- ret = zswap_comp_op(ZSWAP_COMPOP_COMPRESS, src, PAGE_SIZE, dst, &dlen);
+ ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen);
kunmap_atomic(src);
+ put_cpu_ptr(entry->pool->tfm);
if (ret) {
ret = -EINVAL;
- goto freepage;
+ goto put_dstmem;
}
/* store */
len = dlen + sizeof(struct zswap_header);
- ret = zpool_malloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN,
- &handle);
+ ret = zpool_malloc(entry->pool->zpool, len,
+ __GFP_NORETRY | __GFP_NOWARN, &handle);
if (ret == -ENOSPC) {
zswap_reject_compress_poor++;
- goto freepage;
+ goto put_dstmem;
}
if (ret) {
zswap_reject_alloc_fail++;
- goto freepage;
+ goto put_dstmem;
}
- zhdr = zpool_map_handle(zswap_pool, handle, ZPOOL_MM_RW);
+ zhdr = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW);
zhdr->swpentry = swp_entry(type, offset);
buf = (u8 *)(zhdr + 1);
memcpy(buf, dst, dlen);
- zpool_unmap_handle(zswap_pool, handle);
+ zpool_unmap_handle(entry->pool->zpool, handle);
put_cpu_var(zswap_dstmem);
/* populate entry */
@@ -721,12 +1047,14 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
/* update stats */
atomic_inc(&zswap_stored_pages);
- zswap_pool_total_size = zpool_get_total_size(zswap_pool);
+ zswap_update_total_size();
return 0;
-freepage:
+put_dstmem:
put_cpu_var(zswap_dstmem);
+ zswap_pool_put(entry->pool);
+freepage:
zswap_entry_cache_free(entry);
reject:
return ret;
@@ -741,6 +1069,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
{
struct zswap_tree *tree = zswap_trees[type];
struct zswap_entry *entry;
+ struct crypto_comp *tfm;
u8 *src, *dst;
unsigned int dlen;
int ret;
@@ -757,13 +1086,14 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
/* decompress */
dlen = PAGE_SIZE;
- src = (u8 *)zpool_map_handle(zswap_pool, entry->handle,
+ src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
ZPOOL_MM_RO) + sizeof(struct zswap_header);
dst = kmap_atomic(page);
- ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length,
- dst, &dlen);
+ tfm = *get_cpu_ptr(entry->pool->tfm);
+ ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen);
+ put_cpu_ptr(entry->pool->tfm);
kunmap_atomic(dst);
- zpool_unmap_handle(zswap_pool, entry->handle);
+ zpool_unmap_handle(entry->pool->zpool, entry->handle);
BUG_ON(ret);
spin_lock(&tree->lock);
@@ -816,10 +1146,6 @@ static void zswap_frontswap_invalidate_area(unsigned type)
zswap_trees[type] = NULL;
}
-static struct zpool_ops zswap_zpool_ops = {
- .evict = zswap_writeback_entry
-};
-
static void zswap_frontswap_init(unsigned type)
{
struct zswap_tree *tree;
@@ -900,49 +1226,40 @@ static void __exit zswap_debugfs_exit(void) { }
**********************************/
static int __init init_zswap(void)
{
- gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN;
+ struct zswap_pool *pool;
- pr_info("loading zswap\n");
-
- zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
- &zswap_zpool_ops);
- if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
- pr_info("%s zpool not available\n", zswap_zpool_type);
- zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
- zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
- &zswap_zpool_ops);
- }
- if (!zswap_pool) {
- pr_err("%s zpool not available\n", zswap_zpool_type);
- pr_err("zpool creation failed\n");
- goto error;
- }
- pr_info("using %s pool\n", zswap_zpool_type);
+ zswap_init_started = true;
if (zswap_entry_cache_create()) {
pr_err("entry cache creation failed\n");
- goto cachefail;
+ goto cache_fail;
}
- if (zswap_comp_init()) {
- pr_err("compressor initialization failed\n");
- goto compfail;
+
+ if (zswap_cpu_dstmem_init()) {
+ pr_err("dstmem alloc failed\n");
+ goto dstmem_fail;
}
- if (zswap_cpu_init()) {
- pr_err("per-cpu initialization failed\n");
- goto pcpufail;
+
+ pool = __zswap_pool_create_fallback();
+ if (!pool) {
+ pr_err("pool creation failed\n");
+ goto pool_fail;
}
+ pr_info("loaded using pool %s/%s\n", pool->tfm_name,
+ zpool_get_type(pool->zpool));
+
+ list_add(&pool->list, &zswap_pools);
frontswap_register_ops(&zswap_frontswap_ops);
if (zswap_debugfs_init())
pr_warn("debugfs initialization failed\n");
return 0;
-pcpufail:
- zswap_comp_exit();
-compfail:
+
+pool_fail:
+ zswap_cpu_dstmem_destroy();
+dstmem_fail:
zswap_entry_cache_destroy();
-cachefail:
- zpool_destroy_pool(zswap_pool);
-error:
+cache_fail:
return -ENOMEM;
}
/* must be late so crypto has time to come up */
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index f30329f72641..69a4d30a9ccf 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -517,8 +517,11 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
struct ceph_options *opt = client->options;
size_t pos = m->count;
- if (opt->name)
- seq_printf(m, "name=%s,", opt->name);
+ if (opt->name) {
+ seq_puts(m, "name=");
+ seq_escape(m, opt->name, ", \t\n\\");
+ seq_putc(m, ',');
+ }
if (opt->key)
seq_puts(m, "secret=<hidden>,");
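As a hedged illustration of the hunk above (relying on seq_escape()'s behaviour of replacing each character from the escape set with its octal escape), a client name containing a separator no longer confuses option parsing; the name below is made up:

	/* opt->name = "rbd client"  -- the space is in the escape set ", \t\n\\" */
	/* old output: name=rbd client,                                           */
	/* new output: name=rbd\040client,   ('\040' is the octal escape for ' ') */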
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 337ca851a350..b140c092d226 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -297,7 +297,7 @@ static int rpc_complete_task(struct rpc_task *task)
clear_bit(RPC_TASK_ACTIVE, &task->tk_runstate);
ret = atomic_dec_and_test(&task->tk_count);
if (waitqueue_active(wq))
- __wake_up_locked_key(wq, TASK_NORMAL, &k);
+ __wake_up_locked_key(wq, TASK_NORMAL, 1, &k);
spin_unlock_irqrestore(&wq->lock, flags);
return ret;
}
diff --git a/scripts/Lindent b/scripts/Lindent
index 9c4b3e2b7098..6d889de4e70b 100755
--- a/scripts/Lindent
+++ b/scripts/Lindent
@@ -1,6 +1,9 @@
#!/bin/sh
PARAM="-npro -kr -i8 -ts8 -sob -l80 -ss -ncs -cp1"
RES=`indent --version`
+if [ "$RES" = "" ]; then
+ exit 1
+fi
V1=`echo $RES | cut -d' ' -f3 | cut -d'.' -f1`
V2=`echo $RES | cut -d' ' -f3 | cut -d'.' -f2`
V3=`echo $RES | cut -d' ' -f3 | cut -d'.' -f3`
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index a51ca0e5beef..e14dcdbef7dd 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -264,6 +264,7 @@ our $Sparse = qr{
__kernel|
__force|
__iomem|
+ __pmem|
__must_check|
__init_refok|
__kprobes|
@@ -584,7 +585,7 @@ our $LvalOrFunc = qr{((?:[\&\*]\s*)?$Lval)\s*($balanced_parens{0,1})\s*};
our $FuncArg = qr{$Typecast{0,1}($LvalOrFunc|$Constant|$String)};
our $declaration_macros = qr{(?x:
- (?:$Storage\s+)?(?:[A-Z_][A-Z0-9]*_){0,2}(?:DEFINE|DECLARE)(?:_[A-Z0-9]+){1,2}\s*\(|
+ (?:$Storage\s+)?(?:[A-Z_][A-Z0-9]*_){0,2}(?:DEFINE|DECLARE)(?:_[A-Z0-9]+){1,6}\s*\(|
(?:$Storage\s+)?LIST_HEAD\s*\(|
(?:$Storage\s+)?${Type}\s+uninitialized_var\s*\(
)};
@@ -1953,9 +1954,9 @@ sub process {
our $clean = 1;
my $signoff = 0;
my $is_patch = 0;
-
my $in_header_lines = $file ? 0 : 1;
my $in_commit_log = 0; #Scanning lines before patch
+ my $commit_log_possible_stack_dump = 0;
my $commit_log_long_line = 0;
my $commit_log_has_diff = 0;
my $reported_maintainer_file = 0;
@@ -2166,11 +2167,15 @@ sub process {
if ($showfile) {
$prefix = "$realfile:$realline: "
} elsif ($emacs) {
- $prefix = "$filename:$linenr: ";
+ if ($file) {
+ $prefix = "$filename:$realline: ";
+ } else {
+ $prefix = "$filename:$linenr: ";
+ }
}
if ($found_file) {
- if ($realfile =~ m@^(drivers/net/|net/)@) {
+ if ($realfile =~ m@^(?:drivers/net/|net/|drivers/staging/)@) {
$check = 1;
} else {
$check = $check_orig;
@@ -2310,16 +2315,42 @@ sub process {
# Check for line lengths > 75 in commit log, warn once
if ($in_commit_log && !$commit_log_long_line &&
- length($line) > 75) {
+ length($line) > 75 &&
+ !($line =~ /^\s*[a-zA-Z0-9_\/\.]+\s+\|\s+\d+/ ||
+ # file delta changes
+ $line =~ /^\s*(?:[\w\.\-]+\/)++[\w\.\-]+:/ ||
+ # filename then :
+ $line =~ /^\s*(?:Fixes:|Link:)/i ||
+ # A Fixes: or Link: line
+ $commit_log_possible_stack_dump)) {
WARN("COMMIT_LOG_LONG_LINE",
"Possible unwrapped commit description (prefer a maximum 75 chars per line)\n" . $herecurr);
$commit_log_long_line = 1;
}
+# Check if the commit log is in a possible stack dump
+ if ($in_commit_log && !$commit_log_possible_stack_dump &&
+ ($line =~ /^\s*(?:WARNING:|BUG:)/ ||
+ $line =~ /^\s*\[\s*\d+\.\d{6,6}\s*\]/ ||
+ # timestamp
+ $line =~ /^\s*\[\<[0-9a-fA-F]{8,}\>\]/)) {
+ # stack dump address
+ $commit_log_possible_stack_dump = 1;
+ }
+
+# Reset possible stack dump if a blank line is found
+ if ($in_commit_log && $commit_log_possible_stack_dump &&
+ $line =~ /^\s*$/) {
+ $commit_log_possible_stack_dump = 0;
+ }
+
# Check for git id commit length and improperly formed commit descriptions
- if ($in_commit_log && $line =~ /\b(c)ommit\s+([0-9a-f]{5,})/i) {
- my $init_char = $1;
- my $orig_commit = lc($2);
+ if ($in_commit_log &&
+ ($line =~ /\bcommit\s+[0-9a-f]{5,}\b/i ||
+ ($line =~ /\b[0-9a-f]{12,40}\b/i &&
+ $line !~ /\bfixes:\s*[0-9a-f]{12,40}/i))) {
+ my $init_char = "c";
+ my $orig_commit = "";
my $short = 1;
my $long = 0;
my $case = 1;
@@ -2330,6 +2361,13 @@ sub process {
my $orig_desc = "commit description";
my $description = "";
+ if ($line =~ /\b(c)ommit\s+([0-9a-f]{5,})\b/i) {
+ $init_char = $1;
+ $orig_commit = lc($2);
+ } elsif ($line =~ /\b([0-9a-f]{12,40})\b/i) {
+ $orig_commit = lc($1);
+ }
+
$short = 0 if ($line =~ /\bcommit\s+[0-9a-f]{12,40}/i);
$long = 1 if ($line =~ /\bcommit\s+[0-9a-f]{41,}/i);
$space = 0 if ($line =~ /\bcommit [0-9a-f]/i);
@@ -2738,6 +2776,8 @@ sub process {
}
}
+# Block comment styles
+# Networking with an initial /*
if ($realfile =~ m@^(drivers/net/|net/)@ &&
$prevrawline =~ /^\+[ \t]*\/\*[ \t]*$/ &&
$rawline =~ /^\+[ \t]*\*/ &&
@@ -2746,22 +2786,23 @@ sub process {
"networking block comments don't use an empty /* line, use /* Comment...\n" . $hereprev);
}
- if ($realfile =~ m@^(drivers/net/|net/)@ &&
- $prevrawline =~ /^\+[ \t]*\/\*/ && #starting /*
+# Block comments use * on subsequent lines
+ if ($prevline =~ /$;[ \t]*$/ && #ends in comment
+ $prevrawline =~ /^\+.*?\/\*/ && #starting /*
$prevrawline !~ /\*\/[ \t]*$/ && #no trailing */
$rawline =~ /^\+/ && #line is new
$rawline !~ /^\+[ \t]*\*/) { #no leading *
- WARN("NETWORKING_BLOCK_COMMENT_STYLE",
- "networking block comments start with * on subsequent lines\n" . $hereprev);
+ WARN("BLOCK_COMMENT_STYLE",
+ "Block comments use * on subsequent lines\n" . $hereprev);
}
- if ($realfile =~ m@^(drivers/net/|net/)@ &&
- $rawline !~ m@^\+[ \t]*\*/[ \t]*$@ && #trailing */
+# Block comments use */ on trailing lines
+ if ($rawline !~ m@^\+[ \t]*\*/[ \t]*$@ && #trailing */
$rawline !~ m@^\+.*/\*.*\*/[ \t]*$@ && #inline /*...*/
$rawline !~ m@^\+.*\*{2,}/[ \t]*$@ && #trailing **/
$rawline =~ m@^\+[ \t]*.+\*\/[ \t]*$@) { #non blank */
- WARN("NETWORKING_BLOCK_COMMENT_STYLE",
- "networking block comments put the trailing */ on a separate line\n" . $herecurr);
+ WARN("BLOCK_COMMENT_STYLE",
+ "Block comments use a trailing */ on a separate line\n" . $herecurr);
}
# check for missing blank lines after struct/union declarations
@@ -3067,15 +3108,22 @@ sub process {
substr($s, 0, length($c), '');
- # Make sure we remove the line prefixes as we have
- # none on the first line, and are going to readd them
- # where necessary.
- $s =~ s/\n./\n/gs;
+ # remove inline comments
+ $s =~ s/$;/ /g;
+ $c =~ s/$;/ /g;
# Find out how long the conditional actually is.
my @newlines = ($c =~ /\n/gs);
my $cond_lines = 1 + $#newlines;
+ # Make sure we remove the line prefixes as we have
+ # none on the first line, and are going to readd them
+ # where necessary.
+ $s =~ s/\n./\n/gs;
+ while ($s =~ /\n\s+\\\n/) {
+ $cond_lines += $s =~ s/\n\s+\\\n/\n/g;
+ }
+
# We want to check the first line inside the block
# starting at the end of the conditional, so remove:
# 1) any blank line termination
@@ -3141,8 +3189,10 @@ sub process {
#print "line<$line> prevline<$prevline> indent<$indent> sindent<$sindent> check<$check> continuation<$continuation> s<$s> cond_lines<$cond_lines> stat_real<$stat_real> stat<$stat>\n";
- if ($check && (($sindent % 8) != 0 ||
- ($sindent <= $indent && $s ne ''))) {
+ if ($check && $s ne '' &&
+ (($sindent % 8) != 0 ||
+ ($sindent < $indent) ||
+ ($sindent > $indent + 8))) {
WARN("SUSPECT_CODE_INDENT",
"suspect code indent for conditional statements ($indent, $sindent)\n" . $herecurr . "$stat_real\n");
}
@@ -3439,13 +3489,15 @@ sub process {
}
}
-# # no BUG() or BUG_ON()
-# if ($line =~ /\b(BUG|BUG_ON)\b/) {
-# print "Try to use WARN_ON & Recovery code rather than BUG() or BUG_ON()\n";
-# print "$herecurr";
-# $clean = 0;
-# }
+# avoid BUG() or BUG_ON()
+ if ($line =~ /\b(?:BUG|BUG_ON)\b/) {
+ my $msg_type = \&WARN;
+ $msg_type = \&CHK if ($file);
+ &{$msg_type}("AVOID_BUG",
+ "Avoid crashing the kernel - try using WARN_ON & recovery code rather than BUG() or BUG_ON()\n" . $herecurr);
+ }
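A hypothetical C snippet showing what the new AVOID_BUG message nudges authors toward (the identifiers are illustrative, not from this patch):

	/* flagged: crashes the kernel on a recoverable condition */
	BUG_ON(!ctx->ring);

	/* preferred: warn loudly, then bail out so the system keeps running */
	if (WARN_ON(!ctx->ring))
		return -EINVAL;

WARN_ON() returns the value of its condition, which is what makes the second form a drop-in guard.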
+# avoid LINUX_VERSION_CODE
if ($line =~ /\bLINUX_VERSION_CODE\b/) {
WARN("LINUX_VERSION_CODE",
"LINUX_VERSION_CODE should be avoided, code should be for the version to which it is merged\n" . $herecurr);
@@ -3520,7 +3572,7 @@ sub process {
# function brace can't be on same line, except for #defines of do while,
# or if closed on same line
if (($line=~/$Type\s*$Ident\(.*\).*\s*{/) and
- !($line=~/\#\s*define.*do\s{/) and !($line=~/}/)) {
+ !($line=~/\#\s*define.*do\s\{/) and !($line=~/}/)) {
if (ERROR("OPEN_BRACE",
"open brace '{' following function declarations go on the next line\n" . $herecurr) &&
$fix) {
@@ -4032,8 +4084,8 @@ sub process {
## }
#need space before brace following if, while, etc
- if (($line =~ /\(.*\){/ && $line !~ /\($Type\){/) ||
- $line =~ /do{/) {
+ if (($line =~ /\(.*\)\{/ && $line !~ /\($Type\){/) ||
+ $line =~ /do\{/) {
if (ERROR("SPACING",
"space required before the open brace '{'\n" . $herecurr) &&
$fix) {
@@ -4480,7 +4532,7 @@ sub process {
$dstat !~ /^for\s*$Constant$/ && # for (...)
$dstat !~ /^for\s*$Constant\s+(?:$Ident|-?$Constant)$/ && # for (...) bar()
$dstat !~ /^do\s*{/ && # do {...
- $dstat !~ /^\({/ && # ({...
+ $dstat !~ /^\(\{/ && # ({...
$ctx !~ /^.\s*#\s*define\s+TRACE_(?:SYSTEM|INCLUDE_FILE|INCLUDE_PATH)\b/)
{
$ctx =~ s/\n*$//;
@@ -4789,16 +4841,20 @@ sub process {
"Consecutive strings are generally better as a single string\n" . $herecurr);
}
-# check for %L{u,d,i} in strings
+# check for %L{u,d,i} and 0x%[udi] in strings
my $string;
while ($line =~ /(?:^|")([X\t]*)(?:"|$)/g) {
$string = substr($rawline, $-[1], $+[1] - $-[1]);
$string =~ s/%%/__/g;
- if ($string =~ /(?<!%)%L[udi]/) {
+ if ($string =~ /(?<!%)%[\*\d\.\$]*L[udi]/) {
WARN("PRINTF_L",
"\%Ld/%Lu are not-standard C, use %lld/%llu\n" . $herecurr);
last;
}
+ if ($string =~ /0x%[\*\d\.\$\Llzth]*[udi]/) {
+ ERROR("PRINTF_0xDECIMAL",
+ "Prefixing 0x with decimal output is defective\n" . $herecurr);
+ }
}
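A hypothetical example of what the new PRINTF_0xDECIMAL error catches and how it is usually fixed (identifiers are illustrative):

	pr_info("bar at 0x%d\n", bar_offset);	/* flagged: 0x prefix on a decimal conversion */
	pr_info("bar at 0x%x\n", bar_offset);	/* fixed: the conversion matches the prefix   */

Using "%#x" is an equivalent fix, since the # flag emits the 0x prefix itself.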
# check for line continuations in quoted strings with odd counts of "
@@ -4816,10 +4872,34 @@ sub process {
# check for needless "if (<foo>) fn(<foo>)" uses
if ($prevline =~ /\bif\s*\(\s*($Lval)\s*\)/) {
- my $expr = '\s*\(\s*' . quotemeta($1) . '\s*\)\s*;';
- if ($line =~ /\b(kfree|usb_free_urb|debugfs_remove(?:_recursive)?)$expr/) {
- WARN('NEEDLESS_IF',
- "$1(NULL) is safe and this check is probably not required\n" . $hereprev);
+ my $tested = quotemeta($1);
+ my $expr = '\s*\(\s*' . $tested . '\s*\)\s*;';
+ if ($line =~ /\b(kfree|usb_free_urb|debugfs_remove(?:_recursive)?|(?:kmem_cache|mempool|dma_pool)_destroy)$expr/) {
+ my $func = $1;
+ if (WARN('NEEDLESS_IF',
+ "$func(NULL) is safe and this check is probably not required\n" . $hereprev) &&
+ $fix) {
+ my $do_fix = 1;
+ my $leading_tabs = "";
+ my $new_leading_tabs = "";
+ if ($lines[$linenr - 2] =~ /^\+(\t*)if\s*\(\s*$tested\s*\)\s*$/) {
+ $leading_tabs = $1;
+ } else {
+ $do_fix = 0;
+ }
+ if ($lines[$linenr - 1] =~ /^\+(\t+)$func\s*\(\s*$tested\s*\)\s*;\s*$/) {
+ $new_leading_tabs = $1;
+ if (length($leading_tabs) + 1 ne length($new_leading_tabs)) {
+ $do_fix = 0;
+ }
+ } else {
+ $do_fix = 0;
+ }
+ if ($do_fix) {
+ fix_delete_line($fixlinenr - 1, $prevrawline);
+ $fixed[$fixlinenr] =~ s/^\+$new_leading_tabs/\+$leading_tabs/;
+ }
+ }
}
}
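The widened NEEDLESS_IF check (now with --fix support) targets patterns like the following hypothetical snippet; per the warning text, kfree() and the newly covered *_destroy() helpers all accept NULL:

	/* flagged: the NULL check is redundant */
	if (priv->pool)
		dma_pool_destroy(priv->pool);

	/* after the suggested fix */
	dma_pool_destroy(priv->pool);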
@@ -5517,10 +5597,10 @@ sub process {
"consider using a completion\n" . $herecurr);
}
-# recommend kstrto* over simple_strto* and strict_strto*
- if ($line =~ /\b((simple|strict)_(strto(l|ll|ul|ull)))\s*\(/) {
+# simple_strto*() is deprecated
+ if ($line =~ /\b(simple_strto(l|ll|ul|ull))\s*\(/) {
WARN("CONSIDER_KSTRTO",
- "$1 is obsolete, use k$3 instead\n" . $herecurr);
+ "$1 is obsolete, use parse_integer(), kstrto*(), kstrto*_from_user(), sscanf() instead\n" . $herecurr);
}
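A hypothetical before/after for the reworded CONSIDER_KSTRTO warning, using one of the suggested replacements:

	/* flagged: no error reporting, silently accepts trailing garbage */
	val = simple_strtoul(buf, NULL, 10);

	/* preferred */
	ret = kstrtouint(buf, 10, &val);
	if (ret)
		return ret;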
# check for __initcall(), use device_initcall() explicitly or more appropriate function please
diff --git a/scripts/coccinelle/api/alloc/pool_zalloc-simple.cocci b/scripts/coccinelle/api/alloc/pool_zalloc-simple.cocci
new file mode 100644
index 000000000000..9b7eb321a025
--- /dev/null
+++ b/scripts/coccinelle/api/alloc/pool_zalloc-simple.cocci
@@ -0,0 +1,84 @@
+///
+/// Use *_pool_zalloc rather than *_pool_alloc followed by memset with 0
+///
+// Copyright: (C) 2015 Intel Corp. GPLv2.
+// Options: --no-includes --include-headers
+//
+// Keywords: dma_pool_zalloc, pci_pool_zalloc
+//
+
+virtual context
+virtual patch
+virtual org
+virtual report
+
+//----------------------------------------------------------
+// For context mode
+//----------------------------------------------------------
+
+@depends on context@
+expression x;
+statement S;
+@@
+
+* x = \(dma_pool_alloc\|pci_pool_alloc\)(...);
+ if ((x==NULL) || ...) S
+* memset(x,0, ...);
+
+//----------------------------------------------------------
+// For patch mode
+//----------------------------------------------------------
+
+@depends on patch@
+expression x;
+expression a,b,c;
+statement S;
+@@
+
+- x = dma_pool_alloc(a,b,c);
++ x = dma_pool_zalloc(a,b,c);
+ if ((x==NULL) || ...) S
+- memset(x,0,...);
+
+@depends on patch@
+expression x;
+expression a,b,c;
+statement S;
+@@
+
+- x = pci_pool_alloc(a,b,c);
++ x = pci_pool_zalloc(a,b,c);
+ if ((x==NULL) || ...) S
+- memset(x,0,...);
+
+//----------------------------------------------------------
+// For org and report mode
+//----------------------------------------------------------
+
+@r depends on org || report@
+expression x;
+expression a,b,c;
+statement S;
+position p;
+@@
+
+ x = @p\(dma_pool_alloc\|pci_pool_alloc\)(a,b,c);
+ if ((x==NULL) || ...) S
+ memset(x,0, ...);
+
+@script:python depends on org@
+p << r.p;
+x << r.x;
+@@
+
+msg="%s" % (x)
+msg_safe=msg.replace("[","@(").replace("]",")")
+coccilib.org.print_todo(p[0], msg_safe)
+
+@script:python depends on report@
+p << r.p;
+x << r.x;
+@@
+
+msg="WARNING: *_pool_zalloc should be used for %s, instead of *_pool_alloc/memset" % (x)
+coccilib.report.print_report(p[0], msg)
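For reference, a hedged C sketch of the rewrite the patch-mode rules above perform (the surrounding names are illustrative):

	/* before */
	desc = dma_pool_alloc(ep->pool, GFP_ATOMIC, &desc_dma);
	if (!desc)
		return -ENOMEM;
	memset(desc, 0, sizeof(*desc));

	/* after */
	desc = dma_pool_zalloc(ep->pool, GFP_ATOMIC, &desc_dma);
	if (!desc)
		return -ENOMEM;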
diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh
index 515c4c00e957..00d6d53c2681 100755
--- a/scripts/decode_stacktrace.sh
+++ b/scripts/decode_stacktrace.sh
@@ -14,11 +14,14 @@ declare -A cache
parse_symbol() {
# The structure of symbol at this point is:
- # [name]+[offset]/[total length]
+ # ([name]+[offset]/[total length])
#
# For example:
# do_basic_setup+0x9c/0xbf
+	# Remove the enclosing parentheses
+ symbol=${symbol#\(}
+ symbol=${symbol%\)}
# Strip the symbol name so that we could look it up
local name=${symbol%+*}
diff --git a/scripts/kernel-doc b/scripts/kernel-doc
index 3a4d895b9237..ab17c58a57b4 100755
--- a/scripts/kernel-doc
+++ b/scripts/kernel-doc
@@ -469,7 +469,7 @@ sub dump_section {
} else {
# print STDERR "other section '$name' = '$contents'\n";
if (defined($sections{$name}) && ($sections{$name} ne "")) {
- print STDERR "Error(${file}:$.): duplicate section name '$name'\n";
+ print STDERR "${file}:$.: error: duplicate section name '$name'\n";
++$errors;
}
$sections{$name} = $contents;
@@ -1818,7 +1818,7 @@ sub dump_struct($$) {
});
}
else {
- print STDERR "Error(${file}:$.): Cannot parse struct or union!\n";
+ print STDERR "${file}:$.: error: Cannot parse struct or union!\n";
++$errors;
}
}
@@ -1839,7 +1839,7 @@ sub dump_enum($$) {
push @parameterlist, $arg;
if (!$parameterdescs{$arg}) {
$parameterdescs{$arg} = $undescribed;
- print STDERR "Warning(${file}:$.): Enum value '$arg' ".
+ print STDERR "${file}:$.: warning: Enum value '$arg' ".
"not described in enum '$declaration_name'\n";
}
@@ -1857,7 +1857,7 @@ sub dump_enum($$) {
});
}
else {
- print STDERR "Error(${file}:$.): Cannot parse enum!\n";
+ print STDERR "${file}:$.: error: Cannot parse enum!\n";
++$errors;
}
}
@@ -1885,7 +1885,7 @@ sub dump_typedef($$) {
});
}
else {
- print STDERR "Error(${file}:$.): Cannot parse typedef!\n";
+ print STDERR "${file}:$.: error: Cannot parse typedef!\n";
++$errors;
}
}
@@ -2017,11 +2017,11 @@ sub push_parameter($$$) {
$parameterdescs{$param_name} = $undescribed;
if (($type eq 'function') || ($type eq 'enum')) {
- print STDERR "Warning(${file}:$.): Function parameter ".
+ print STDERR "${file}:$.: warning: Function parameter ".
"or member '$param' not " .
"described in '$declaration_name'\n";
}
- print STDERR "Warning(${file}:$.):" .
+ print STDERR "${file}:$.: warning:" .
" No description found for parameter '$param'\n";
++$warnings;
}
@@ -2072,14 +2072,14 @@ sub check_sections($$$$$$) {
}
if ($err) {
if ($decl_type eq "function") {
- print STDERR "Warning(${file}:$.): " .
+ print STDERR "${file}:$.: warning: " .
"Excess function parameter " .
"'$sects[$sx]' " .
"description in '$decl_name'\n";
++$warnings;
} else {
if ($nested !~ m/\Q$sects[$sx]\E/) {
- print STDERR "Warning(${file}:$.): " .
+ print STDERR "${file}:$.: warning: " .
"Excess struct/union/enum/typedef member " .
"'$sects[$sx]' " .
"description in '$decl_name'\n";
@@ -2105,7 +2105,7 @@ sub check_return_section {
if (!defined($sections{$section_return}) ||
$sections{$section_return} eq "") {
- print STDERR "Warning(${file}:$.): " .
+ print STDERR "${file}:$.: warning: " .
"No description found for return value of " .
"'$declaration_name'\n";
++$warnings;
@@ -2184,7 +2184,7 @@ sub dump_function($$) {
create_parameterlist($args, ',', $file);
} else {
- print STDERR "Warning(${file}:$.): cannot understand function prototype: '$prototype'\n";
+ print STDERR "${file}:$.: warning: cannot understand function prototype: '$prototype'\n";
return;
}
@@ -2249,7 +2249,7 @@ sub tracepoint_munge($) {
$tracepointargs = $1;
}
if (($tracepointname eq 0) || ($tracepointargs eq 0)) {
- print STDERR "Warning(${file}:$.): Unrecognized tracepoint format: \n".
+ print STDERR "${file}:$.: warning: Unrecognized tracepoint format: \n".
"$prototype\n";
} else {
$prototype = "static inline void trace_$tracepointname($tracepointargs)";
@@ -2448,7 +2448,7 @@ sub process_file($) {
}
if (($declaration_purpose eq "") && $verbose) {
- print STDERR "Warning(${file}:$.): missing initial short description on line:\n";
+ print STDERR "${file}:$.: warning: missing initial short description on line:\n";
print STDERR $_;
++$warnings;
}
@@ -2466,10 +2466,10 @@ sub process_file($) {
}
if ($verbose) {
- print STDERR "Info(${file}:$.): Scanning doc for $identifier\n";
+ print STDERR "${file}:$.: info: Scanning doc for $identifier\n";
}
} else {
- print STDERR "Warning(${file}:$.): Cannot understand $_ on line $.",
+ print STDERR "${file}:$.: warning: Cannot understand $_ on line $.",
" - I thought it was a doc line\n";
++$warnings;
$state = 0;
@@ -2481,7 +2481,7 @@ sub process_file($) {
if (($contents ne "") && ($contents ne "\n")) {
if (!$in_doc_sect && $verbose) {
- print STDERR "Warning(${file}:$.): contents before sections\n";
+ print STDERR "${file}:$.: warning: contents before sections\n";
++$warnings;
}
dump_section($file, $section, xml_escape($contents));
@@ -2507,7 +2507,7 @@ sub process_file($) {
}
# look for doc_com + <text> + doc_end:
if ($_ =~ m'\s*\*\s*[a-zA-Z_0-9:\.]+\*/') {
- print STDERR "Warning(${file}:$.): suspicious ending line: $_";
+ print STDERR "${file}:$.: warning: suspicious ending line: $_";
++$warnings;
}
@@ -2537,7 +2537,7 @@ sub process_file($) {
}
} else {
# i dont know - bad line? ignore.
- print STDERR "Warning(${file}:$.): bad line: $_";
+ print STDERR "${file}:$.: warning: bad line: $_";
++$warnings;
}
} elsif ($state == 5) { # scanning for split parameters
@@ -2629,7 +2629,7 @@ sub process_file($) {
}
}
if ($initial_section_counter == $section_counter) {
- print STDERR "Warning(${file}): no structured comments found\n";
+ print STDERR "${file}:1: warning: no structured comments found\n";
if (($function_only == 1) && ($show_not_found == 1)) {
print STDERR " Was looking for '$_'.\n" for keys %function_table;
}
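The net effect of the kernel-doc hunks above is the conventional compiler-style prefix, which editors and CI tooling can parse; the file and line below are made up:

	old: Warning(drivers/foo/bar.c:123): No description found for parameter 'baz'
	new: drivers/foo/bar.c:123: warning: No description found for parameter 'baz'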
diff --git a/scripts/spelling.txt b/scripts/spelling.txt
index bb8e4d0a1911..bf30d2c0ec27 100644
--- a/scripts/spelling.txt
+++ b/scripts/spelling.txt
@@ -187,6 +187,7 @@ capatibilities||capabilities
carefuly||carefully
cariage||carriage
catagory||category
+cehck||check
challange||challenge
challanges||challenges
chanell||channel
@@ -199,6 +200,8 @@ charactor||character
charater||character
charaters||characters
charcter||character
+chcek||check
+chck||check
checksuming||checksumming
childern||children
childs||children
@@ -987,6 +990,7 @@ unexpectd||unexpected
unexpeted||unexpected
unfortunatelly||unfortunately
unifiy||unify
+unintialized||uninitialized
unknonw||unknown
unknow||unknown
unkown||unknown
@@ -1028,6 +1032,7 @@ visiters||visitors
vitual||virtual
wating||waiting
whataver||whatever
+whcih||which
whenver||whenever
wheter||whether
whe||when
diff --git a/security/commoncap.c b/security/commoncap.c
index d103f5a4043d..1832cf701c3d 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -267,6 +267,16 @@ int cap_capset(struct cred *new,
new->cap_effective = *effective;
new->cap_inheritable = *inheritable;
new->cap_permitted = *permitted;
+
+ /*
+ * Mask off ambient bits that are no longer both permitted and
+ * inheritable.
+ */
+ new->cap_ambient = cap_intersect(new->cap_ambient,
+ cap_intersect(*permitted,
+ *inheritable));
+ if (WARN_ON(!cap_ambient_invariant_ok(new)))
+ return -EINVAL;
return 0;
}
@@ -347,6 +357,7 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
/*
* pP' = (X & fP) | (pI & fI)
+ * The addition of pA' is handled later.
*/
new->cap_permitted.cap[i] =
(new->cap_bset.cap[i] & permitted) |
@@ -474,10 +485,13 @@ int cap_bprm_set_creds(struct linux_binprm *bprm)
{
const struct cred *old = current_cred();
struct cred *new = bprm->cred;
- bool effective, has_cap = false;
+ bool effective, has_cap = false, is_setid;
int ret;
kuid_t root_uid;
+ if (WARN_ON(!cap_ambient_invariant_ok(old)))
+ return -EPERM;
+
effective = false;
ret = get_file_caps(bprm, &effective, &has_cap);
if (ret < 0)
@@ -522,8 +536,9 @@ skip:
*
* In addition, if NO_NEW_PRIVS, then ensure we get no new privs.
*/
- if ((!uid_eq(new->euid, old->uid) ||
- !gid_eq(new->egid, old->gid) ||
+ is_setid = !uid_eq(new->euid, old->uid) || !gid_eq(new->egid, old->gid);
+
+ if ((is_setid ||
!cap_issubset(new->cap_permitted, old->cap_permitted)) &&
bprm->unsafe & ~LSM_UNSAFE_PTRACE_CAP) {
/* downgrade; they get no more than they had, and maybe less */
@@ -539,10 +554,28 @@ skip:
new->suid = new->fsuid = new->euid;
new->sgid = new->fsgid = new->egid;
+ /* File caps or setid cancels ambient. */
+ if (has_cap || is_setid)
+ cap_clear(new->cap_ambient);
+
+ /*
+ * Now that we've computed pA', update pP' to give:
+ * pP' = (X & fP) | (pI & fI) | pA'
+ */
+ new->cap_permitted = cap_combine(new->cap_permitted, new->cap_ambient);
+
+ /*
+ * Set pE' = (fE ? pP' : pA'). Because pA' is zero if fE is set,
+ * this is the same as pE' = (fE ? pP' : 0) | pA'.
+ */
if (effective)
new->cap_effective = new->cap_permitted;
else
- cap_clear(new->cap_effective);
+ new->cap_effective = new->cap_ambient;
+
+ if (WARN_ON(!cap_ambient_invariant_ok(new)))
+ return -EPERM;
+
bprm->cap_effective = effective;
/*
@@ -557,7 +590,7 @@ skip:
* Number 1 above might fail if you don't have a full bset, but I think
* that is interesting information to audit.
*/
- if (!cap_isclear(new->cap_effective)) {
+ if (!cap_issubset(new->cap_effective, new->cap_ambient)) {
if (!cap_issubset(CAP_FULL_SET, new->cap_effective) ||
!uid_eq(new->euid, root_uid) || !uid_eq(new->uid, root_uid) ||
issecure(SECURE_NOROOT)) {
@@ -568,6 +601,10 @@ skip:
}
new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
+
+ if (WARN_ON(!cap_ambient_invariant_ok(new)))
+ return -EPERM;
+
return 0;
}
@@ -589,7 +626,7 @@ int cap_bprm_secureexec(struct linux_binprm *bprm)
if (!uid_eq(cred->uid, root_uid)) {
if (bprm->cap_effective)
return 1;
- if (!cap_isclear(cred->cap_permitted))
+ if (!cap_issubset(cred->cap_permitted, cred->cap_ambient))
return 1;
}
@@ -691,10 +728,18 @@ static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old)
uid_eq(old->suid, root_uid)) &&
(!uid_eq(new->uid, root_uid) &&
!uid_eq(new->euid, root_uid) &&
- !uid_eq(new->suid, root_uid)) &&
- !issecure(SECURE_KEEP_CAPS)) {
- cap_clear(new->cap_permitted);
- cap_clear(new->cap_effective);
+ !uid_eq(new->suid, root_uid))) {
+ if (!issecure(SECURE_KEEP_CAPS)) {
+ cap_clear(new->cap_permitted);
+ cap_clear(new->cap_effective);
+ }
+
+ /*
+ * Pre-ambient programs expect setresuid to nonroot followed
+ * by exec to drop capabilities. We should make sure that
+ * this remains the case.
+ */
+ cap_clear(new->cap_ambient);
}
if (uid_eq(old->euid, root_uid) && !uid_eq(new->euid, root_uid))
cap_clear(new->cap_effective);
@@ -924,6 +969,44 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
return commit_creds(new);
+ case PR_CAP_AMBIENT:
+ if (arg2 == PR_CAP_AMBIENT_CLEAR_ALL) {
+ if (arg3 | arg4 | arg5)
+ return -EINVAL;
+
+ new = prepare_creds();
+ if (!new)
+ return -ENOMEM;
+ cap_clear(new->cap_ambient);
+ return commit_creds(new);
+ }
+
+ if (((!cap_valid(arg3)) | arg4 | arg5))
+ return -EINVAL;
+
+ if (arg2 == PR_CAP_AMBIENT_IS_SET) {
+ return !!cap_raised(current_cred()->cap_ambient, arg3);
+ } else if (arg2 != PR_CAP_AMBIENT_RAISE &&
+ arg2 != PR_CAP_AMBIENT_LOWER) {
+ return -EINVAL;
+ } else {
+ if (arg2 == PR_CAP_AMBIENT_RAISE &&
+ (!cap_raised(current_cred()->cap_permitted, arg3) ||
+ !cap_raised(current_cred()->cap_inheritable,
+ arg3) ||
+ issecure(SECURE_NO_CAP_AMBIENT_RAISE)))
+ return -EPERM;
+
+ new = prepare_creds();
+ if (!new)
+ return -ENOMEM;
+ if (arg2 == PR_CAP_AMBIENT_RAISE)
+ cap_raise(new->cap_ambient, arg3);
+ else
+ cap_lower(new->cap_ambient, arg3);
+ return commit_creds(new);
+ }
+
default:
/* No functionality available - continue with default */
return -ENOSYS;
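A hedged userspace sketch of the PR_CAP_AMBIENT interface handled above, assuming the PR_CAP_AMBIENT* constants added elsewhere in this series are visible through <sys/prctl.h>:

#include <stdio.h>
#include <sys/prctl.h>
#include <linux/capability.h>

int main(void)
{
	/* Raise CAP_NET_BIND_SERVICE in the ambient set; per cap_task_prctl()
	 * above, it must already be in both pP and pI and
	 * SECURE_NO_CAP_AMBIENT_RAISE must be unset. */
	if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0))
		perror("PR_CAP_AMBIENT_RAISE");

	/* Query, then drop it again. */
	printf("is set: %d\n",
	       (int)prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET,
			  CAP_NET_BIND_SERVICE, 0, 0));
	prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_LOWER, CAP_NET_BIND_SERVICE, 0, 0);
	return 0;
}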
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index bd536cb221e2..43b4cddbf2b3 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -848,6 +848,7 @@ void key_change_session_keyring(struct callback_head *twork)
new->cap_inheritable = old->cap_inheritable;
new->cap_permitted = old->cap_permitted;
new->cap_effective = old->cap_effective;
+ new->cap_ambient = old->cap_ambient;
new->cap_bset = old->cap_bset;
new->jit_keyring = old->jit_keyring;
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 55285054aa73..e4369d86e588 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1111,7 +1111,7 @@ static void selinux_write_opts(struct seq_file *m,
seq_puts(m, prefix);
if (has_comma)
seq_putc(m, '\"');
- seq_puts(m, opts->mnt_opts[i]);
+ seq_escape(m, opts->mnt_opts[i], "\"\n\\");
if (has_comma)
seq_putc(m, '\"');
}
diff --git a/sound/core/oss/mixer_oss.c b/sound/core/oss/mixer_oss.c
index a99f7200ff3f..9b90519278c9 100644
--- a/sound/core/oss/mixer_oss.c
+++ b/sound/core/oss/mixer_oss.c
@@ -1180,6 +1180,7 @@ static void snd_mixer_oss_proc_write(struct snd_info_entry *entry,
int ch, idx;
struct snd_mixer_oss_assign_table *tbl;
struct slot *slot;
+ int rv;
while (!snd_info_get_line(buffer, line, sizeof(line))) {
cptr = snd_info_get_str(str, line, sizeof(str));
@@ -1200,9 +1201,9 @@ static void snd_mixer_oss_proc_write(struct snd_info_entry *entry,
continue;
}
snd_info_get_str(idxstr, cptr, sizeof(idxstr));
- idx = simple_strtoul(idxstr, NULL, 10);
- if (idx >= 0x4000) { /* too big */
- pr_err("ALSA: mixer_oss: invalid index %d\n", idx);
+ rv = parse_integer(idxstr, 10, (unsigned int *)&idx);
+ if (rv < 0 || idxstr[rv] || idx >= 0x4000) { /* too big */
+ pr_err("ALSA: mixer_oss: invalid index %s\n", idxstr);
continue;
}
mutex_lock(&mixer->reg_mutex);
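The parse_integer() calling convention used in this and the following sound/ hunks can be read off the call sites; a hedged restatement (parse_integer() itself is introduced elsewhere in this series, so its prototype is assumed):

	rv = parse_integer(idxstr, 10, (unsigned int *)&idx);
	if (rv < 0)		/* not a number: propagate the error */
		return rv;
	if (idxstr[rv])		/* rv is the count of characters consumed,
				 * so a non-NUL here means trailing garbage */
		return -EINVAL;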
diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c
index 58550cc93f28..f3ca50424cdc 100644
--- a/sound/core/oss/pcm_oss.c
+++ b/sound/core/oss/pcm_oss.c
@@ -2875,6 +2875,7 @@ static void snd_pcm_oss_proc_write(struct snd_info_entry *entry,
const char *ptr;
int idx1;
struct snd_pcm_oss_setup *setup, *setup1, template;
+ int rv;
while (!snd_info_get_line(buffer, line, sizeof(line))) {
mutex_lock(&pstr->oss.setup_mutex);
@@ -2892,9 +2893,17 @@ static void snd_pcm_oss_proc_write(struct snd_info_entry *entry,
}
}
ptr = snd_info_get_str(str, ptr, sizeof(str));
- template.periods = simple_strtoul(str, NULL, 10);
+ rv = parse_integer(str, 10, &template.periods);
+ if (rv < 0 || str[rv]) {
+ mutex_unlock(&pstr->oss.setup_mutex);
+ continue;
+ }
ptr = snd_info_get_str(str, ptr, sizeof(str));
- template.period_size = simple_strtoul(str, NULL, 10);
+ rv = parse_integer(str, 10, &template.period_size);
+ if (rv < 0 || str[rv]) {
+ mutex_unlock(&pstr->oss.setup_mutex);
+ continue;
+ }
for (idx1 = 31; idx1 >= 0; idx1--)
if (template.period_size & (1 << idx1))
break;
diff --git a/sound/core/pcm.c b/sound/core/pcm.c
index 02bd96954dc4..50aa3e1ec379 100644
--- a/sound/core/pcm.c
+++ b/sound/core/pcm.c
@@ -506,8 +506,17 @@ static void snd_pcm_xrun_debug_write(struct snd_info_entry *entry,
{
struct snd_pcm_str *pstr = entry->private_data;
char line[64];
- if (!snd_info_get_line(buffer, line, sizeof(line)))
- pstr->xrun_debug = simple_strtoul(line, NULL, 10);
+ int rv;
+
+ if (!snd_info_get_line(buffer, line, sizeof(line))) {
+ rv = parse_integer(line, 10, &pstr->xrun_debug);
+ if (rv >= 0 && line[rv])
+ rv = -EINVAL;
+ if (rv < 0) {
+ buffer->error = rv;
+ return;
+ }
+ }
}
#endif
diff --git a/sound/core/pcm_memory.c b/sound/core/pcm_memory.c
index b45f6aa32264..c06d59c60095 100644
--- a/sound/core/pcm_memory.c
+++ b/sound/core/pcm_memory.c
@@ -160,6 +160,7 @@ static void snd_pcm_lib_preallocate_proc_write(struct snd_info_entry *entry,
char line[64], str[64];
size_t size;
struct snd_dma_buffer new_dmab;
+ int rv;
if (substream->runtime) {
buffer->error = -EBUSY;
@@ -167,8 +168,14 @@ static void snd_pcm_lib_preallocate_proc_write(struct snd_info_entry *entry,
}
if (!snd_info_get_line(buffer, line, sizeof(line))) {
snd_info_get_str(str, line, sizeof(str));
- size = simple_strtoul(str, NULL, 10) * 1024;
- if ((size != 0 && size < 8192) || size > substream->dma_max) {
+ rv = parse_integer(str, 10, &size);
+ if (rv < 0) {
+ buffer->error = rv;
+ return;
+ }
+ size *= 1024;
+ if (str[rv] ||
+ (size != 0 && size < 8192) || size > substream->dma_max) {
buffer->error = -EINVAL;
return;
}
diff --git a/sound/pci/ac97/ac97_codec.c b/sound/pci/ac97/ac97_codec.c
index 82259ca61e64..a146049a33cd 100644
--- a/sound/pci/ac97/ac97_codec.c
+++ b/sound/pci/ac97/ac97_codec.c
@@ -2877,6 +2877,8 @@ static int apply_quirk_str(struct snd_ac97 *ac97, const char *typestr)
{
int i;
struct quirk_table *q;
+ unsigned int type;
+ int rv;
for (i = 0; i < ARRAY_SIZE(applicable_quirks); i++) {
q = &applicable_quirks[i];
@@ -2884,9 +2886,10 @@ static int apply_quirk_str(struct snd_ac97 *ac97, const char *typestr)
return apply_quirk(ac97, i);
}
/* for compatibility, accept the numbers, too */
- if (*typestr >= '0' && *typestr <= '9')
- return apply_quirk(ac97, (int)simple_strtoul(typestr, NULL, 10));
- return -EINVAL;
+ rv = parse_integer(typestr, 10, &type);
+ if (rv < 0)
+ return rv;
+ return apply_quirk(ac97, type);
}
/**
diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c
index 1b63a03a1f57..c81aec9c872a 100644
--- a/sound/soc/soc-core.c
+++ b/sound/soc/soc-core.c
@@ -250,7 +250,7 @@ static ssize_t codec_reg_write_file(struct file *file,
char buf[32];
size_t buf_size;
char *start = buf;
- unsigned long reg, value;
+ unsigned int reg, value;
struct snd_soc_codec *codec = file->private_data;
int ret;
@@ -261,10 +261,13 @@ static ssize_t codec_reg_write_file(struct file *file,
while (*start == ' ')
start++;
- reg = simple_strtoul(start, &start, 16);
+ ret = parse_integer(start, 16, &reg);
+ if (ret < 0)
+ return ret;
+ start += ret;
while (*start == ' ')
start++;
- ret = kstrtoul(start, 16, &value);
+ ret = kstrtouint(start, 16, &value);
if (ret)
return ret;
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index 231b9a031f6a..1dacac81bb68 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -1,17 +1,21 @@
# Makefile for vm selftests
-CFLAGS = -Wall
+CFLAGS = -Wall -I ../../../../usr/include $(EXTRA_CFLAGS)
BINARIES = compaction_test
BINARIES += hugepage-mmap
BINARIES += hugepage-shm
-BINARIES += hugetlbfstest
BINARIES += map_hugetlb
+BINARIES += mlock2-tests
+BINARIES += on-fault-limit
BINARIES += thuge-gen
BINARIES += transhuge-stress
+BINARIES += userfaultfd
all: $(BINARIES)
%: %.c
$(CC) $(CFLAGS) -o $@ $^ -lrt
+userfaultfd: userfaultfd.c
+ $(CC) $(CFLAGS) -O2 -o $@ $^ -lpthread
TEST_PROGS := run_vmtests
TEST_FILES := $(BINARIES)
diff --git a/tools/testing/selftests/vm/hugetlbfstest.c b/tools/testing/selftests/vm/hugetlbfstest.c
deleted file mode 100644
index 02e1072ec187..000000000000
--- a/tools/testing/selftests/vm/hugetlbfstest.c
+++ /dev/null
@@ -1,86 +0,0 @@
-#define _GNU_SOURCE
-#include <assert.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-typedef unsigned long long u64;
-
-static size_t length = 1 << 24;
-
-static u64 read_rss(void)
-{
- char buf[4096], *s = buf;
- int i, fd;
- u64 rss;
-
- fd = open("/proc/self/statm", O_RDONLY);
- assert(fd > 2);
- memset(buf, 0, sizeof(buf));
- read(fd, buf, sizeof(buf) - 1);
- for (i = 0; i < 1; i++)
- s = strchr(s, ' ') + 1;
- rss = strtoull(s, NULL, 10);
- return rss << 12; /* assumes 4k pagesize */
-}
-
-static void do_mmap(int fd, int extra_flags, int unmap)
-{
- int *p;
- int flags = MAP_PRIVATE | MAP_POPULATE | extra_flags;
- u64 before, after;
- int ret;
-
- before = read_rss();
- p = mmap(NULL, length, PROT_READ | PROT_WRITE, flags, fd, 0);
- assert(p != MAP_FAILED ||
- !"mmap returned an unexpected error");
- after = read_rss();
- assert(llabs(after - before - length) < 0x40000 ||
- !"rss didn't grow as expected");
- if (!unmap)
- return;
- ret = munmap(p, length);
- assert(!ret || !"munmap returned an unexpected error");
- after = read_rss();
- assert(llabs(after - before) < 0x40000 ||
- !"rss didn't shrink as expected");
-}
-
-static int open_file(const char *path)
-{
- int fd, err;
-
- unlink(path);
- fd = open(path, O_CREAT | O_RDWR | O_TRUNC | O_EXCL
- | O_LARGEFILE | O_CLOEXEC, 0600);
- assert(fd > 2);
- unlink(path);
- err = ftruncate(fd, length);
- assert(!err);
- return fd;
-}
-
-int main(void)
-{
- int hugefd, fd;
-
- fd = open_file("/dev/shm/hugetlbhog");
- hugefd = open_file("/hugepages/hugetlbhog");
-
- system("echo 100 > /proc/sys/vm/nr_hugepages");
- do_mmap(-1, MAP_ANONYMOUS, 1);
- do_mmap(fd, 0, 1);
- do_mmap(-1, MAP_ANONYMOUS | MAP_HUGETLB, 1);
- do_mmap(hugefd, 0, 1);
- do_mmap(hugefd, MAP_HUGETLB, 1);
- /* Leak the last one to test do_exit() */
- do_mmap(-1, MAP_ANONYMOUS | MAP_HUGETLB, 0);
- printf("oll korrekt.\n");
- return 0;
-}
diff --git a/tools/testing/selftests/vm/mlock2-tests.c b/tools/testing/selftests/vm/mlock2-tests.c
new file mode 100644
index 000000000000..af4bc752797d
--- /dev/null
+++ b/tools/testing/selftests/vm/mlock2-tests.c
@@ -0,0 +1,657 @@
+#include <sys/mman.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <syscall.h>
+#include <errno.h>
+#include <stdbool.h>
+
+#ifndef MLOCK_ONFAULT
+#define MLOCK_ONFAULT 1
+#endif
+
+#ifndef MCL_ONFAULT
+#define MCL_ONFAULT (MCL_FUTURE << 1)
+#endif
+
+static int mlock2_(void *start, size_t len, int flags)
+{
+#ifdef __NR_mlock2
+ return syscall(__NR_mlock2, start, len, flags);
+#else
+ errno = ENOSYS;
+ return -1;
+#endif
+}
+
+struct vm_boundaries {
+ unsigned long start;
+ unsigned long end;
+};
+
+static int get_vm_area(unsigned long addr, struct vm_boundaries *area)
+{
+ FILE *file;
+ int ret = 1;
+ char line[1024] = {0};
+ char *end_addr;
+ char *stop;
+ unsigned long start;
+ unsigned long end;
+
+ if (!area)
+ return ret;
+
+ file = fopen("/proc/self/maps", "r");
+ if (!file) {
+ perror("fopen");
+ return ret;
+ }
+
+ memset(area, 0, sizeof(struct vm_boundaries));
+
+ while(fgets(line, 1024, file)) {
+ end_addr = strchr(line, '-');
+ if (!end_addr) {
+ printf("cannot parse /proc/self/maps\n");
+ goto out;
+ }
+ *end_addr = '\0';
+ end_addr++;
+ stop = strchr(end_addr, ' ');
+ if (!stop) {
+ printf("cannot parse /proc/self/maps\n");
+ goto out;
+ }
+		*stop = '\0';	/* terminate the end-address token */
+
+ sscanf(line, "%lx", &start);
+ sscanf(end_addr, "%lx", &end);
+
+ if (start <= addr && end > addr) {
+ area->start = start;
+ area->end = end;
+ ret = 0;
+ goto out;
+ }
+ }
+out:
+ fclose(file);
+ return ret;
+}
+
+static uint64_t get_pageflags(unsigned long addr)
+{
+ FILE *file;
+ uint64_t pfn;
+ unsigned long offset;
+
+ file = fopen("/proc/self/pagemap", "r");
+ if (!file) {
+ perror("fopen pagemap");
+ _exit(1);
+ }
+
+ offset = addr / getpagesize() * sizeof(pfn);
+
+ if (fseek(file, offset, SEEK_SET)) {
+ perror("fseek pagemap");
+ _exit(1);
+ }
+
+ if (fread(&pfn, sizeof(pfn), 1, file) != 1) {
+ perror("fread pagemap");
+ _exit(1);
+ }
+
+ fclose(file);
+ return pfn;
+}
+
+static unsigned long get_kpageflags(unsigned long pfn)
+{
+ uint64_t flags;
+ FILE *file;
+
+ file = fopen("/proc/kpageflags", "r");
+ if (!file) {
+ perror("fopen kpageflags");
+ _exit(1);
+ }
+
+ if (fseek(file, pfn * sizeof(flags), SEEK_SET)) {
+ perror("fseek kpageflags");
+ _exit(1);
+ }
+
+ if (fread(&flags, sizeof(flags), 1, file) != 1) {
+ perror("fread kpageflags");
+ _exit(1);
+ }
+
+ fclose(file);
+ return flags;
+}
+
+#define VMFLAGS "VmFlags:"
+
+static bool find_flag(FILE *file, const char *vmflag)
+{
+ char *line = NULL;
+ char *flags;
+ size_t size = 0;
+ bool ret = false;
+
+ while (getline(&line, &size, file) > 0) {
+ if (!strstr(line, VMFLAGS)) {
+ free(line);
+ line = NULL;
+ size = 0;
+ continue;
+ }
+
+ flags = line + strlen(VMFLAGS);
+ ret = (strstr(flags, vmflag) != NULL);
+ goto out;
+ }
+
+out:
+ free(line);
+ return ret;
+}
+
+static bool is_vmflag_set(unsigned long addr, const char *vmflag)
+{
+ FILE *file;
+ char *line = NULL;
+ size_t size = 0;
+ bool ret = false;
+ unsigned long start, end;
+ char perms[5];
+ unsigned long offset;
+ char dev[32];
+ unsigned long inode;
+ char path[BUFSIZ];
+
+ file = fopen("/proc/self/smaps", "r");
+ if (!file) {
+ perror("fopen smaps");
+ _exit(1);
+ }
+
+ while (getline(&line, &size, file) > 0) {
+ if (sscanf(line, "%lx-%lx %s %lx %s %lu %s\n",
+ &start, &end, perms, &offset, dev, &inode, path) < 6)
+ goto next;
+
+ if (start <= addr && addr < end) {
+ ret = find_flag(file, vmflag);
+ goto out;
+ }
+
+next:
+ free(line);
+ line = NULL;
+ size = 0;
+ }
+
+out:
+ free(line);
+ fclose(file);
+ return ret;
+}
+
+#define PRESENT_BIT 0x8000000000000000
+#define PFN_MASK 0x007FFFFFFFFFFFFF
+#define UNEVICTABLE_BIT (1UL << 18)
+
+#define LOCKED "lo"
+#define LOCKEDONFAULT "lf"
+
+static int lock_check(char *map)
+{
+ unsigned long page_size = getpagesize();
+ uint64_t page1_flags, page2_flags;
+
+ page1_flags = get_pageflags((unsigned long)map);
+ page2_flags = get_pageflags((unsigned long)map + page_size);
+
+ /* Both pages should be present */
+ if (((page1_flags & PRESENT_BIT) == 0) ||
+ ((page2_flags & PRESENT_BIT) == 0)) {
+ printf("Failed to make both pages present\n");
+ return 1;
+ }
+
+ page1_flags = get_kpageflags(page1_flags & PFN_MASK);
+ page2_flags = get_kpageflags(page2_flags & PFN_MASK);
+
+ /* Both pages should be unevictable */
+ if (((page1_flags & UNEVICTABLE_BIT) == 0) ||
+ ((page2_flags & UNEVICTABLE_BIT) == 0)) {
+ printf("Failed to make both pages unevictable\n");
+ return 1;
+ }
+
+ if (!is_vmflag_set((unsigned long)map, LOCKED) ||
+ !is_vmflag_set((unsigned long)map + page_size, LOCKED)) {
+ printf("VMA flag %s is missing\n", LOCKED);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int unlock_lock_check(char *map)
+{
+ unsigned long page_size = getpagesize();
+ uint64_t page1_flags, page2_flags;
+
+ page1_flags = get_pageflags((unsigned long)map);
+ page2_flags = get_pageflags((unsigned long)map + page_size);
+ page1_flags = get_kpageflags(page1_flags & PFN_MASK);
+ page2_flags = get_kpageflags(page2_flags & PFN_MASK);
+
+ if ((page1_flags & UNEVICTABLE_BIT) || (page2_flags & UNEVICTABLE_BIT)) {
+ printf("A page is still marked unevictable after unlock\n");
+ return 1;
+ }
+
+ if (is_vmflag_set((unsigned long)map, LOCKED) ||
+ is_vmflag_set((unsigned long)map + page_size, LOCKED)) {
+ printf("VMA flag %s is still set after unlock\n", LOCKED);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int test_mlock_lock()
+{
+ char *map;
+ int ret = 1;
+ unsigned long page_size = getpagesize();
+
+ map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+ if (map == MAP_FAILED) {
+ perror("test_mlock_locked mmap");
+ goto out;
+ }
+
+ if (mlock2_(map, 2 * page_size, 0)) {
+ if (errno == ENOSYS) {
+ printf("Cannot call new mlock family, skipping test\n");
+ _exit(0);
+ }
+ perror("mlock2(0)");
+ goto unmap;
+ }
+
+ if (lock_check(map))
+ goto unmap;
+
+ /* Now unlock and recheck attributes */
+ if (munlock(map, 2 * page_size)) {
+ perror("munlock()");
+ goto unmap;
+ }
+
+ ret = unlock_lock_check(map);
+
+unmap:
+ munmap(map, 2 * page_size);
+out:
+ return ret;
+}
+
+static int onfault_check(char *map)
+{
+ unsigned long page_size = getpagesize();
+ uint64_t page1_flags, page2_flags;
+
+ page1_flags = get_pageflags((unsigned long)map);
+ page2_flags = get_pageflags((unsigned long)map + page_size);
+
+ /* Neither page should be present */
+ if ((page1_flags & PRESENT_BIT) || (page2_flags & PRESENT_BIT)) {
+ printf("Pages were made present by MLOCK_ONFAULT\n");
+ return 1;
+ }
+
+ *map = 'a';
+ page1_flags = get_pageflags((unsigned long)map);
+ page2_flags = get_pageflags((unsigned long)map + page_size);
+
+ /* Only page 1 should be present */
+ if ((page1_flags & PRESENT_BIT) == 0) {
+ printf("Page 1 is not present after fault\n");
+ return 1;
+ } else if (page2_flags & PRESENT_BIT) {
+ printf("Page 2 was made present\n");
+ return 1;
+ }
+
+ page1_flags = get_kpageflags(page1_flags & PFN_MASK);
+
+ /* Page 1 should be unevictable */
+ if ((page1_flags & UNEVICTABLE_BIT) == 0) {
+ printf("Failed to make faulted page unevictable\n");
+ return 1;
+ }
+
+ if (!is_vmflag_set((unsigned long)map, LOCKEDONFAULT) ||
+ !is_vmflag_set((unsigned long)map + page_size, LOCKEDONFAULT)) {
+ printf("VMA flag %s is missing\n", LOCKEDONFAULT);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int unlock_onfault_check(char *map)
+{
+ unsigned long page_size = getpagesize();
+ uint64_t page1_flags;
+
+ page1_flags = get_pageflags((unsigned long)map);
+ page1_flags = get_kpageflags(page1_flags & PFN_MASK);
+
+ if (page1_flags & UNEVICTABLE_BIT) {
+ printf("Page 1 is still marked unevictable after unlock\n");
+ return 1;
+ }
+
+ if (is_vmflag_set((unsigned long)map, LOCKEDONFAULT) ||
+ is_vmflag_set((unsigned long)map + page_size, LOCKEDONFAULT)) {
+ printf("VMA flag %s is still set after unlock\n", LOCKEDONFAULT);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int test_mlock_onfault()
+{
+ char *map;
+ int ret = 1;
+ unsigned long page_size = getpagesize();
+
+ map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+ if (map == MAP_FAILED) {
+ perror("test_mlock_locked mmap");
+ goto out;
+ }
+
+ if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) {
+ if (errno == ENOSYS) {
+ printf("Cannot call new mlock family, skipping test\n");
+ _exit(0);
+ }
+ perror("mlock2(MLOCK_ONFAULT)");
+ goto unmap;
+ }
+
+ if (onfault_check(map))
+ goto unmap;
+
+ /* Now unlock and recheck attributes */
+ if (munlock(map, 2 * page_size)) {
+ if (errno == ENOSYS) {
+ printf("Cannot call new mlock family, skipping test\n");
+ _exit(0);
+ }
+ perror("munlock()");
+ goto unmap;
+ }
+
+ ret = unlock_onfault_check(map);
+unmap:
+ munmap(map, 2 * page_size);
+out:
+ return ret;
+}
+
+static int test_lock_onfault_of_present()
+{
+ char *map;
+ int ret = 1;
+ unsigned long page_size = getpagesize();
+ uint64_t page1_flags, page2_flags;
+
+ map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+ if (map == MAP_FAILED) {
+ perror("test_mlock_locked mmap");
+ goto out;
+ }
+
+ *map = 'a';
+
+ if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) {
+ if (errno == ENOSYS) {
+ printf("Cannot call new mlock family, skipping test\n");
+ _exit(0);
+ }
+ perror("mlock2(MLOCK_ONFAULT)");
+ goto unmap;
+ }
+
+ page1_flags = get_pageflags((unsigned long)map);
+ page2_flags = get_pageflags((unsigned long)map + page_size);
+ page1_flags = get_kpageflags(page1_flags & PFN_MASK);
+ page2_flags = get_kpageflags(page2_flags & PFN_MASK);
+
+ /* Page 1 should be unevictable */
+ if ((page1_flags & UNEVICTABLE_BIT) == 0) {
+ printf("Failed to make present page unevictable\n");
+ goto unmap;
+ }
+
+ if (!is_vmflag_set((unsigned long)map, LOCKEDONFAULT) ||
+ !is_vmflag_set((unsigned long)map + page_size, LOCKEDONFAULT)) {
+ printf("VMA flag %s is missing for one of the pages\n", LOCKEDONFAULT);
+ goto unmap;
+ }
+ ret = 0;
+unmap:
+ munmap(map, 2 * page_size);
+out:
+ return ret;
+}
+
+static int test_munlockall()
+{
+ char *map;
+ int ret = 1;
+ unsigned long page_size = getpagesize();
+
+ map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+
+ if (map == MAP_FAILED) {
+ perror("test_munlockall mmap");
+ goto out;
+ }
+
+ if (mlockall(MCL_CURRENT)) {
+ perror("mlockall(MCL_CURRENT)");
+ goto out;
+ }
+
+ if (lock_check(map))
+ goto unmap;
+
+ if (munlockall()) {
+ perror("munlockall()");
+ goto unmap;
+ }
+
+ if (unlock_lock_check(map))
+ goto unmap;
+
+ munmap(map, 2 * page_size);
+
+ map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+
+ if (map == MAP_FAILED) {
+ perror("test_munlockall second mmap");
+ goto out;
+ }
+
+ if (mlockall(MCL_CURRENT | MCL_ONFAULT)) {
+ perror("mlockall(MCL_CURRENT | MCL_ONFAULT)");
+ goto unmap;
+ }
+
+ if (onfault_check(map))
+ goto unmap;
+
+ if (munlockall()) {
+ perror("munlockall()");
+ goto unmap;
+ }
+
+ if (unlock_onfault_check(map))
+ goto unmap;
+
+ if (mlockall(MCL_CURRENT | MCL_FUTURE)) {
+ perror("mlockall(MCL_CURRENT | MCL_FUTURE)");
+ goto out;
+ }
+
+ if (lock_check(map))
+ goto unmap;
+
+ if (munlockall()) {
+ perror("munlockall()");
+ goto unmap;
+ }
+
+ ret = unlock_lock_check(map);
+
+unmap:
+ munmap(map, 2 * page_size);
+out:
+ munlockall();
+ return ret;
+}
+
+static int test_vma_management(bool call_mlock)
+{
+ int ret = 1;
+ void *map;
+ unsigned long page_size = getpagesize();
+ struct vm_boundaries page1;
+ struct vm_boundaries page2;
+ struct vm_boundaries page3;
+
+ map = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+ if (map == MAP_FAILED) {
+ perror("mmap()");
+ return ret;
+ }
+
+ if (call_mlock && mlock2_(map, 3 * page_size, MLOCK_ONFAULT)) {
+ if (errno == ENOSYS) {
+ printf("Cannot call new mlock family, skipping test\n");
+ _exit(0);
+ }
+ perror("mlock(ONFAULT)\n");
+ goto out;
+ }
+
+ if (get_vm_area((unsigned long)map, &page1) ||
+ get_vm_area((unsigned long)map + page_size, &page2) ||
+ get_vm_area((unsigned long)map + page_size * 2, &page3)) {
+ printf("couldn't find mapping in /proc/self/maps\n");
+ goto out;
+ }
+
+ /*
+	 * Before we unlock a portion, we need to confirm that all three pages
+	 * are in the same VMA.  If they are not, we abort this test (note that
+	 * this is not a failure).
+ */
+ if (page1.start != page2.start || page2.start != page3.start) {
+ printf("VMAs are not merged to start, aborting test\n");
+ ret = 0;
+ goto out;
+ }
+
+ if (munlock(map + page_size, page_size)) {
+ perror("munlock()");
+ goto out;
+ }
+
+ if (get_vm_area((unsigned long)map, &page1) ||
+ get_vm_area((unsigned long)map + page_size, &page2) ||
+ get_vm_area((unsigned long)map + page_size * 2, &page3)) {
+ printf("couldn't find mapping in /proc/self/maps\n");
+ goto out;
+ }
+
+ /* All three VMAs should be different */
+ if (page1.start == page2.start || page2.start == page3.start) {
+ printf("failed to split VMA for munlock\n");
+ goto out;
+ }
+
+ /* Now unlock the first and third page and check the VMAs again */
+ if (munlock(map, page_size * 3)) {
+ perror("munlock()");
+ goto out;
+ }
+
+ if (get_vm_area((unsigned long)map, &page1) ||
+ get_vm_area((unsigned long)map + page_size, &page2) ||
+ get_vm_area((unsigned long)map + page_size * 2, &page3)) {
+ printf("couldn't find mapping in /proc/self/maps\n");
+ goto out;
+ }
+
+ /* Now all three VMAs should be the same */
+ if (page1.start != page2.start || page2.start != page3.start) {
+ printf("failed to merge VMAs after munlock\n");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ munmap(map, 3 * page_size);
+ return ret;
+}
+
+static int test_mlockall(int (test_function)(bool call_mlock))
+{
+ int ret = 1;
+
+ if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE)) {
+ perror("mlockall");
+ return ret;
+ }
+
+ ret = test_function(false);
+ munlockall();
+ return ret;
+}
+
+int main(int argc, char **argv)
+{
+ int ret = 0;
+ ret += test_mlock_lock();
+ ret += test_mlock_onfault();
+ ret += test_munlockall();
+ ret += test_lock_onfault_of_present();
+ ret += test_vma_management(true);
+ ret += test_mlockall(test_vma_management);
+ return ret;
+}
+
diff --git a/tools/testing/selftests/vm/on-fault-limit.c b/tools/testing/selftests/vm/on-fault-limit.c
new file mode 100644
index 000000000000..245acccce42d
--- /dev/null
+++ b/tools/testing/selftests/vm/on-fault-limit.c
@@ -0,0 +1,47 @@
+#include <sys/mman.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#ifndef MCL_ONFAULT
+#define MCL_ONFAULT (MCL_FUTURE << 1)
+#endif
+
+static int test_limit(void)
+{
+ int ret = 1;
+ struct rlimit lims;
+ void *map;
+
+ if (getrlimit(RLIMIT_MEMLOCK, &lims)) {
+ perror("getrlimit");
+ return ret;
+ }
+
+ if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE)) {
+ perror("mlockall");
+ return ret;
+ }
+
+ map = mmap(NULL, 2 * lims.rlim_max, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, 0, 0);
+ if (map != MAP_FAILED)
+ printf("mmap should have failed, but didn't\n");
+ else {
+ ret = 0;
+ munmap(map, 2 * lims.rlim_max);
+ }
+
+ munlockall();
+ return ret;
+}
+
+int main(int argc, char **argv)
+{
+ int ret = 0;
+
+ ret += test_limit();
+ return ret;
+}
diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests
index 49ece11ff7fd..2df21b3bb26d 100755
--- a/tools/testing/selftests/vm/run_vmtests
+++ b/tools/testing/selftests/vm/run_vmtests
@@ -75,10 +75,14 @@ else
echo "[PASS]"
fi
+echo "NOTE: The above hugetlb tests provide minimal coverage. Use"
+echo " https://github.com/libhugetlbfs/libhugetlbfs.git for"
+echo " hugetlb regression testing."
+
echo "--------------------"
-echo "running hugetlbfstest"
+echo "running userfaultfd"
echo "--------------------"
-./hugetlbfstest
+./userfaultfd 128 32
if [ $? -ne 0 ]; then
echo "[FAIL]"
exitcode=1
@@ -102,4 +106,26 @@ else
echo "[PASS]"
fi
+echo "--------------------"
+echo "running on-fault-limit"
+echo "--------------------"
+sudo -u nobody ./on-fault-limit
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
+echo "--------------------"
+echo "running mlock2-tests"
+echo "--------------------"
+./mlock2-tests
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
exit $exitcode
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
new file mode 100644
index 000000000000..2bf1fc3f562b
--- /dev/null
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -0,0 +1,638 @@
+/*
+ * Stress userfaultfd syscall.
+ *
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * This test allocates two virtual areas and bounces the physical
+ * memory across the two virtual areas (from area_src to area_dst)
+ * using userfaultfd.
+ *
+ * There are three threads running per CPU:
+ *
+ * 1) one per-CPU thread takes a per-page pthread_mutex in a random
+ * page of the area_dst (while the physical page may still be in
+ * area_src), and increments a per-page counter in the same page,
+ * and checks its value against a verification region.
+ *
+ * 2) another per-CPU thread handles the userfaults generated by
+ * thread 1 above; the userfaultfd blocking-read and poll() modes
+ * are exercised in an interleaved fashion across bounces.
+ *
+ * 3) one last per-CPU thread transfers the memory in the background
+ * at maximum bandwidth (if not already transferred by thread
+ * 2). Each cpu thread takes care of transferring a portion of the
+ * area.
+ *
+ * When all threads of type 3 completed the transfer, one bounce is
+ * complete. area_src and area_dst are then swapped. All threads are
+ * respawned and so the bounce is immediately restarted in the
+ * opposite direction.
+ *
+ * The per-CPU threads of type 1, by triggering userfaults inside
+ * pthread_mutex_lock, also verify the atomicity of the memory
+ * transfer (UFFDIO_COPY).
+ *
+ * The program takes two parameters: the amounts of physical memory in
+ * megabytes (MiB) of the area and the number of bounces to execute.
+ *
+ * # 100MiB 99999 bounces
+ * ./userfaultfd 100 99999
+ *
+ * # 1GiB 99 bounces
+ * ./userfaultfd 1000 99
+ *
+ * # 10MiB-~6GiB 999 bounces, continue forever unless an error triggers
+ * while ./userfaultfd $[RANDOM % 6000 + 10] 999; do true; done
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <signal.h>
+#include <poll.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <pthread.h>
+#include <linux/userfaultfd.h>
+
+#ifndef __NR_userfaultfd
+#ifdef __x86_64__
+#define __NR_userfaultfd 323
+#elif defined(__i386__)
+#define __NR_userfaultfd 374
+#elif defined(__powerpc__)
+#define __NR_userfaultfd 364
+#else
+#error "missing __NR_userfaultfd definition"
+#endif
+#endif
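+/*
+ * The fallback numbers above are only used when the toolchain's
+ * unistd.h does not yet define __NR_userfaultfd; they must match the
+ * kernel's per-architecture syscall tables.
+ */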
+
+static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
+
+#define BOUNCE_RANDOM (1<<0)
+#define BOUNCE_RACINGFAULTS (1<<1)
+#define BOUNCE_VERIFY (1<<2)
+#define BOUNCE_POLL (1<<3)
+static int bounces;
+
+static unsigned long long *count_verify;
+static int uffd, finished, *pipefd;
+static char *area_src, *area_dst;
+static char *zeropage;
+pthread_attr_t attr;
+
+/* pthread_mutex_t starts at page offset 0 */
+#define area_mutex(___area, ___nr) \
+ ((pthread_mutex_t *) ((___area) + (___nr)*page_size))
+/*
+ * count is placed in the page after the pthread_mutex_t, naturally
+ * aligned to avoid unaligned-access faults on non-x86 archs.
+ */
+#define area_count(___area, ___nr) \
+ ((volatile unsigned long long *) ((unsigned long) \
+ ((___area) + (___nr)*page_size + \
+ sizeof(pthread_mutex_t) + \
+ sizeof(unsigned long long) - 1) & \
+ ~(unsigned long)(sizeof(unsigned long long) \
+ - 1)))
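+/*
+ * For example, assuming a 40-byte pthread_mutex_t (typical on
+ * x86_64/glibc, though the size is implementation defined),
+ * area_count(area, 2) resolves to byte offset 2*page_size + 40, which
+ * is already 8-byte aligned; an odd mutex size would be rounded up to
+ * the next sizeof(unsigned long long) boundary.
+ */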
+
+static int my_bcmp(char *str1, char *str2, size_t n)
+{
+ unsigned long i;
+ for (i = 0; i < n; i++)
+ if (str1[i] != str2[i])
+ return 1;
+ return 0;
+}
+
+static void *locking_thread(void *arg)
+{
+ unsigned long cpu = (unsigned long) arg;
+ struct random_data rand;
+ unsigned long page_nr = *(&(page_nr)); /* uninitialized warning */
+ int32_t rand_nr;
+ unsigned long long count;
+ char randstate[64];
+ unsigned int seed;
+ time_t start;
+
+ if (bounces & BOUNCE_RANDOM) {
+ seed = (unsigned int) time(NULL) - bounces;
+ if (!(bounces & BOUNCE_RACINGFAULTS))
+ seed += cpu;
+ bzero(&rand, sizeof(rand));
+ bzero(&randstate, sizeof(randstate));
+ if (initstate_r(seed, randstate, sizeof(randstate), &rand))
+ fprintf(stderr, "srandom_r error\n"), exit(1);
+ } else {
+ page_nr = -bounces;
+ if (!(bounces & BOUNCE_RACINGFAULTS))
+ page_nr += cpu * nr_pages_per_cpu;
+ }
+
+ while (!finished) {
+ if (bounces & BOUNCE_RANDOM) {
+ if (random_r(&rand, &rand_nr))
+ fprintf(stderr, "random_r 1 error\n"), exit(1);
+ page_nr = rand_nr;
+ if (sizeof(page_nr) > sizeof(rand_nr)) {
+ if (random_r(&rand, &rand_nr))
+ fprintf(stderr, "random_r 2 error\n"), exit(1);
+ page_nr |= ((unsigned long) rand_nr) << 32;
+ }
+ } else
+ page_nr += 1;
+ page_nr %= nr_pages;
+
+ start = time(NULL);
+ if (bounces & BOUNCE_VERIFY) {
+ count = *area_count(area_dst, page_nr);
+ if (!count)
+ fprintf(stderr,
+ "page_nr %lu wrong count %Lu %Lu\n",
+ page_nr, count,
+ count_verify[page_nr]), exit(1);
+
+
+ /*
+ * We can't use bcmp (or memcmp) because that
+ * returns 0 erroneously if the memory is
+ * changing under it (even if the end of the
+ * page is never changing and always
+ * different).
+ */
+#if 1
+ if (!my_bcmp(area_dst + page_nr * page_size, zeropage,
+ page_size))
+ fprintf(stderr,
+ "my_bcmp page_nr %lu wrong count %Lu %Lu\n",
+ page_nr, count,
+ count_verify[page_nr]), exit(1);
+#else
+ unsigned long loops;
+
+ loops = 0;
+ /* uncomment the below line to test with mutex */
+ /* pthread_mutex_lock(area_mutex(area_dst, page_nr)); */
+ while (!bcmp(area_dst + page_nr * page_size, zeropage,
+ page_size)) {
+ loops += 1;
+ if (loops > 10)
+ break;
+ }
+ /* uncomment below line to test with mutex */
+ /* pthread_mutex_unlock(area_mutex(area_dst, page_nr)); */
+ if (loops) {
+ fprintf(stderr,
+ "page_nr %lu all zero thread %lu %p %lu\n",
+ page_nr, cpu, area_dst + page_nr * page_size,
+ loops);
+ if (loops > 10)
+ exit(1);
+ }
+#endif
+ }
+
+ pthread_mutex_lock(area_mutex(area_dst, page_nr));
+ count = *area_count(area_dst, page_nr);
+ if (count != count_verify[page_nr]) {
+ fprintf(stderr,
+ "page_nr %lu memory corruption %Lu %Lu\n",
+ page_nr, count,
+ count_verify[page_nr]), exit(1);
+ }
+ count++;
+ *area_count(area_dst, page_nr) = count_verify[page_nr] = count;
+ pthread_mutex_unlock(area_mutex(area_dst, page_nr));
+
+ if (time(NULL) - start > 1)
+ fprintf(stderr,
+ "userfault too slow %ld "
+ "possible false positive with overcommit\n",
+ time(NULL) - start);
+ }
+
+ return NULL;
+}
+
+static int copy_page(unsigned long offset)
+{
+ struct uffdio_copy uffdio_copy;
+
+ if (offset >= nr_pages * page_size)
+ fprintf(stderr, "unexpected offset %lu\n",
+ offset), exit(1);
+ uffdio_copy.dst = (unsigned long) area_dst + offset;
+ uffdio_copy.src = (unsigned long) area_src + offset;
+ uffdio_copy.len = page_size;
+ uffdio_copy.mode = 0;
+ uffdio_copy.copy = 0;
+ if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy)) {
+ /* real retval in uffdio_copy.copy */
+ if (uffdio_copy.copy != -EEXIST)
+ fprintf(stderr, "UFFDIO_COPY error %Ld\n",
+ uffdio_copy.copy), exit(1);
+ } else if (uffdio_copy.copy != page_size) {
+ fprintf(stderr, "UFFDIO_COPY unexpected copy %Ld\n",
+ uffdio_copy.copy), exit(1);
+ } else
+ return 1;
+ return 0;
+}
+
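+/*
+ * Poll-mode userfault handler (type-2 thread): pollfd[0] waits for
+ * userfault messages, while pollfd[1] watches a per-CPU pipe that the
+ * main thread writes a byte to in stress() to tell the handler to
+ * exit once the bounce has completed.
+ */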
+static void *uffd_poll_thread(void *arg)
+{
+ unsigned long cpu = (unsigned long) arg;
+ struct pollfd pollfd[2];
+ struct uffd_msg msg;
+ int ret;
+ unsigned long offset;
+ char tmp_chr;
+ unsigned long userfaults = 0;
+
+ pollfd[0].fd = uffd;
+ pollfd[0].events = POLLIN;
+ pollfd[1].fd = pipefd[cpu*2];
+ pollfd[1].events = POLLIN;
+
+ for (;;) {
+ ret = poll(pollfd, 2, -1);
+ if (!ret)
+ fprintf(stderr, "poll error %d\n", ret), exit(1);
+ if (ret < 0)
+ perror("poll"), exit(1);
+ if (pollfd[1].revents & POLLIN) {
+ if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
+ fprintf(stderr, "read pipefd error\n"),
+ exit(1);
+ break;
+ }
+ if (!(pollfd[0].revents & POLLIN))
+ fprintf(stderr, "pollfd[0].revents %d\n",
+ pollfd[0].revents), exit(1);
+ ret = read(uffd, &msg, sizeof(msg));
+ if (ret < 0) {
+ if (errno == EAGAIN)
+ continue;
+ perror("nonblocking read error"), exit(1);
+ }
+ if (msg.event != UFFD_EVENT_PAGEFAULT)
+ fprintf(stderr, "unexpected msg event %u\n",
+ msg.event), exit(1);
+ if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
+ fprintf(stderr, "unexpected write fault\n"), exit(1);
+ offset = (char *)msg.arg.pagefault.address - area_dst;
+ offset &= ~(page_size-1);
+ if (copy_page(offset))
+ userfaults++;
+ }
+ return (void *)userfaults;
+}
+
+pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static void *uffd_read_thread(void *arg)
+{
+ unsigned long *this_cpu_userfaults;
+ struct uffd_msg msg;
+ unsigned long offset;
+ int ret;
+
+ this_cpu_userfaults = (unsigned long *) arg;
+ *this_cpu_userfaults = 0;
+
+ pthread_mutex_unlock(&uffd_read_mutex);
+ /* from here cancellation is ok */
+
+ for (;;) {
+ ret = read(uffd, &msg, sizeof(msg));
+ if (ret != sizeof(msg)) {
+ if (ret < 0)
+ perror("blocking read error"), exit(1);
+ else
+ fprintf(stderr, "short read\n"), exit(1);
+ }
+ if (msg.event != UFFD_EVENT_PAGEFAULT)
+ fprintf(stderr, "unexpected msg event %u\n",
+ msg.event), exit(1);
+ if (bounces & BOUNCE_VERIFY &&
+ msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
+ fprintf(stderr, "unexpected write fault\n"), exit(1);
+ offset = (char *)msg.arg.pagefault.address - area_dst;
+ offset &= ~(page_size-1);
+ if (copy_page(offset))
+ (*this_cpu_userfaults)++;
+ }
+ return (void *)NULL;
+}
+
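+/*
+ * Background copier (type-3 thread): transfers this CPU's share of the
+ * pages at full speed; copies that lose the race with a userfault
+ * handler simply see -EEXIST inside copy_page() and move on.
+ */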
+static void *background_thread(void *arg)
+{
+ unsigned long cpu = (unsigned long) arg;
+ unsigned long page_nr;
+
+ for (page_nr = cpu * nr_pages_per_cpu;
+ page_nr < (cpu+1) * nr_pages_per_cpu;
+ page_nr++)
+ copy_page(page_nr * page_size);
+
+ return NULL;
+}
+
+static int stress(unsigned long *userfaults)
+{
+ unsigned long cpu;
+ pthread_t locking_threads[nr_cpus];
+ pthread_t uffd_threads[nr_cpus];
+ pthread_t background_threads[nr_cpus];
+ void **_userfaults = (void **) userfaults;
+
+ finished = 0;
+ for (cpu = 0; cpu < nr_cpus; cpu++) {
+ if (pthread_create(&locking_threads[cpu], &attr,
+ locking_thread, (void *)cpu))
+ return 1;
+ if (bounces & BOUNCE_POLL) {
+ if (pthread_create(&uffd_threads[cpu], &attr,
+ uffd_poll_thread, (void *)cpu))
+ return 1;
+ } else {
+ if (pthread_create(&uffd_threads[cpu], &attr,
+ uffd_read_thread,
+ &_userfaults[cpu]))
+ return 1;
+ pthread_mutex_lock(&uffd_read_mutex);
+ }
+ if (pthread_create(&background_threads[cpu], &attr,
+ background_thread, (void *)cpu))
+ return 1;
+ }
+ for (cpu = 0; cpu < nr_cpus; cpu++)
+ if (pthread_join(background_threads[cpu], NULL))
+ return 1;
+
+ /*
+ * Be strict and immediately zap area_src, the whole area has
+ * been transferred already by the background threads. The
+ * area_src could then be faulted in in a racy way by still
+ * running uffdio_threads reading zeropages after we zapped
+ * area_src (but they're guaranteed to get -EEXIST from
+ * UFFDIO_COPY without writing zero pages into area_dst
+ * because the background threads already completed).
+ */
+ if (madvise(area_src, nr_pages * page_size, MADV_DONTNEED)) {
+ perror("madvise");
+ return 1;
+ }
+
+ for (cpu = 0; cpu < nr_cpus; cpu++) {
+ char c;
+ if (bounces & BOUNCE_POLL) {
+ if (write(pipefd[cpu*2+1], &c, 1) != 1) {
+ fprintf(stderr, "pipefd write error\n");
+ return 1;
+ }
+ if (pthread_join(uffd_threads[cpu], &_userfaults[cpu]))
+ return 1;
+ } else {
+ if (pthread_cancel(uffd_threads[cpu]))
+ return 1;
+ if (pthread_join(uffd_threads[cpu], NULL))
+ return 1;
+ }
+ }
+
+ finished = 1;
+ for (cpu = 0; cpu < nr_cpus; cpu++)
+ if (pthread_join(locking_threads[cpu], NULL))
+ return 1;
+
+ return 0;
+}
+
+static int userfaultfd_stress(void)
+{
+ void *area;
+ char *tmp_area;
+ unsigned long nr;
+ struct uffdio_register uffdio_register;
+ struct uffdio_api uffdio_api;
+ unsigned long cpu;
+ int uffd_flags;
+ unsigned long userfaults[nr_cpus];
+
+ if (posix_memalign(&area, page_size, nr_pages * page_size)) {
+ fprintf(stderr, "out of memory\n");
+ return 1;
+ }
+ area_src = area;
+ if (posix_memalign(&area, page_size, nr_pages * page_size)) {
+ fprintf(stderr, "out of memory\n");
+ return 1;
+ }
+ area_dst = area;
+
+ uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+ if (uffd < 0) {
+ fprintf(stderr,
+ "userfaultfd syscall not available in this kernel\n");
+ return 1;
+ }
+ uffd_flags = fcntl(uffd, F_GETFD, NULL);
+
+ uffdio_api.api = UFFD_API;
+ uffdio_api.features = 0;
+ if (ioctl(uffd, UFFDIO_API, &uffdio_api)) {
+ fprintf(stderr, "UFFDIO_API\n");
+ return 1;
+ }
+ if (uffdio_api.api != UFFD_API) {
+ fprintf(stderr, "UFFDIO_API error %Lu\n", uffdio_api.api);
+ return 1;
+ }
+
+ count_verify = malloc(nr_pages * sizeof(unsigned long long));
+ if (!count_verify) {
+ perror("count_verify");
+ return 1;
+ }
+
+ for (nr = 0; nr < nr_pages; nr++) {
+ *area_mutex(area_src, nr) = (pthread_mutex_t)
+ PTHREAD_MUTEX_INITIALIZER;
+ count_verify[nr] = *area_count(area_src, nr) = 1;
+ }
+
+ pipefd = malloc(sizeof(int) * nr_cpus * 2);
+ if (!pipefd) {
+ perror("pipefd");
+ return 1;
+ }
+ for (cpu = 0; cpu < nr_cpus; cpu++) {
+ if (pipe2(&pipefd[cpu*2], O_CLOEXEC | O_NONBLOCK)) {
+ perror("pipe");
+ return 1;
+ }
+ }
+
+ if (posix_memalign(&area, page_size, page_size)) {
+ fprintf(stderr, "out of memory\n");
+ return 1;
+ }
+ zeropage = area;
+ bzero(zeropage, page_size);
+
+ pthread_mutex_lock(&uffd_read_mutex);
+
+ pthread_attr_init(&attr);
+ pthread_attr_setstacksize(&attr, 16*1024*1024);
+
+ while (bounces--) {
+ unsigned long expected_ioctls;
+
+ printf("bounces: %d, mode:", bounces);
+ if (bounces & BOUNCE_RANDOM)
+ printf(" rnd");
+ if (bounces & BOUNCE_RACINGFAULTS)
+ printf(" racing");
+ if (bounces & BOUNCE_VERIFY)
+ printf(" ver");
+ if (bounces & BOUNCE_POLL)
+ printf(" poll");
+ printf(", ");
+ fflush(stdout);
+
+ if (bounces & BOUNCE_POLL)
+ fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
+ else
+ fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);
+
+ /* register */
+ uffdio_register.range.start = (unsigned long) area_dst;
+ uffdio_register.range.len = nr_pages * page_size;
+ uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+ if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
+ fprintf(stderr, "register failure\n");
+ return 1;
+ }
+ expected_ioctls = (1 << _UFFDIO_WAKE) |
+ (1 << _UFFDIO_COPY) |
+ (1 << _UFFDIO_ZEROPAGE);
+ if ((uffdio_register.ioctls & expected_ioctls) !=
+ expected_ioctls) {
+ fprintf(stderr,
+ "unexpected missing ioctl for anon memory\n");
+ return 1;
+ }
+
+ /*
+ * The madvise done previously isn't enough: some
+ * uffd_thread could have read userfaults (one of
+ * those already resolved by the background thread)
+ * and it may be in the process of calling
+ * UFFDIO_COPY. UFFDIO_COPY will read the zapped
+ * area_src and it would map a zero page in it (of
+ * course such a UFFDIO_COPY is perfectly safe as it'd
+ * return -EEXIST). The problem comes at the next
+ * bounce though: that racing UFFDIO_COPY would
+ * generate zeropages in the area_src, thus invalidating
+ * the previous MADV_DONTNEED. Without this additional
+ * MADV_DONTNEED, those zeropage leftovers in the
+ * area_src would lead to -EEXIST failures during the
+ * next bounce, effectively leaving a zeropage in the
+ * area_dst.
+ *
+ * Try commenting out this madvise to see the memory
+ * corruption being caught pretty quickly.
+ *
+ * khugepaged is only inhibited from collapsing THP in
+ * this range after the UFFDIO_REGISTER, which is another
+ * reason the MADV_DONTNEED must be done here, after
+ * registration, and not earlier.
+ */
+ if (madvise(area_dst, nr_pages * page_size, MADV_DONTNEED)) {
+ perror("madvise 2");
+ return 1;
+ }
+
+ /* bounce pass */
+ if (stress(userfaults))
+ return 1;
+
+ /* unregister */
+ if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) {
+ fprintf(stderr, "register failure\n");
+ return 1;
+ }
+
+ /* verification */
+ if (bounces & BOUNCE_VERIFY) {
+ for (nr = 0; nr < nr_pages; nr++) {
+ if (my_bcmp(area_dst,
+ area_dst + nr * page_size,
+ sizeof(pthread_mutex_t))) {
+ fprintf(stderr,
+ "error mutex 2 %lu\n",
+ nr);
+ bounces = 0;
+ }
+ if (*area_count(area_dst, nr) != count_verify[nr]) {
+ fprintf(stderr,
+ "error area_count %Lu %Lu %lu\n",
+ *area_count(area_src, nr),
+ count_verify[nr],
+ nr);
+ bounces = 0;
+ }
+ }
+ }
+
+ /* prepare next bounce */
+ tmp_area = area_src;
+ area_src = area_dst;
+ area_dst = tmp_area;
+
+ printf("userfaults:");
+ for (cpu = 0; cpu < nr_cpus; cpu++)
+ printf(" %lu", userfaults[cpu]);
+ printf("\n");
+ }
+
+ return 0;
+}
+
+int main(int argc, char **argv)
+{
+ if (argc < 3)
+ fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
+ nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+ page_size = sysconf(_SC_PAGE_SIZE);
+ if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) >
+ page_size)
+ fprintf(stderr, "Impossible to run this test\n"), exit(2);
+ nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size /
+ nr_cpus;
+ if (!nr_pages_per_cpu) {
+ fprintf(stderr, "invalid MiB\n");
+ fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
+ }
+ bounces = atoi(argv[2]);
+ if (bounces <= 0) {
+ fprintf(stderr, "invalid bounces\n");
+ fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
+ }
+ nr_pages = nr_pages_per_cpu * nr_cpus;
+ printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
+ nr_pages, nr_pages_per_cpu);
+ return userfaultfd_stress();
+}
diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c
index 8bdf16b8ba60..7f73fa32a590 100644
--- a/tools/vm/page-types.c
+++ b/tools/vm/page-types.c
@@ -57,23 +57,15 @@
* pagemap kernel ABI bits
*/
-#define PM_ENTRY_BYTES sizeof(uint64_t)
-#define PM_STATUS_BITS 3
-#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS)
-#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
-#define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
-#define PM_PSHIFT_BITS 6
-#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
-#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
-#define __PM_PSHIFT(x) (((uint64_t) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
-#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1)
-#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
-
-#define __PM_SOFT_DIRTY (1LL)
-#define PM_PRESENT PM_STATUS(4LL)
-#define PM_SWAP PM_STATUS(2LL)
-#define PM_SOFT_DIRTY __PM_PSHIFT(__PM_SOFT_DIRTY)
-
+#define PM_ENTRY_BYTES 8
+#define PM_PFRAME_BITS 55
+#define PM_PFRAME_MASK ((1LL << PM_PFRAME_BITS) - 1)
+#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
+#define PM_SOFT_DIRTY (1ULL << 55)
+#define PM_MMAP_EXCLUSIVE (1ULL << 56)
+#define PM_FILE (1ULL << 61)
+#define PM_SWAP (1ULL << 62)
+#define PM_PRESENT (1ULL << 63)
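+/*
+ * Bits 0-54 of a pagemap entry hold the page frame number (or the swap
+ * type/offset when PM_SWAP is set); bits 57-60 are currently unused.
+ */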
/*
* kernel page flags
@@ -100,6 +92,8 @@
#define KPF_SLOB_FREE 49
#define KPF_SLUB_FROZEN 50
#define KPF_SLUB_DEBUG 51
+#define KPF_FILE 62
+#define KPF_MMAP_EXCLUSIVE 63
#define KPF_ALL_BITS ((uint64_t)~0ULL)
#define KPF_HACKERS_BITS (0xffffULL << 32)
@@ -149,6 +143,9 @@ static const char * const page_flag_names[] = {
[KPF_SLOB_FREE] = "P:slob_free",
[KPF_SLUB_FROZEN] = "A:slub_frozen",
[KPF_SLUB_DEBUG] = "E:slub_debug",
+
+ [KPF_FILE] = "F:file",
+ [KPF_MMAP_EXCLUSIVE] = "1:mmap_exclusive",
};
@@ -452,6 +449,10 @@ static uint64_t expand_overloaded_flags(uint64_t flags, uint64_t pme)
if (pme & PM_SOFT_DIRTY)
flags |= BIT(SOFTDIRTY);
+ if (pme & PM_FILE)
+ flags |= BIT(FILE);
+ if (pme & PM_MMAP_EXCLUSIVE)
+ flags |= BIT(MMAP_EXCLUSIVE);
return flags;
}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d8db2f8fce9c..268fc0a5a932 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -387,6 +387,36 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
return young;
}
+static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ int young, idx;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ spin_lock(&kvm->mmu_lock);
+ /*
+ * Even though we do not flush TLB, this will still adversely
+ * affect performance on pre-Haswell Intel EPT, where there is
+ * no EPT Access Bit to clear, so we have to tear down EPT
+ * tables instead. If we find this unacceptable, we can always
+ * add a parameter to kvm_age_hva so that it effectively doesn't
+ * do anything on clear_young.
+ *
+ * Also note that currently we never issue secondary TLB flushes
+ * from clear_young, leaving this job up to the regular system
+ * cadence. If we find this inaccurate, we might come up with a
+ * more sophisticated heuristic later.
+ */
+ young = kvm_age_hva(kvm, start, end);
+ spin_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ return young;
+}
+
static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long address)
@@ -419,6 +449,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
.invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
.invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
.clear_flush_young = kvm_mmu_notifier_clear_flush_young,
+ .clear_young = kvm_mmu_notifier_clear_young,
.test_young = kvm_mmu_notifier_test_young,
.change_pte = kvm_mmu_notifier_change_pte,
.release = kvm_mmu_notifier_release,