aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMark Brown <broonie@kernel.org>2015-12-03 00:23:12 +0000
committerMark Brown <broonie@kernel.org>2015-12-03 00:23:12 +0000
commit6ba8c039ff230ed2a4fd67054b4f81cc34c6278b (patch)
treec800ca72db2c381e2033ab5970d8b2491cc9e44f
parentbcf8c8f9d0249881c872d3e4f9f9236c366775d3 (diff)
parent7d182af8dd2051455c0a8fdedc6a9d3f504c2b55 (diff)
Merge branch 'akpm-current/current'
-rw-r--r--Documentation/features/vm/pmdp_splitting_flush/arch-support.txt40
-rw-r--r--Documentation/filesystems/proc.txt21
-rw-r--r--Documentation/filesystems/vfat.txt10
-rw-r--r--Documentation/sysctl/vm.txt29
-rw-r--r--Documentation/vm/page_owner.txt9
-rw-r--r--Documentation/vm/transhuge.txt151
-rw-r--r--arch/Kconfig64
-rw-r--r--arch/arc/Kconfig3
-rw-r--r--arch/arc/mm/cache.c4
-rw-r--r--arch/arm/Kconfig15
-rw-r--r--arch/arm/include/asm/pgtable-3level.h9
-rw-r--r--arch/arm/lib/uaccess_with_memcpy.c5
-rw-r--r--arch/arm/mm/dma-mapping.c11
-rw-r--r--arch/arm/mm/flush.c17
-rw-r--r--arch/arm/mm/mmap.c3
-rw-r--r--arch/arm64/Kconfig23
-rw-r--r--arch/arm64/include/asm/pgtable.h8
-rw-r--r--arch/arm64/mm/flush.c16
-rw-r--r--arch/arm64/mm/mmap.c6
-rw-r--r--arch/frv/include/asm/io.h17
-rw-r--r--arch/metag/Kconfig3
-rw-r--r--arch/microblaze/Kconfig3
-rw-r--r--arch/mips/include/asm/pgtable-bits.h10
-rw-r--r--arch/mips/include/asm/pgtable.h18
-rw-r--r--arch/mips/mm/c-r4k.c3
-rw-r--r--arch/mips/mm/cache.c2
-rw-r--r--arch/mips/mm/gup.c17
-rw-r--r--arch/mips/mm/init.c6
-rw-r--r--arch/mips/mm/pgtable-64.c14
-rw-r--r--arch/mips/mm/tlbex.c1
-rw-r--r--arch/parisc/Kconfig3
-rw-r--r--arch/powerpc/Kconfig3
-rw-r--r--arch/powerpc/include/asm/fadump.h2
-rw-r--r--arch/powerpc/include/asm/pgtable-ppc64.h27
-rw-r--r--arch/powerpc/kernel/fadump.c4
-rw-r--r--arch/powerpc/mm/hugepage-hash64.c3
-rw-r--r--arch/powerpc/mm/hugetlbpage.c17
-rw-r--r--arch/powerpc/mm/pgtable_64.c49
-rw-r--r--arch/powerpc/mm/subpage-prot.c2
-rw-r--r--arch/powerpc/platforms/cell/spufs/inode.c2
-rw-r--r--arch/s390/Kconfig3
-rw-r--r--arch/s390/include/asm/pgtable.h16
-rw-r--r--arch/s390/mm/gup.c24
-rw-r--r--arch/s390/mm/pgtable.c21
-rw-r--r--arch/sh/Kconfig3
-rw-r--r--arch/sh/mm/cache-sh4.c2
-rw-r--r--arch/sh/mm/cache.c8
-rw-r--r--arch/sparc/Kconfig4
-rw-r--r--arch/sparc/include/asm/pgtable_64.h16
-rw-r--r--arch/sparc/mm/fault_64.c3
-rw-r--r--arch/sparc/mm/gup.c16
-rw-r--r--arch/tile/include/asm/pgtable.h10
-rw-r--r--arch/unicore32/Kconfig3
-rw-r--r--arch/x86/Kconfig19
-rw-r--r--arch/x86/include/asm/pgtable.h9
-rw-r--r--arch/x86/include/asm/pgtable_types.h2
-rw-r--r--arch/x86/kernel/machine_kexec_64.c1
-rw-r--r--arch/x86/kernel/vm86_32.c6
-rw-r--r--arch/x86/mm/gup.c17
-rw-r--r--arch/x86/mm/mmap.c12
-rw-r--r--arch/x86/mm/pgtable.c13
-rw-r--r--arch/xtensa/mm/tlb.c2
-rw-r--r--block/genhd.c2
-rw-r--r--drivers/acpi/apei/erst.c6
-rw-r--r--drivers/base/cpu.c10
-rw-r--r--drivers/block/drbd/drbd_bitmap.c26
-rw-r--r--drivers/block/drbd/drbd_int.h3
-rw-r--r--drivers/char/mspec.c15
-rw-r--r--drivers/firmware/broadcom/bcm47xx_nvram.c11
-rw-r--r--drivers/gpu/drm/drm_hashtab.c5
-rw-r--r--drivers/iio/industrialio-core.c9
-rw-r--r--drivers/net/wireless/intel/iwlwifi/dvm/calib.c2
-rw-r--r--drivers/soc/qcom/smd.c13
-rw-r--r--drivers/staging/lustre/include/linux/libcfs/libcfs_private.h8
-rw-r--r--drivers/staging/lustre/lustre/llite/super25.c3
-rw-r--r--fs/9p/v9fs.c2
-rw-r--r--fs/adfs/adfs.h28
-rw-r--r--fs/adfs/super.c2
-rw-r--r--fs/affs/super.c2
-rw-r--r--fs/afs/super.c2
-rw-r--r--fs/befs/linuxvfs.c2
-rw-r--r--fs/bfs/inode.c2
-rw-r--r--fs/block_dev.c4
-rw-r--r--fs/btrfs/inode.c3
-rw-r--r--fs/ceph/super.c4
-rw-r--r--fs/cifs/cifsfs.c2
-rw-r--r--fs/cifs/file.c8
-rw-r--r--fs/coda/coda_linux.h3
-rw-r--r--fs/coda/inode.c6
-rw-r--r--fs/dcache.c5
-rw-r--r--fs/ecryptfs/main.c6
-rw-r--r--fs/efs/super.c6
-rw-r--r--fs/exofs/inode.c5
-rw-r--r--fs/exofs/super.c4
-rw-r--r--fs/ext2/super.c2
-rw-r--r--fs/ext4/fsync.c5
-rw-r--r--fs/ext4/super.c2
-rw-r--r--fs/f2fs/super.c5
-rw-r--r--fs/fat/cache.c79
-rw-r--r--fs/fat/dir.c2
-rw-r--r--fs/fat/fat.h6
-rw-r--r--fs/fat/file.c61
-rw-r--r--fs/fat/inode.c77
-rw-r--r--fs/file.c7
-rw-r--r--fs/fuse/inode.c4
-rw-r--r--fs/gfs2/main.c3
-rw-r--r--fs/hfs/super.c4
-rw-r--r--fs/hfsplus/super.c2
-rw-r--r--fs/hostfs/hostfs_kern.c2
-rw-r--r--fs/hpfs/super.c2
-rw-r--r--fs/hugetlbfs/inode.c4
-rw-r--r--fs/inode.c2
-rw-r--r--fs/isofs/inode.c2
-rw-r--r--fs/jffs2/build.c8
-rw-r--r--fs/jffs2/fs.c5
-rw-r--r--fs/jffs2/super.c7
-rw-r--r--fs/jfs/super.c2
-rw-r--r--fs/kernfs/dir.c9
-rw-r--r--fs/logfs/inode.c3
-rw-r--r--fs/minix/inode.c2
-rw-r--r--fs/ncpfs/inode.c2
-rw-r--r--fs/nfs/inode.c2
-rw-r--r--fs/nfs/objlayout/objio_osd.c5
-rw-r--r--fs/nilfs2/super.c3
-rw-r--r--fs/notify/inode_mark.c3
-rw-r--r--fs/notify/mark.c66
-rw-r--r--fs/ntfs/super.c4
-rw-r--r--fs/ocfs2/alloc.c105
-rw-r--r--fs/ocfs2/aops.c1136
-rw-r--r--fs/ocfs2/aops.h18
-rw-r--r--fs/ocfs2/cluster/heartbeat.c4
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c24
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c1
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c2
-rw-r--r--fs/ocfs2/file.c173
-rw-r--r--fs/ocfs2/inode.c3
-rw-r--r--fs/ocfs2/inode.h6
-rw-r--r--fs/ocfs2/journal.c8
-rw-r--r--fs/ocfs2/localalloc.c23
-rw-r--r--fs/ocfs2/localalloc.h2
-rw-r--r--fs/ocfs2/mmap.c4
-rw-r--r--fs/ocfs2/ocfs2.h8
-rw-r--r--fs/ocfs2/ocfs2_trace.h16
-rw-r--r--fs/ocfs2/quota_global.c2
-rw-r--r--fs/ocfs2/resize.c2
-rw-r--r--fs/ocfs2/super.c66
-rw-r--r--fs/ocfs2/super.h2
-rw-r--r--fs/openpromfs/inode.c2
-rw-r--r--fs/proc/inode.c3
-rw-r--r--fs/proc/page.c4
-rw-r--r--fs/proc/task_mmu.c125
-rw-r--r--fs/qnx4/inode.c2
-rw-r--r--fs/qnx6/inode.c2
-rw-r--r--fs/reiserfs/super.c3
-rw-r--r--fs/romfs/super.c4
-rw-r--r--fs/squashfs/super.c3
-rw-r--r--fs/stat.c2
-rw-r--r--fs/sysv/inode.c2
-rw-r--r--fs/ubifs/super.c4
-rw-r--r--fs/udf/super.c10
-rw-r--r--fs/ufs/super.c2
-rw-r--r--fs/xattr.c33
-rw-r--r--fs/xfs/kmem.h1
-rw-r--r--fs/xfs/xfs_super.c4
-rw-r--r--include/asm-generic/memory_model.h4
-rw-r--r--include/asm-generic/pgtable.h9
-rw-r--r--include/asm-generic/sections.h65
-rw-r--r--include/linux/cpumask.h55
-rw-r--r--include/linux/crc64_ecma.h56
-rw-r--r--include/linux/fsnotify_backend.h5
-rw-r--r--include/linux/gfp.h20
-rw-r--r--include/linux/huge_mm.h61
-rw-r--r--include/linux/hugetlb.h10
-rw-r--r--include/linux/io.h1
-rw-r--r--include/linux/kdev_t.h5
-rw-r--r--include/linux/kernel.h36
-rw-r--r--include/linux/kexec.h2
-rw-r--r--include/linux/lz4.h4
-rw-r--r--include/linux/memblock.h4
-rw-r--r--include/linux/memcontrol.h43
-rw-r--r--include/linux/mempolicy.h4
-rw-r--r--include/linux/migrate.h6
-rw-r--r--include/linux/mm.h148
-rw-r--r--include/linux/mm_types.h27
-rw-r--r--include/linux/mmdebug.h7
-rw-r--r--include/linux/mmzone.h34
-rw-r--r--include/linux/page-flags.h286
-rw-r--r--include/linux/page_ext.h1
-rw-r--r--include/linux/page_owner.h50
-rw-r--r--include/linux/pagemap.h38
-rw-r--r--include/linux/pfn.h1
-rw-r--r--include/linux/poison.h6
-rw-r--r--include/linux/rmap.h35
-rw-r--r--include/linux/shm.h6
-rw-r--r--include/linux/shmem_fs.h4
-rw-r--r--include/linux/slab.h5
-rw-r--r--include/linux/stop_machine.h6
-rw-r--r--include/linux/string.h1
-rw-r--r--include/linux/swap.h5
-rw-r--r--include/linux/thread_info.h5
-rw-r--r--include/linux/vm_event_item.h4
-rw-r--r--include/linux/vmalloc.h1
-rw-r--r--include/trace/events/gfpflags.h14
-rw-r--r--include/trace/events/huge_memory.h166
-rw-r--r--include/trace/events/page_isolation.h38
-rw-r--r--include/trace/events/vmscan.h21
-rw-r--r--init/Kconfig7
-rw-r--r--init/do_mounts.h4
-rw-r--r--init/do_mounts_initrd.c6
-rw-r--r--init/main.c10
-rw-r--r--ipc/mqueue.c2
-rw-r--r--ipc/msg.c5
-rw-r--r--ipc/sem.c2
-rw-r--r--ipc/shm.c2
-rw-r--r--ipc/util.c11
-rw-r--r--ipc/util.h2
-rw-r--r--kernel/cpu.c64
-rw-r--r--kernel/cred.c4
-rw-r--r--kernel/delayacct.c2
-rw-r--r--kernel/events/uprobes.c13
-rw-r--r--kernel/exit.c3
-rw-r--r--kernel/fork.c22
-rw-r--r--kernel/futex.c63
-rw-r--r--kernel/pid.c2
-rw-r--r--kernel/printk/printk.c26
-rw-r--r--kernel/ptrace.c10
-rw-r--r--kernel/stop_machine.c4
-rw-r--r--kernel/sysctl.c22
-rw-r--r--lib/Kconfig7
-rw-r--r--lib/Kconfig.debug9
-rw-r--r--lib/Makefile3
-rw-r--r--lib/crc64_ecma.c341
-rw-r--r--lib/iomap_copy.c21
-rw-r--r--lib/string_helpers.c63
-rw-r--r--lib/test_hexdump.c (renamed from lib/test-hexdump.c)120
-rw-r--r--mm/backing-dev.c19
-rw-r--r--mm/compaction.c5
-rw-r--r--mm/debug.c52
-rw-r--r--mm/filemap.c25
-rw-r--r--mm/gup.c114
-rw-r--r--mm/huge_memory.c1447
-rw-r--r--mm/hugetlb.c17
-rw-r--r--mm/internal.h74
-rw-r--r--mm/kmemleak.c3
-rw-r--r--mm/ksm.c63
-rw-r--r--mm/memblock.c4
-rw-r--r--mm/memcontrol.c96
-rw-r--r--mm/memory-failure.c77
-rw-r--r--mm/memory.c107
-rw-r--r--mm/mempolicy.c70
-rw-r--r--mm/migrate.c44
-rw-r--r--mm/mincore.c2
-rw-r--r--mm/mlock.c14
-rw-r--r--mm/mmap.c24
-rw-r--r--mm/mmzone.c8
-rw-r--r--mm/mprotect.c2
-rw-r--r--mm/mremap.c15
-rw-r--r--mm/nommu.c2
-rw-r--r--mm/oom_kill.c15
-rw-r--r--mm/page_alloc.c129
-rw-r--r--mm/page_idle.c27
-rw-r--r--mm/page_isolation.c18
-rw-r--r--mm/page_owner.c82
-rw-r--r--mm/pagewalk.c2
-rw-r--r--mm/percpu.c18
-rw-r--r--mm/pgtable-generic.c21
-rw-r--r--mm/rmap.c314
-rw-r--r--mm/shmem.c108
-rw-r--r--mm/slab.h5
-rw-r--r--mm/slab_common.c3
-rw-r--r--mm/slub.c4
-rw-r--r--mm/swap.c274
-rw-r--r--mm/swap_state.c4
-rw-r--r--mm/swapfile.c16
-rw-r--r--mm/userfaultfd.c8
-rw-r--r--mm/util.c54
-rw-r--r--mm/vmalloc.c23
-rw-r--r--mm/vmpressure.c2
-rw-r--r--mm/vmscan.c29
-rw-r--r--mm/vmstat.c25
-rw-r--r--net/ipv4/fib_trie.c4
-rw-r--r--net/ipv4/tcp_memcontrol.c17
-rw-r--r--net/socket.c2
-rw-r--r--net/sunrpc/rpc_pipe.c2
-rwxr-xr-xscripts/checkpatch.pl49
-rwxr-xr-xscripts/tags.sh2
286 files changed, 5068 insertions, 3894 deletions
diff --git a/Documentation/features/vm/pmdp_splitting_flush/arch-support.txt b/Documentation/features/vm/pmdp_splitting_flush/arch-support.txt
deleted file mode 100644
index 26f74b457e0b..000000000000
--- a/Documentation/features/vm/pmdp_splitting_flush/arch-support.txt
+++ /dev/null
@@ -1,40 +0,0 @@
-#
-# Feature name: pmdp_splitting_flush
-# Kconfig: __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-# description: arch supports the pmdp_splitting_flush() VM API
-#
- -----------------------
- | arch |status|
- -----------------------
- | alpha: | TODO |
- | arc: | TODO |
- | arm: | ok |
- | arm64: | ok |
- | avr32: | TODO |
- | blackfin: | TODO |
- | c6x: | TODO |
- | cris: | TODO |
- | frv: | TODO |
- | h8300: | TODO |
- | hexagon: | TODO |
- | ia64: | TODO |
- | m32r: | TODO |
- | m68k: | TODO |
- | metag: | TODO |
- | microblaze: | TODO |
- | mips: | ok |
- | mn10300: | TODO |
- | nios2: | TODO |
- | openrisc: | TODO |
- | parisc: | TODO |
- | powerpc: | ok |
- | s390: | ok |
- | score: | TODO |
- | sh: | TODO |
- | sparc: | TODO |
- | tile: | TODO |
- | um: | TODO |
- | unicore32: | TODO |
- | x86: | ok |
- | xtensa: | TODO |
- -----------------------
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 402ab99e409f..ffcd49589ab5 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -169,6 +169,9 @@ read the file /proc/PID/status:
VmLck: 0 kB
VmHWM: 476 kB
VmRSS: 476 kB
+ RssAnon: 352 kB
+ RssFile: 120 kB
+ RssShmem: 4 kB
VmData: 156 kB
VmStk: 88 kB
VmExe: 68 kB
@@ -231,14 +234,20 @@ Table 1-2: Contents of the status files (as of 4.1)
VmSize total program size
VmLck locked memory size
VmHWM peak resident set size ("high water mark")
- VmRSS size of memory portions
+ VmRSS size of memory portions. It contains the three
+ following parts (VmRSS = RssAnon + RssFile + RssShmem)
+ RssAnon size of resident anonymous memory
+ RssFile size of resident file mappings
+ RssShmem size of resident shmem memory (includes SysV shm,
+ mapping of tmpfs and shared anonymous mappings)
VmData size of data, stack, and text segments
VmStk size of data, stack, and text segments
VmExe size of text segment
VmLib size of shared library code
VmPTE size of page table entries
VmPMD size of second level page tables
- VmSwap size of swap usage (the number of referred swapents)
+ VmSwap amount of swap used by anonymous private data
+ (shmem swap usage is not included)
HugetlbPages size of hugetlb memory portions
Threads number of threads
SigQ number of signals queued/max. number for queue
@@ -265,7 +274,8 @@ Table 1-3: Contents of the statm files (as of 2.6.8-rc3)
Field Content
size total program size (pages) (same as VmSize in status)
resident size of memory portions (pages) (same as VmRSS in status)
- shared number of pages that are shared (i.e. backed by a file)
+ shared number of pages that are shared (i.e. backed by a file, same
+ as RssFile+RssShmem in status)
trs number of pages that are 'code' (not including libs; broken,
includes data segment)
lrs number of pages of library (always 0 on 2.6)
@@ -459,7 +469,10 @@ and a page is modified, the file page is replaced by a private anonymous copy.
hugetlbfs page which is *not* counted in "RSS" or "PSS" field for historical
reasons. And these are not included in {Shared,Private}_{Clean,Dirty} field.
"Swap" shows how much would-be-anonymous memory is also used, but out on swap.
-"SwapPss" shows proportional swap share of this mapping.
+For shmem mappings, "Swap" includes also the size of the mapped (and not
+replaced by copy-on-write) part of the underlying shmem object out on swap.
+"SwapPss" shows proportional swap share of this mapping. Unlike "Swap", this
+does not take into account swapped out page of underlying shmem objects.
"Locked" indicates whether the mapping is locked in memory or not.
"VmFlags" field deserves a separate description. This member represents the kernel
diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt
index ce1126aceed8..223c32171dcc 100644
--- a/Documentation/filesystems/vfat.txt
+++ b/Documentation/filesystems/vfat.txt
@@ -180,6 +180,16 @@ dos1xfloppy -- If set, use a fallback default BIOS Parameter Block
<bool>: 0,1,yes,no,true,false
+LIMITATION
+---------------------------------------------------------------------
+* The fallocated region of file is discarded at umount/evict time
+ when using fallocate with FALLOC_FL_KEEP_SIZE.
+ So, User should assume that fallocated region can be discarded at
+ last close if there is memory pressure resulting in eviction of
+ the inode from the memory. As a result, for any dependency on
+ the fallocated region, user should make sure to recheck fallocate
+ after reopening the file.
+
TODO
----------------------------------------------------------------------
* Need to get rid of the raw scanning stuff. Instead, always use
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index f72370b440b1..d77a81a1b512 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -42,6 +42,8 @@ Currently, these files are in /proc/sys/vm:
- min_slab_ratio
- min_unmapped_ratio
- mmap_min_addr
+- mmap_rnd_bits
+- mmap_rnd_compat_bits
- nr_hugepages
- nr_overcommit_hugepages
- nr_trim_pages (only if CONFIG_MMU=n)
@@ -485,6 +487,33 @@ against future potential kernel bugs.
==============================================================
+mmap_rnd_bits:
+
+This value can be used to select the number of bits to use to
+determine the random offset to the base address of vma regions
+resulting from mmap allocations on architectures which support
+tuning address space randomization. This value will be bounded
+by the architecture's minimum and maximum supported values.
+
+This value can be changed after boot using the
+/proc/sys/kernel/mmap_rnd_bits tunable
+
+==============================================================
+
+mmap_rnd_compat_bits:
+
+This value can be used to select the number of bits to use to
+determine the random offset to the base address of vma regions
+resulting from mmap allocations for applications run in
+compatibility mode on architectures which support tuning address
+space randomization. This value will be bounded by the
+architecture's minimum and maximum supported values.
+
+This value can be changed after boot using the
+/proc/sys/kernel/mmap_rnd_compat_bits tunable
+
+==============================================================
+
nr_hugepages
Change the minimum size of the hugepage pool.
diff --git a/Documentation/vm/page_owner.txt b/Documentation/vm/page_owner.txt
index 8f3ce9b3aa11..ffff1439076a 100644
--- a/Documentation/vm/page_owner.txt
+++ b/Documentation/vm/page_owner.txt
@@ -28,10 +28,11 @@ with page owner and page owner is disabled in runtime due to no enabling
boot option, runtime overhead is marginal. If disabled in runtime, it
doesn't require memory to store owner information, so there is no runtime
memory overhead. And, page owner inserts just two unlikely branches into
-the page allocator hotpath and if it returns false then allocation is
-done like as the kernel without page owner. These two unlikely branches
-would not affect to allocation performance. Following is the kernel's
-code size change due to this facility.
+the page allocator hotpath and if not enabled, then allocation is done
+like as the kernel without page owner. These two unlikely branches should
+not affect to allocation performance, especially if the static keys jump
+label patching functionality is available. Following is the kernel's code
+size change due to this facility.
- Without page owner
text data bss dec hex filename
diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
index 8a282687ee06..21cf34f3ddb2 100644
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -35,10 +35,10 @@ miss is going to run faster.
== Design ==
-- "graceful fallback": mm components which don't have transparent
- hugepage knowledge fall back to breaking a transparent hugepage and
- working on the regular pages and their respective regular pmd/pte
- mappings
+- "graceful fallback": mm components which don't have transparent hugepage
+ knowledge fall back to breaking huge pmd mapping into table of ptes and,
+ if necessary, split a transparent hugepage. Therefore these components
+ can continue working on the regular pages or regular pte mappings.
- if a hugepage allocation fails because of memory fragmentation,
regular pages should be gracefully allocated instead and mixed in
@@ -221,9 +221,18 @@ thp_collapse_alloc_failed is incremented if khugepaged found a range
of pages that should be collapsed into one huge page but failed
the allocation.
-thp_split is incremented every time a huge page is split into base
+thp_split_page is incremented every time a huge page is split into base
pages. This can happen for a variety of reasons but a common
reason is that a huge page is old and is being reclaimed.
+ This action implies splitting all PMD the page mapped with.
+
+thp_split_page_failed is is incremented if kernel fails to split huge
+ page. This can happen if the page was pinned by somebody.
+
+thp_split_pmd is incremented every time a PMD split into table of PTEs.
+ This can happen, for instance, when application calls mprotect() or
+ munmap() on part of huge page. It doesn't split huge page, only
+ page table entry.
thp_zero_page_alloc is incremented every time a huge zero page is
successfully allocated. It includes allocations which where
@@ -274,10 +283,8 @@ is complete, so they won't ever notice the fact the page is huge. But
if any driver is going to mangle over the page structure of the tail
page (like for checking page->mapping or other bits that are relevant
for the head page and not the tail page), it should be updated to jump
-to check head page instead (while serializing properly against
-split_huge_page() to avoid the head and tail pages to disappear from
-under it, see the futex code to see an example of that, hugetlbfs also
-needed special handling in futex code for similar reasons).
+to check head page instead. Taking reference on any head/tail page would
+prevent page from being split by anyone.
NOTE: these aren't new constraints to the GUP API, and they match the
same constrains that applies to hugetlbfs too, so any driver capable
@@ -312,9 +319,9 @@ unaffected. libhugetlbfs will also work fine as usual.
== Graceful fallback ==
Code walking pagetables but unware about huge pmds can simply call
-split_huge_page_pmd(vma, addr, pmd) where the pmd is the one returned by
+split_huge_pmd(vma, pmd, addr) where the pmd is the one returned by
pmd_offset. It's trivial to make the code transparent hugepage aware
-by just grepping for "pmd_offset" and adding split_huge_page_pmd where
+by just grepping for "pmd_offset" and adding split_huge_pmd where
missing after pmd_offset returns the pmd. Thanks to the graceful
fallback design, with a one liner change, you can avoid to write
hundred if not thousand of lines of complex code to make your code
@@ -323,7 +330,8 @@ hugepage aware.
If you're not walking pagetables but you run into a physical hugepage
but you can't handle it natively in your code, you can split it by
calling split_huge_page(page). This is what the Linux VM does before
-it tries to swapout the hugepage for example.
+it tries to swapout the hugepage for example. split_huge_page() can fail
+if the page is pinned and you must handle this correctly.
Example to make mremap.c transparent hugepage aware with a one liner
change:
@@ -335,14 +343,14 @@ diff --git a/mm/mremap.c b/mm/mremap.c
return NULL;
pmd = pmd_offset(pud, addr);
-+ split_huge_page_pmd(vma, addr, pmd);
++ split_huge_pmd(vma, pmd, addr);
if (pmd_none_or_clear_bad(pmd))
return NULL;
== Locking in hugepage aware code ==
We want as much code as possible hugepage aware, as calling
-split_huge_page() or split_huge_page_pmd() has a cost.
+split_huge_page() or split_huge_pmd() has a cost.
To make pagetable walks huge pmd aware, all you need to do is to call
pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the
@@ -351,47 +359,80 @@ created from under you by khugepaged (khugepaged collapse_huge_page
takes the mmap_sem in write mode in addition to the anon_vma lock). If
pmd_trans_huge returns false, you just fallback in the old code
paths. If instead pmd_trans_huge returns true, you have to take the
-mm->page_table_lock and re-run pmd_trans_huge. Taking the
-page_table_lock will prevent the huge pmd to be converted into a
-regular pmd from under you (split_huge_page can run in parallel to the
+page table lock (pmd_lock()) and re-run pmd_trans_huge. Taking the
+page table lock will prevent the huge pmd to be converted into a
+regular pmd from under you (split_huge_pmd can run in parallel to the
pagetable walk). If the second pmd_trans_huge returns false, you
-should just drop the page_table_lock and fallback to the old code as
-before. Otherwise you should run pmd_trans_splitting on the pmd. In
-case pmd_trans_splitting returns true, it means split_huge_page is
-already in the middle of splitting the page. So if pmd_trans_splitting
-returns true it's enough to drop the page_table_lock and call
-wait_split_huge_page and then fallback the old code paths. You are
-guaranteed by the time wait_split_huge_page returns, the pmd isn't
-huge anymore. If pmd_trans_splitting returns false, you can proceed to
-process the huge pmd and the hugepage natively. Once finished you can
-drop the page_table_lock.
-
-== compound_lock, get_user_pages and put_page ==
+should just drop the page table lock and fallback to the old code as
+before. Otherwise you can proceed to process the huge pmd and the
+hugepage natively. Once finished you can drop the page table lock.
+
+== Refcounts and transparent huge pages ==
+
+Refcounting on THP is mostly consistent with refcounting on other compound
+pages:
+
+ - get_page()/put_page() and GUP operate in head page's ->_count.
+
+ - ->_count in tail pages is always zero: get_page_unless_zero() never
+ succeed on tail pages.
+
+ - map/unmap of the pages with PTE entry increment/decrement ->_mapcount
+ on relevant sub-page of the compound page.
+
+ - map/unmap of the whole compound page accounted in compound_mapcount
+ (stored in first tail page).
+
+PageDoubleMap() indicates that ->_mapcount in all subpages is offset up by one.
+This additional reference is required to get race-free detection of unmap of
+subpages when we have them mapped with both PMDs and PTEs.
+
+This is optimization required to lower overhead of per-subpage mapcount
+tracking. The alternative is alter ->_mapcount in all subpages on each
+map/unmap of the whole compound page.
+
+We set PG_double_map when a PMD of the page got split for the first time,
+but still have PMD mapping. The addtional references go away with last
+compound_mapcount.
split_huge_page internally has to distribute the refcounts in the head
-page to the tail pages before clearing all PG_head/tail bits from the
-page structures. It can do that easily for refcounts taken by huge pmd
-mappings. But the GUI API as created by hugetlbfs (that returns head
-and tail pages if running get_user_pages on an address backed by any
-hugepage), requires the refcount to be accounted on the tail pages and
-not only in the head pages, if we want to be able to run
-split_huge_page while there are gup pins established on any tail
-page. Failure to be able to run split_huge_page if there's any gup pin
-on any tail page, would mean having to split all hugepages upfront in
-get_user_pages which is unacceptable as too many gup users are
-performance critical and they must work natively on hugepages like
-they work natively on hugetlbfs already (hugetlbfs is simpler because
-hugetlbfs pages cannot be split so there wouldn't be requirement of
-accounting the pins on the tail pages for hugetlbfs). If we wouldn't
-account the gup refcounts on the tail pages during gup, we won't know
-anymore which tail page is pinned by gup and which is not while we run
-split_huge_page. But we still have to add the gup pin to the head page
-too, to know when we can free the compound page in case it's never
-split during its lifetime. That requires changing not just
-get_page, but put_page as well so that when put_page runs on a tail
-page (and only on a tail page) it will find its respective head page,
-and then it will decrease the head page refcount in addition to the
-tail page refcount. To obtain a head page reliably and to decrease its
-refcount without race conditions, put_page has to serialize against
-__split_huge_page_refcount using a special per-page lock called
-compound_lock.
+page to the tail pages before clearing all PG_head/tail bits from the page
+structures. It can be done easily for refcounts taken by page table
+entries. But we don't have enough information on how to distribute any
+additional pins (i.e. from get_user_pages). split_huge_page() fails any
+requests to split pinned huge page: it expects page count to be equal to
+sum of mapcount of all sub-pages plus one (split_huge_page caller must
+have reference for head page).
+
+split_huge_page uses migration entries to stabilize page->_count and
+page->_mapcount.
+
+We safe against physical memory scanners too: the only legitimate way
+scanner can get reference to a page is get_page_unless_zero().
+
+All tail pages has zero ->_count until atomic_add(). It prevent scanner
+from geting reference to tail page up to the point. After the atomic_add()
+we don't care about ->_count value. We already known how many references
+with should uncharge from head page.
+
+For head page get_page_unless_zero() will succeed and we don't mind. It's
+clear where reference should go after split: it will stay on head page.
+
+Note that split_huge_pmd() doesn't have any limitation on refcounting:
+pmd can be split at any point and never fails.
+
+== Partial unmap and deferred_split_huge_page() ==
+
+Unmapping part of THP (with munmap() or other way) is not going to free
+memory immediately. Instead, we detect that a subpage of THP is not in use
+in page_remove_rmap() and queue the THP for splitting if memory pressure
+comes. Splitting will free up unused subpages.
+
+Splitting the page right away is not an option due to locking context in
+the place where we can detect partial unmap. It's also might be
+counterproductive since in many cases partial unmap unmap happens during
+exit(2) if an THP crosses VMA boundary.
+
+Function deferred_split_huge_page() is used to queue page for splitting.
+The splitting itself will happen when we get memory pressure via shrinker
+interface.
diff --git a/arch/Kconfig b/arch/Kconfig
index 4e949e58b192..141823f19a8b 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -511,6 +511,70 @@ config ARCH_HAS_ELF_RANDOMIZE
- arch_mmap_rnd()
- arch_randomize_brk()
+config HAVE_ARCH_MMAP_RND_BITS
+ bool
+ help
+ An arch should select this symbol if it supports setting a variable
+ number of bits for use in establishing the base address for mmap
+ allocations and provides values for both:
+ - ARCH_MMAP_RND_BITS_MIN
+ - ARCH_MMAP_RND_BITS_MAX
+
+config ARCH_MMAP_RND_BITS_MIN
+ int
+
+config ARCH_MMAP_RND_BITS_MAX
+ int
+
+config ARCH_MMAP_RND_BITS_DEFAULT
+ int
+
+config ARCH_MMAP_RND_BITS
+ int "Number of bits to use for ASLR of mmap base address" if EXPERT
+ range ARCH_MMAP_RND_BITS_MIN ARCH_MMAP_RND_BITS_MAX
+ default ARCH_MMAP_RND_BITS_DEFAULT if ARCH_MMAP_RND_BITS_DEFAULT
+ default ARCH_MMAP_RND_BITS_MIN
+ depends on HAVE_ARCH_MMAP_RND_BITS
+ help
+ This value can be used to select the number of bits to use to
+ determine the random offset to the base address of vma regions
+ resulting from mmap allocations. This value will be bounded
+ by the architecture's minimum and maximum supported values.
+
+ This value can be changed after boot using the
+ /proc/sys/kernel/mmap_rnd_bits tunable
+
+config HAVE_ARCH_MMAP_RND_COMPAT_BITS
+ bool
+ help
+ An arch should select this symbol if it supports running applications
+ in compatibility mode, supports setting a variable number of bits for
+ use in establishing the base address for mmap allocations, and
+ provides values for both:
+ - ARCH_MMAP_RND_COMPAT_BITS_MIN
+ - ARCH_MMAP_RND_COMPAT_BITS_MAX
+
+config ARCH_MMAP_RND_COMPAT_BITS_MIN
+ int
+
+config ARCH_MMAP_RND_COMPAT_BITS_MAX
+ int
+
+config ARCH_MMAP_RND_COMPAT_BITS
+ int "Number of bits to use for ASLR of mmap base address for compatible applications" if EXPERT
+ range ARCH_MMAP_RND_COMPAT_BITS_MIN ARCH_MMAP_RND_COMPAT_BITS_MAX
+ default ARCH_MMAP_RND_COMPAT_BITS_MIN
+ depends on HAVE_ARCH_MMAP_RND_COMPAT_BITS
+ help
+ This value can be used to select the number of bits to use to
+ determine the random offset to the base address of vma regions
+ resulting from mmap allocations for compatible applications This
+ value will be bounded by the architecture's minimum and maximum
+ supported values.
+
+ This value can be changed after boot using the
+ /proc/sys/kernel/mmap_rnd_compat_bits tunable
+
config HAVE_COPY_THREAD_TLS
bool
help
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index 2c2ac3f3ff80..6dc312fd6480 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -73,9 +73,6 @@ config STACKTRACE_SUPPORT
def_bool y
select STACKTRACE
-config HAVE_LATENCYTOP_SUPPORT
- def_bool y
-
config HAVE_ARCH_TRANSPARENT_HUGEPAGE
def_bool y
depends on ARC_MMU_V4
diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c
index ff7ff6cbb811..b65f797e9ad6 100644
--- a/arch/arc/mm/cache.c
+++ b/arch/arc/mm/cache.c
@@ -617,7 +617,7 @@ void flush_dcache_page(struct page *page)
*/
if (!mapping_mapped(mapping)) {
clear_bit(PG_dc_clean, &page->flags);
- } else if (page_mapped(page)) {
+ } else if (page_mapcount(page)) {
/* kernel reading from page with U-mapping */
phys_addr_t paddr = (unsigned long)page_address(page);
@@ -857,7 +857,7 @@ void copy_user_highpage(struct page *to, struct page *from,
* For !VIPT cache, all of this gets compiled out as
* addr_not_cache_congruent() is 0
*/
- if (page_mapped(from) && addr_not_cache_congruent(kfrom, u_vaddr)) {
+ if (page_mapcount(from) && addr_not_cache_congruent(kfrom, u_vaddr)) {
__flush_dcache_page((unsigned long)kfrom, u_vaddr);
clean_src_k_mappings = 1;
}
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 253ddddc95c0..32e220e90354 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -35,6 +35,7 @@ config ARM
select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32
select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32
+ select HAVE_ARCH_MMAP_RND_BITS
select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT)
select HAVE_ARCH_TRACEHOOK
select HAVE_BPF_JIT
@@ -164,11 +165,6 @@ config STACKTRACE_SUPPORT
bool
default y
-config HAVE_LATENCYTOP_SUPPORT
- bool
- depends on !SMP
- default y
-
config LOCKDEP_SUPPORT
bool
default y
@@ -307,6 +303,15 @@ config MMU
Select if you want MMU-based virtualised addressing space
support by paged memory management. If unsure, say 'Y'.
+config ARCH_MMAP_RND_BITS_MIN
+ default 8
+
+config ARCH_MMAP_RND_BITS_MAX
+ default 14 if MMU && PAGE_OFFSET=0x40000000
+ default 15 if MMU && PAGE_OFFSET=0x80000000
+ default 16 if MMU
+ default 8
+
#
# The "ARM system type" choice list is ordered alphabetically by option
# text. Please add new entries in the option alphabetic order.
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index a745a2a53853..59d1457ca551 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -88,7 +88,6 @@
#define L_PMD_SECT_VALID (_AT(pmdval_t, 1) << 0)
#define L_PMD_SECT_DIRTY (_AT(pmdval_t, 1) << 55)
-#define L_PMD_SECT_SPLITTING (_AT(pmdval_t, 1) << 56)
#define L_PMD_SECT_NONE (_AT(pmdval_t, 1) << 57)
#define L_PMD_SECT_RDONLY (_AT(pteval_t, 1) << 58)
@@ -232,13 +231,6 @@ static inline pte_t pte_mkspecial(pte_t pte)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define pmd_trans_huge(pmd) (pmd_val(pmd) && !pmd_table(pmd))
-#define pmd_trans_splitting(pmd) (pmd_isset((pmd), L_PMD_SECT_SPLITTING))
-
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp);
-#endif
#endif
#define PMD_BIT_FUNC(fn,op) \
@@ -246,7 +238,6 @@ static inline pmd_t pmd_##fn(pmd_t pmd) { pmd_val(pmd) op; return pmd; }
PMD_BIT_FUNC(wrprotect, |= L_PMD_SECT_RDONLY);
PMD_BIT_FUNC(mkold, &= ~PMD_SECT_AF);
-PMD_BIT_FUNC(mksplitting, |= L_PMD_SECT_SPLITTING);
PMD_BIT_FUNC(mkwrite, &= ~L_PMD_SECT_RDONLY);
PMD_BIT_FUNC(mkdirty, |= L_PMD_SECT_DIRTY);
PMD_BIT_FUNC(mkyoung, |= PMD_SECT_AF);
diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c
index d72b90905132..96554afe1b83 100644
--- a/arch/arm/lib/uaccess_with_memcpy.c
+++ b/arch/arm/lib/uaccess_with_memcpy.c
@@ -52,14 +52,13 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
*
* Lock the page table for the destination and check
* to see that it's still huge and whether or not we will
- * need to fault on write, or if we have a splitting THP.
+ * need to fault on write.
*/
if (unlikely(pmd_thp_or_huge(*pmd))) {
ptl = &current->mm->page_table_lock;
spin_lock(ptl);
if (unlikely(!pmd_thp_or_huge(*pmd)
- || pmd_hugewillfault(*pmd)
- || pmd_trans_splitting(*pmd))) {
+ || pmd_hugewillfault(*pmd))) {
spin_unlock(ptl);
return 0;
}
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index e62400e5fb99..492bf3efffab 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -1200,10 +1200,7 @@ error:
while (i--)
if (pages[i])
__free_pages(pages[i], 0);
- if (array_size <= PAGE_SIZE)
- kfree(pages);
- else
- vfree(pages);
+ kvfree(pages);
return NULL;
}
@@ -1211,7 +1208,6 @@ static int __iommu_free_buffer(struct device *dev, struct page **pages,
size_t size, struct dma_attrs *attrs)
{
int count = size >> PAGE_SHIFT;
- int array_size = count * sizeof(struct page *);
int i;
if (dma_get_attr(DMA_ATTR_FORCE_CONTIGUOUS, attrs)) {
@@ -1222,10 +1218,7 @@ static int __iommu_free_buffer(struct device *dev, struct page **pages,
__free_pages(pages[i], 0);
}
- if (array_size <= PAGE_SIZE)
- kfree(pages);
- else
- vfree(pages);
+ kvfree(pages);
return 0;
}
diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c
index 1ec8e7590fc6..d0ba3551d49a 100644
--- a/arch/arm/mm/flush.c
+++ b/arch/arm/mm/flush.c
@@ -330,7 +330,7 @@ void flush_dcache_page(struct page *page)
mapping = page_mapping(page);
if (!cache_ops_need_broadcast() &&
- mapping && !page_mapped(page))
+ mapping && !page_mapcount(page))
clear_bit(PG_dcache_clean, &page->flags);
else {
__flush_dcache_page(mapping, page);
@@ -415,18 +415,3 @@ void __flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned l
*/
__cpuc_flush_dcache_area(page_address(page), PAGE_SIZE);
}
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp)
-{
- pmd_t pmd = pmd_mksplitting(*pmdp);
- VM_BUG_ON(address & ~PMD_MASK);
- set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-
- /* dummy IPI to serialise against fast_gup */
- kick_all_cpus_sync();
-}
-#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c
index 407dc786583a..c938693ebc14 100644
--- a/arch/arm/mm/mmap.c
+++ b/arch/arm/mm/mmap.c
@@ -173,8 +173,7 @@ unsigned long arch_mmap_rnd(void)
{
unsigned long rnd;
- /* 8 bits of randomness in 20 address space bits */
- rnd = (unsigned long)get_random_int() % (1 << 8);
+ rnd = (unsigned long)get_random_int() % (1 << mmap_rnd_bits);
return rnd << PAGE_SHIFT;
}
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 987ae39cc471..753bd9f7c91f 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -51,6 +51,8 @@ config ARM64
select HAVE_ARCH_JUMP_LABEL
select HAVE_ARCH_KASAN if SPARSEMEM_VMEMMAP && !(ARM64_16K_PAGES && ARM64_VA_BITS_48)
select HAVE_ARCH_KGDB
+ select HAVE_ARCH_MMAP_RND_BITS
+ select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_TRACEHOOK
select HAVE_BPF_JIT
@@ -104,6 +106,27 @@ config ARCH_PHYS_ADDR_T_64BIT
config MMU
def_bool y
+config ARCH_MMAP_RND_BITS_MIN
+ default 15 if ARM64_64K_PAGES
+ default 19
+
+config ARCH_MMAP_RND_BITS_MAX
+ default 20 if ARM64_64K_PAGES && ARCH_VA_BITS=39
+ default 24 if ARCH_VA_BITS=39
+ default 23 if ARM64_64K_PAGES && ARCH_VA_BITS=42
+ default 27 if ARCH_VA_BITS=42
+ default 29 if ARM64_64K_PAGES && ARCH_VA_BITS=48
+ default 33 if ARCH_VA_BITS=48
+ default 15 if ARM64_64K_PAGES
+ default 19
+
+config ARCH_MMAP_RND_COMPAT_BITS_MIN
+ default 7 if ARM64_64K_PAGES
+ default 11
+
+config ARCH_MMAP_RND_COMPAT_BITS_MAX
+ default 16
+
config NO_IOPORT_MAP
def_bool y if !PCI
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 7e074f93f383..d2a1879b466b 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -333,19 +333,11 @@ static inline pgprot_t mk_sect_prot(pgprot_t prot)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define pmd_trans_huge(pmd) (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT))
-#define pmd_trans_splitting(pmd) pte_special(pmd_pte(pmd))
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-struct vm_area_struct;
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp);
-#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
#define pmd_young(pmd) pte_young(pmd_pte(pmd))
#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd)))
-#define pmd_mksplitting(pmd) pte_pmd(pte_mkspecial(pmd_pte(pmd)))
#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd)))
#define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd)))
#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd)))
diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c
index c26b804015e8..3d59de89c042 100644
--- a/arch/arm64/mm/flush.c
+++ b/arch/arm64/mm/flush.c
@@ -99,19 +99,3 @@ EXPORT_SYMBOL(flush_dcache_page);
* Additional functions defined in assembly.
*/
EXPORT_SYMBOL(flush_icache_range);
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp)
-{
- pmd_t pmd = pmd_mksplitting(*pmdp);
-
- VM_BUG_ON(address & ~PMD_MASK);
- set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-
- /* dummy IPI to serialise against fast_gup */
- kick_all_cpus_sync();
-}
-#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index ed177475dd8c..b84d5b1b77ee 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -51,8 +51,10 @@ unsigned long arch_mmap_rnd(void)
{
unsigned long rnd;
- rnd = (unsigned long)get_random_int() & STACK_RND_MASK;
-
+ if (test_thread_flag(TIF_32BIT))
+ rnd = (unsigned long)get_random_int() % (1 << mmap_rnd_compat_bits);
+ else
+ rnd = (unsigned long)get_random_int() % (1 << mmap_rnd_bits);
return rnd << PAGE_SHIFT;
}
diff --git a/arch/frv/include/asm/io.h b/arch/frv/include/asm/io.h
index 70dfbea8c8d7..8062fc73fad0 100644
--- a/arch/frv/include/asm/io.h
+++ b/arch/frv/include/asm/io.h
@@ -43,9 +43,20 @@ static inline unsigned long _swapl(unsigned long v)
//#define __iormb() asm volatile("membar")
//#define __iowmb() asm volatile("membar")
-#define __raw_readb __builtin_read8
-#define __raw_readw __builtin_read16
-#define __raw_readl __builtin_read32
+static inline u8 __raw_readb(const volatile void __iomem *addr)
+{
+ return __builtin_read8((volatile void __iomem *)addr);
+}
+
+static inline u16 __raw_readw(const volatile void __iomem *addr)
+{
+ return __builtin_read16((volatile void __iomem *)addr);
+}
+
+static inline u32 __raw_readl(const volatile void __iomem *addr)
+{
+ return __builtin_read32((volatile void __iomem *)addr);
+}
#define __raw_writeb(datum, addr) __builtin_write8(addr, datum)
#define __raw_writew(datum, addr) __builtin_write16(addr, datum)
diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig
index 0b389a81c43a..a0fa88da3e31 100644
--- a/arch/metag/Kconfig
+++ b/arch/metag/Kconfig
@@ -36,9 +36,6 @@ config STACKTRACE_SUPPORT
config LOCKDEP_SUPPORT
def_bool y
-config HAVE_LATENCYTOP_SUPPORT
- def_bool y
-
config RWSEM_GENERIC_SPINLOCK
def_bool y
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index 0bce820428fc..5ecd0287a874 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -67,9 +67,6 @@ config STACKTRACE_SUPPORT
config LOCKDEP_SUPPORT
def_bool y
-config HAVE_LATENCYTOP_SUPPORT
- def_bool y
-
source "init/Kconfig"
source "kernel/Kconfig.freezer"
diff --git a/arch/mips/include/asm/pgtable-bits.h b/arch/mips/include/asm/pgtable-bits.h
index ff7ad91c85db..97b313882678 100644
--- a/arch/mips/include/asm/pgtable-bits.h
+++ b/arch/mips/include/asm/pgtable-bits.h
@@ -131,14 +131,12 @@
/* Huge TLB page */
#define _PAGE_HUGE_SHIFT (_PAGE_MODIFIED_SHIFT + 1)
#define _PAGE_HUGE (1 << _PAGE_HUGE_SHIFT)
-#define _PAGE_SPLITTING_SHIFT (_PAGE_HUGE_SHIFT + 1)
-#define _PAGE_SPLITTING (1 << _PAGE_SPLITTING_SHIFT)
#endif /* CONFIG_64BIT && CONFIG_MIPS_HUGE_TLB_SUPPORT */
#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR6)
/* XI - page cannot be executed */
-#ifdef _PAGE_SPLITTING_SHIFT
-#define _PAGE_NO_EXEC_SHIFT (_PAGE_SPLITTING_SHIFT + 1)
+#ifdef _PAGE_HUGE_SHIFT
+#define _PAGE_NO_EXEC_SHIFT (_PAGE_HUGE_SHIFT + 1)
#else
#define _PAGE_NO_EXEC_SHIFT (_PAGE_MODIFIED_SHIFT + 1)
#endif
@@ -153,8 +151,8 @@
#if defined(_PAGE_NO_READ_SHIFT)
#define _PAGE_GLOBAL_SHIFT (_PAGE_NO_READ_SHIFT + 1)
-#elif defined(_PAGE_SPLITTING_SHIFT)
-#define _PAGE_GLOBAL_SHIFT (_PAGE_SPLITTING_SHIFT + 1)
+#elif defined(_PAGE_HUGE_SHIFT)
+#define _PAGE_GLOBAL_SHIFT (_PAGE_HUGE_SHIFT + 1)
#else
#define _PAGE_GLOBAL_SHIFT (_PAGE_MODIFIED_SHIFT + 1)
#endif
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index 8957f15e21ec..6995b4a02e23 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -482,27 +482,9 @@ static inline pmd_t pmd_mkhuge(pmd_t pmd)
return pmd;
}
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
- return !!(pmd_val(pmd) & _PAGE_SPLITTING);
-}
-
-static inline pmd_t pmd_mksplitting(pmd_t pmd)
-{
- pmd_val(pmd) |= _PAGE_SPLITTING;
-
- return pmd;
-}
-
extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd);
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-/* Extern to avoid header file madness */
-extern void pmdp_splitting_flush(struct vm_area_struct *vma,
- unsigned long address,
- pmd_t *pmdp);
-
#define __HAVE_ARCH_PMD_WRITE
static inline int pmd_write(pmd_t pmd)
{
diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c
index 5d3a25e1cfae..caac3d747a90 100644
--- a/arch/mips/mm/c-r4k.c
+++ b/arch/mips/mm/c-r4k.c
@@ -587,7 +587,8 @@ static inline void local_r4k_flush_cache_page(void *args)
* another ASID than the current one.
*/
map_coherent = (cpu_has_dc_aliases &&
- page_mapped(page) && !Page_dcache_dirty(page));
+ page_mapcount(page) &&
+ !Page_dcache_dirty(page));
if (map_coherent)
vaddr = kmap_coherent(page, addr);
else
diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c
index aab218c36e0d..3f159caf6dbc 100644
--- a/arch/mips/mm/cache.c
+++ b/arch/mips/mm/cache.c
@@ -106,7 +106,7 @@ void __flush_anon_page(struct page *page, unsigned long vmaddr)
unsigned long addr = (unsigned long) page_address(page);
if (pages_do_alias(addr, vmaddr)) {
- if (page_mapped(page) && !Page_dcache_dirty(page)) {
+ if (page_mapcount(page) && !Page_dcache_dirty(page)) {
void *kaddr;
kaddr = kmap_coherent(page, vmaddr);
diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c
index 349995d19c7f..1afd87c999b0 100644
--- a/arch/mips/mm/gup.c
+++ b/arch/mips/mm/gup.c
@@ -87,8 +87,6 @@ static int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end,
do {
VM_BUG_ON(compound_head(page) != head);
pages[*nr] = page;
- if (PageTail(page))
- get_huge_page_tail(page);
(*nr)++;
page++;
refs++;
@@ -109,18 +107,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
pmd_t pmd = *pmdp;
next = pmd_addr_end(addr, end);
- /*
- * The pmd_trans_splitting() check below explains why
- * pmdp_splitting_flush has to flush the tlb, to stop
- * this gup-fast code from running while we set the
- * splitting bit in the pmd. Returning zero will take
- * the slow path that will call wait_split_huge_page()
- * if the pmd is still in splitting state. gup-fast
- * can't because it has irq disabled and
- * wait_split_huge_page() would never return as the
- * tlb flush IPI wouldn't run.
- */
- if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+ if (pmd_none(pmd))
return 0;
if (unlikely(pmd_huge(pmd))) {
if (!gup_huge_pmd(pmd, addr, next, write, pages,nr))
@@ -153,8 +140,6 @@ static int gup_huge_pud(pud_t pud, unsigned long addr, unsigned long end,
do {
VM_BUG_ON(compound_head(page) != head);
pages[*nr] = page;
- if (PageTail(page))
- get_huge_page_tail(page);
(*nr)++;
page++;
refs++;
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index 8770e619185e..7e5fa0938c21 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -165,7 +165,7 @@ void copy_user_highpage(struct page *to, struct page *from,
vto = kmap_atomic(to);
if (cpu_has_dc_aliases &&
- page_mapped(from) && !Page_dcache_dirty(from)) {
+ page_mapcount(from) && !Page_dcache_dirty(from)) {
vfrom = kmap_coherent(from, vaddr);
copy_page(vto, vfrom);
kunmap_coherent();
@@ -187,7 +187,7 @@ void copy_to_user_page(struct vm_area_struct *vma,
unsigned long len)
{
if (cpu_has_dc_aliases &&
- page_mapped(page) && !Page_dcache_dirty(page)) {
+ page_mapcount(page) && !Page_dcache_dirty(page)) {
void *vto = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
memcpy(vto, src, len);
kunmap_coherent();
@@ -205,7 +205,7 @@ void copy_from_user_page(struct vm_area_struct *vma,
unsigned long len)
{
if (cpu_has_dc_aliases &&
- page_mapped(page) && !Page_dcache_dirty(page)) {
+ page_mapcount(page) && !Page_dcache_dirty(page)) {
void *vfrom = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
memcpy(dst, vfrom, len);
kunmap_coherent();
diff --git a/arch/mips/mm/pgtable-64.c b/arch/mips/mm/pgtable-64.c
index e8adc0069d66..ce4473e7c0d2 100644
--- a/arch/mips/mm/pgtable-64.c
+++ b/arch/mips/mm/pgtable-64.c
@@ -62,20 +62,6 @@ void pmd_init(unsigned long addr, unsigned long pagetable)
}
#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-
-void pmdp_splitting_flush(struct vm_area_struct *vma,
- unsigned long address,
- pmd_t *pmdp)
-{
- if (!pmd_trans_splitting(*pmdp)) {
- pmd_t pmd = pmd_mksplitting(*pmdp);
- set_pmd_at(vma->vm_mm, address, pmdp, pmd);
- }
-}
-
-#endif
-
pmd_t mk_pmd(struct page *page, pgprot_t prot)
{
pmd_t pmd;
diff --git a/arch/mips/mm/tlbex.c b/arch/mips/mm/tlbex.c
index 32e0be27673f..482192cc8f2b 100644
--- a/arch/mips/mm/tlbex.c
+++ b/arch/mips/mm/tlbex.c
@@ -240,7 +240,6 @@ static void output_pgtable_bits_defines(void)
pr_define("_PAGE_MODIFIED_SHIFT %d\n", _PAGE_MODIFIED_SHIFT);
#ifdef CONFIG_MIPS_HUGE_TLB_SUPPORT
pr_define("_PAGE_HUGE_SHIFT %d\n", _PAGE_HUGE_SHIFT);
- pr_define("_PAGE_SPLITTING_SHIFT %d\n", _PAGE_SPLITTING_SHIFT);
#endif
#ifdef CONFIG_CPU_MIPSR2
if (cpu_has_rixi) {
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 729f89163bc3..7c34cafdf301 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -79,9 +79,6 @@ config TIME_LOW_RES
depends on SMP
default y
-config HAVE_LATENCYTOP_SUPPORT
- def_bool y
-
# unless you want to implement ACPI on PA-RISC ... ;-)
config PM
bool
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index db49e0d796b1..89210bfdfc7a 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -47,9 +47,6 @@ config STACKTRACE_SUPPORT
bool
default y
-config HAVE_LATENCYTOP_SUPPORT
- def_bool y
-
config TRACE_IRQFLAGS_SUPPORT
bool
default y
diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h
index 493e72f64b35..b4407d0add27 100644
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@@ -191,7 +191,7 @@ struct fadump_crash_info_header {
u64 elfcorehdr_addr;
u32 crashing_cpu;
struct pt_regs regs;
- struct cpumask cpu_online_mask;
+ struct cpumask online_mask;
};
/* Crash memory ranges */
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index 3245f2d96d4f..0db2a3f8e554 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -374,11 +374,6 @@ void pgtable_cache_init(void);
#endif /* __ASSEMBLY__ */
/*
- * THP pages can't be special. So use the _PAGE_SPECIAL
- */
-#define _PAGE_SPLITTING _PAGE_SPECIAL
-
-/*
* We need to differentiate between explicit huge page and THP huge
* page, since THP huge page also need to track real subpage details
*/
@@ -387,9 +382,8 @@ void pgtable_cache_init(void);
/*
* set of bits not changed in pmd_modify.
*/
-#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | \
- _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPLITTING | \
- _PAGE_THP_HUGE)
+#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \
+ _PAGE_ACCESSED | _PAGE_THP_HUGE)
#ifndef __ASSEMBLY__
/*
@@ -471,13 +465,6 @@ static inline int pmd_trans_huge(pmd_t pmd)
return (pmd_val(pmd) & 0x3) && (pmd_val(pmd) & _PAGE_THP_HUGE);
}
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
- if (pmd_trans_huge(pmd))
- return pmd_val(pmd) & _PAGE_SPLITTING;
- return 0;
-}
-
extern int has_transparent_hugepage(void);
#else
static inline void hpte_do_hugepage_flush(struct mm_struct *mm,
@@ -536,12 +523,6 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd)
return pmd;
}
-static inline pmd_t pmd_mksplitting(pmd_t pmd)
-{
- pmd_val(pmd) |= _PAGE_SPLITTING;
- return pmd;
-}
-
#define __HAVE_ARCH_PMD_SAME
static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
{
@@ -592,10 +573,6 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
pmd_hugepage_update(mm, addr, pmdp, _PAGE_RW, 0);
}
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-extern void pmdp_splitting_flush(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmdp);
-
extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp);
#define pmdp_collapse_flush pmdp_collapse_flush
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 26d091a1a54c..3cb3b02a13dd 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -415,7 +415,7 @@ void crash_fadump(struct pt_regs *regs, const char *str)
else
ppc_save_regs(&fdh->regs);
- fdh->cpu_online_mask = *cpu_online_mask;
+ fdh->online_mask = *cpu_online_mask;
/* Call ibm,os-term rtas call to trigger firmware assisted dump */
rtas_os_term((char *)str);
@@ -646,7 +646,7 @@ static int __init fadump_build_cpu_notes(const struct fadump_mem_struct *fdm)
}
/* Lower 4 bytes of reg_value contains logical cpu id */
cpu = be64_to_cpu(reg_entry->reg_value) & FADUMP_CPU_ID_MASK;
- if (fdh && !cpumask_test_cpu(cpu, &fdh->cpu_online_mask)) {
+ if (fdh && !cpumask_test_cpu(cpu, &fdh->online_mask)) {
SKIP_TO_NEXT_CPU(reg_entry);
continue;
}
diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c
index 4d87122cf6a7..6ffade530dab 100644
--- a/arch/powerpc/mm/hugepage-hash64.c
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -39,9 +39,6 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
/* If PMD busy, retry the access */
if (unlikely(old_pmd & _PAGE_BUSY))
return 0;
- /* If PMD is trans splitting retry the access */
- if (unlikely(old_pmd & _PAGE_SPLITTING))
- return 0;
/* If PMD permissions don't match, take page fault */
if (unlikely(access & ~old_pmd))
return 1;
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 9833fee493ec..cd2d82efe1cd 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -1030,10 +1030,6 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
/*
* A hugepage collapse is captured by pmd_none, because
* it mark the pmd none and do a hpte invalidate.
- *
- * We don't worry about pmd_trans_splitting here, The
- * caller if it needs to handle the splitting case
- * should check for that.
*/
if (pmd_none(pmd))
return NULL;
@@ -1071,7 +1067,7 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
{
unsigned long mask;
unsigned long pte_end;
- struct page *head, *page, *tail;
+ struct page *head, *page;
pte_t pte;
int refs;
@@ -1094,7 +1090,6 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
head = pte_page(pte);
page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
- tail = page;
do {
VM_BUG_ON(compound_head(page) != head);
pages[*nr] = page;
@@ -1116,15 +1111,5 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
return 0;
}
- /*
- * Any tail page need their mapcount reference taken before we
- * return.
- */
- while (refs--) {
- if (PageTail(tail))
- get_huge_page_tail(tail);
- tail++;
- }
-
return 1;
}
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index e92cb2146b18..422c59a24561 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -604,55 +604,6 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
}
/*
- * We mark the pmd splitting and invalidate all the hpte
- * entries for this hugepage.
- */
-void pmdp_splitting_flush(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmdp)
-{
- unsigned long old, tmp;
-
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-
-#ifdef CONFIG_DEBUG_VM
- WARN_ON(!pmd_trans_huge(*pmdp));
- assert_spin_locked(&vma->vm_mm->page_table_lock);
-#endif
-
-#ifdef PTE_ATOMIC_UPDATES
-
- __asm__ __volatile__(
- "1: ldarx %0,0,%3\n\
- andi. %1,%0,%6\n\
- bne- 1b \n\
- ori %1,%0,%4 \n\
- stdcx. %1,0,%3 \n\
- bne- 1b"
- : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
- : "r" (pmdp), "i" (_PAGE_SPLITTING), "m" (*pmdp), "i" (_PAGE_BUSY)
- : "cc" );
-#else
- old = pmd_val(*pmdp);
- *pmdp = __pmd(old | _PAGE_SPLITTING);
-#endif
- /*
- * If we didn't had the splitting flag set, go and flush the
- * HPTE entries.
- */
- trace_hugepage_splitting(address, old);
- if (!(old & _PAGE_SPLITTING)) {
- /* We need to flush the hpte */
- if (old & _PAGE_HASHPTE)
- hpte_do_hugepage_flush(vma->vm_mm, address, pmdp, old);
- }
- /*
- * This ensures that generic code that rely on IRQ disabling
- * to prevent a parallel THP split work as expected.
- */
- kick_all_cpus_sync();
-}
-
-/*
* We want to put the pgtable in pmd and use pgtable for tracking
* the base page size hptes
*/
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index fa9fb5b4c66c..d5543514c1df 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -135,7 +135,7 @@ static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->vma;
- split_huge_page_pmd(vma, addr, pmd);
+ split_huge_pmd(vma, pmd, addr);
return 0;
}
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 11634fa7ab3c..ad4840f86be1 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -767,7 +767,7 @@ static int __init spufs_init(void)
ret = -ENOMEM;
spufs_inode_cache = kmem_cache_create("spufs_inode_cache",
sizeof(struct spufs_inode_info), 0,
- SLAB_HWCACHE_ALIGN, spufs_init_once);
+ SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, spufs_init_once);
if (!spufs_inode_cache)
goto out;
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 840abf4c82af..e648f1af50a1 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -10,9 +10,6 @@ config LOCKDEP_SUPPORT
config STACKTRACE_SUPPORT
def_bool y
-config HAVE_LATENCYTOP_SUPPORT
- def_bool y
-
config RWSEM_GENERIC_SPINLOCK
bool
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 024f85f947ae..64ead8091248 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -286,7 +286,6 @@ static inline int is_module_addr(void *addr)
#define _SEGMENT_ENTRY_DIRTY 0x2000 /* SW segment dirty bit */
#define _SEGMENT_ENTRY_YOUNG 0x1000 /* SW segment young bit */
-#define _SEGMENT_ENTRY_SPLIT 0x0800 /* THP splitting bit */
#define _SEGMENT_ENTRY_LARGE 0x0400 /* STE-format control, large page */
#define _SEGMENT_ENTRY_READ 0x0002 /* SW segment read bit */
#define _SEGMENT_ENTRY_WRITE 0x0001 /* SW segment write bit */
@@ -318,8 +317,6 @@ static inline int is_module_addr(void *addr)
* SW-bits: y young, d dirty, r read, w write
*/
-#define _SEGMENT_ENTRY_SPLIT_BIT 11 /* THP splitting bit number */
-
/* Page status table bits for virtualization */
#define PGSTE_ACC_BITS 0xf000000000000000UL
#define PGSTE_FP_BIT 0x0800000000000000UL
@@ -523,10 +520,6 @@ static inline int pmd_bad(pmd_t pmd)
return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0;
}
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-extern void pmdp_splitting_flush(struct vm_area_struct *vma,
- unsigned long addr, pmd_t *pmdp);
-
#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
extern int pmdp_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp,
@@ -1424,8 +1417,7 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
if (pmd_large(pmd)) {
pmd_val(pmd) &= _SEGMENT_ENTRY_ORIGIN_LARGE |
_SEGMENT_ENTRY_DIRTY | _SEGMENT_ENTRY_YOUNG |
- _SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_SPLIT |
- _SEGMENT_ENTRY_SOFT_DIRTY;
+ _SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_SOFT_DIRTY;
pmd_val(pmd) |= massage_pgprot_pmd(newprot);
if (!(pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY))
pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT;
@@ -1533,12 +1525,6 @@ extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
#define __HAVE_ARCH_PGTABLE_WITHDRAW
extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
- return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) &&
- (pmd_val(pmd) & _SEGMENT_ENTRY_SPLIT);
-}
-
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t entry)
{
diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c
index 21c74a71e2ab..13dab0c1645c 100644
--- a/arch/s390/mm/gup.c
+++ b/arch/s390/mm/gup.c
@@ -55,7 +55,7 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
unsigned long mask, result;
- struct page *head, *page, *tail;
+ struct page *head, *page;
int refs;
result = write ? 0 : _SEGMENT_ENTRY_PROTECT;
@@ -67,7 +67,6 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
refs = 0;
head = pmd_page(pmd);
page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- tail = page;
do {
VM_BUG_ON(compound_head(page) != head);
pages[*nr] = page;
@@ -88,16 +87,6 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
return 0;
}
- /*
- * Any tail page need their mapcount reference taken before we
- * return.
- */
- while (refs--) {
- if (PageTail(tail))
- get_huge_page_tail(tail);
- tail++;
- }
-
return 1;
}
@@ -116,16 +105,7 @@ static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
pmd = *pmdp;
barrier();
next = pmd_addr_end(addr, end);
- /*
- * The pmd_trans_splitting() check below explains why
- * pmdp_splitting_flush() has to serialize with
- * smp_call_function() against our disabled IRQs, to stop
- * this gup-fast code from running while we set the
- * splitting bit in the pmd. Returning zero will take
- * the slow path that will call wait_split_huge_page()
- * if the pmd is still in splitting state.
- */
- if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+ if (pmd_none(pmd))
return 0;
if (unlikely(pmd_large(pmd))) {
/*
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 54ef3bc01b43..43b9a48995f9 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -603,10 +603,7 @@ static void gmap_zap_swap_entry(swp_entry_t entry, struct mm_struct *mm)
else if (is_migration_entry(entry)) {
struct page *page = migration_entry_to_page(entry);
- if (PageAnon(page))
- dec_mm_counter(mm, MM_ANONPAGES);
- else
- dec_mm_counter(mm, MM_FILEPAGES);
+ dec_mm_counter(mm, mm_counter(page));
}
free_swap_and_cache(entry);
}
@@ -1308,22 +1305,6 @@ int pmdp_set_access_flags(struct vm_area_struct *vma,
return 1;
}
-static void pmdp_splitting_flush_sync(void *arg)
-{
- /* Simply deliver the interrupt */
-}
-
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp)
-{
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
- (unsigned long *) pmdp)) {
- /* need to serialize against gup-fast (IRQ disabled) */
- smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
- }
-}
-
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable)
{
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index d514df7e04dd..6c391a5d3e5c 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -130,9 +130,6 @@ config STACKTRACE_SUPPORT
config LOCKDEP_SUPPORT
def_bool y
-config HAVE_LATENCYTOP_SUPPORT
- def_bool y
-
config ARCH_HAS_ILOG2_U32
def_bool n
diff --git a/arch/sh/mm/cache-sh4.c b/arch/sh/mm/cache-sh4.c
index 51d8f7f31d1d..58aaa4f33b81 100644
--- a/arch/sh/mm/cache-sh4.c
+++ b/arch/sh/mm/cache-sh4.c
@@ -241,7 +241,7 @@ static void sh4_flush_cache_page(void *args)
*/
map_coherent = (current_cpu_data.dcache.n_aliases &&
test_bit(PG_dcache_clean, &page->flags) &&
- page_mapped(page));
+ page_mapcount(page));
if (map_coherent)
vaddr = kmap_coherent(page, address);
else
diff --git a/arch/sh/mm/cache.c b/arch/sh/mm/cache.c
index f770e3992620..e58cfbf45150 100644
--- a/arch/sh/mm/cache.c
+++ b/arch/sh/mm/cache.c
@@ -59,7 +59,7 @@ void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
unsigned long vaddr, void *dst, const void *src,
unsigned long len)
{
- if (boot_cpu_data.dcache.n_aliases && page_mapped(page) &&
+ if (boot_cpu_data.dcache.n_aliases && page_mapcount(page) &&
test_bit(PG_dcache_clean, &page->flags)) {
void *vto = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
memcpy(vto, src, len);
@@ -78,7 +78,7 @@ void copy_from_user_page(struct vm_area_struct *vma, struct page *page,
unsigned long vaddr, void *dst, const void *src,
unsigned long len)
{
- if (boot_cpu_data.dcache.n_aliases && page_mapped(page) &&
+ if (boot_cpu_data.dcache.n_aliases && page_mapcount(page) &&
test_bit(PG_dcache_clean, &page->flags)) {
void *vfrom = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
memcpy(dst, vfrom, len);
@@ -97,7 +97,7 @@ void copy_user_highpage(struct page *to, struct page *from,
vto = kmap_atomic(to);
- if (boot_cpu_data.dcache.n_aliases && page_mapped(from) &&
+ if (boot_cpu_data.dcache.n_aliases && page_mapcount(from) &&
test_bit(PG_dcache_clean, &from->flags)) {
vfrom = kmap_coherent(from, vaddr);
copy_page(vto, vfrom);
@@ -153,7 +153,7 @@ void __flush_anon_page(struct page *page, unsigned long vmaddr)
unsigned long addr = (unsigned long) page_address(page);
if (pages_do_alias(addr, vmaddr)) {
- if (boot_cpu_data.dcache.n_aliases && page_mapped(page) &&
+ if (boot_cpu_data.dcache.n_aliases && page_mapcount(page) &&
test_bit(PG_dcache_clean, &page->flags)) {
void *kaddr;
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 56442d2d7bbc..3203e42190dd 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -101,10 +101,6 @@ config LOCKDEP_SUPPORT
bool
default y if SPARC64
-config HAVE_LATENCYTOP_SUPPORT
- bool
- default y if SPARC64
-
config ARCH_HIBERNATION_POSSIBLE
def_bool y if SPARC64
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 131d36fcd07a..f5bfcd66aeb5 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -681,13 +681,6 @@ static inline unsigned long pmd_trans_huge(pmd_t pmd)
return pte_val(pte) & _PAGE_PMD_HUGE;
}
-static inline unsigned long pmd_trans_splitting(pmd_t pmd)
-{
- pte_t pte = __pte(pmd_val(pmd));
-
- return pmd_trans_huge(pmd) && pte_special(pte);
-}
-
#define has_transparent_hugepage() 1
static inline pmd_t pmd_mkold(pmd_t pmd)
@@ -735,15 +728,6 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd)
return __pmd(pte_val(pte));
}
-static inline pmd_t pmd_mksplitting(pmd_t pmd)
-{
- pte_t pte = __pte(pmd_val(pmd));
-
- pte = pte_mkspecial(pte);
-
- return __pmd(pte_val(pte));
-}
-
static inline pgprot_t pmd_pgprot(pmd_t entry)
{
unsigned long val = pmd_val(entry);
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index dbabe5713a15..cb841a33da59 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -113,9 +113,6 @@ static unsigned int get_user_insn(unsigned long tpc)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (pmd_trans_huge(*pmdp)) {
- if (pmd_trans_splitting(*pmdp))
- goto out_irq_enable;
-
pa = pmd_pfn(*pmdp) << PAGE_SHIFT;
pa += tpc & ~HPAGE_MASK;
diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c
index 2e5c4fc2daa9..eb3d8e8ebc6b 100644
--- a/arch/sparc/mm/gup.c
+++ b/arch/sparc/mm/gup.c
@@ -56,8 +56,6 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
put_page(head);
return 0;
}
- if (head != page)
- get_huge_page_tail(page);
pages[*nr] = page;
(*nr)++;
@@ -70,7 +68,7 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
unsigned long end, int write, struct page **pages,
int *nr)
{
- struct page *head, *page, *tail;
+ struct page *head, *page;
int refs;
if (!(pmd_val(pmd) & _PAGE_VALID))
@@ -82,7 +80,6 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
refs = 0;
head = pmd_page(pmd);
page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- tail = page;
do {
VM_BUG_ON(compound_head(page) != head);
pages[*nr] = page;
@@ -103,15 +100,6 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
return 0;
}
- /* Any tail page need their mapcount reference taken before we
- * return.
- */
- while (refs--) {
- if (PageTail(tail))
- get_huge_page_tail(tail);
- tail++;
- }
-
return 1;
}
@@ -126,7 +114,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
pmd_t pmd = *pmdp;
next = pmd_addr_end(addr, end);
- if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+ if (pmd_none(pmd))
return 0;
if (unlikely(pmd_large(pmd))) {
if (!gup_huge_pmd(pmdp, pmd, addr, next,
diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h
index 2b05ccbebed9..96cecf55522e 100644
--- a/arch/tile/include/asm/pgtable.h
+++ b/arch/tile/include/asm/pgtable.h
@@ -489,16 +489,6 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define has_transparent_hugepage() 1
#define pmd_trans_huge pmd_huge_page
-
-static inline pmd_t pmd_mksplitting(pmd_t pmd)
-{
- return pte_pmd(hv_pte_set_client2(pmd_pte(pmd)));
-}
-
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
- return hv_pte_get_client2(pmd_pte(pmd));
-}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig
index c9faddc61100..910aaaaed969 100644
--- a/arch/unicore32/Kconfig
+++ b/arch/unicore32/Kconfig
@@ -33,9 +33,6 @@ config NO_IOPORT_MAP
config STACKTRACE_SUPPORT
def_bool y
-config HAVE_LATENCYTOP_SUPPORT
- def_bool y
-
config LOCKDEP_SUPPORT
def_bool y
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c22df590e7e7..c989d5b89e1f 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -82,6 +82,8 @@ config X86
select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP
select HAVE_ARCH_KGDB
select HAVE_ARCH_KMEMCHECK
+ select HAVE_ARCH_MMAP_RND_BITS
+ select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_SOFT_DIRTY if X86_64
select HAVE_ARCH_TRACEHOOK
@@ -177,12 +179,23 @@ config LOCKDEP_SUPPORT
config STACKTRACE_SUPPORT
def_bool y
-config HAVE_LATENCYTOP_SUPPORT
- def_bool y
-
config MMU
def_bool y
+config ARCH_MMAP_RND_BITS_MIN
+ default 28 if 64BIT
+ default 8
+
+config ARCH_MMAP_RND_BITS_MAX
+ default 32 if 64BIT
+ default 16
+
+config ARCH_MMAP_RND_COMPAT_BITS_MIN
+ default 8
+
+config ARCH_MMAP_RND_COMPAT_BITS_MAX
+ default 16
+
config SBUS
bool
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index d3eee663c41f..61e706c9f710 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -162,11 +162,6 @@ static inline int pmd_large(pmd_t pte)
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
- return pmd_val(pmd) & _PAGE_SPLITTING;
-}
-
static inline int pmd_trans_huge(pmd_t pmd)
{
return pmd_val(pmd) & _PAGE_PSE;
@@ -808,10 +803,6 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp);
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-extern void pmdp_splitting_flush(struct vm_area_struct *vma,
- unsigned long addr, pmd_t *pmdp);
-
#define __HAVE_ARCH_PMD_WRITE
static inline int pmd_write(pmd_t pmd)
{
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index dd5b0aa9dd2f..116fc4ee586f 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -22,7 +22,6 @@
#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
#define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1
#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
-#define _PAGE_BIT_SPLITTING _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */
#define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
@@ -46,7 +45,6 @@
#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
-#define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING)
#define __HAVE_ARCH_PTE_SPECIAL
#ifdef CONFIG_KMEMCHECK
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 819ab3f9c9c7..22db575a2fec 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -337,6 +337,7 @@ void arch_crash_save_vmcoreinfo(void)
#endif
vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
kaslr_offset());
+ VMCOREINFO_PHYS_BASE(phys_base);
}
/* arch-dependent functionality related to kexec file-based syscall */
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 524619351961..3a5330213aca 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -175,7 +175,11 @@ static void mark_screen_rdonly(struct mm_struct *mm)
if (pud_none_or_clear_bad(pud))
goto out;
pmd = pmd_offset(pud, 0xA0000);
- split_huge_page_pmd_mm(mm, 0xA0000, pmd);
+
+ if (pmd_trans_huge(*pmd)) {
+ struct vm_area_struct *vma = find_vma(mm, 0xA0000);
+ split_huge_pmd(vma, pmd, 0xA0000);
+ }
if (pmd_none_or_clear_bad(pmd))
goto out;
pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index ae9a37bf1371..f8cb3e8ac250 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -136,8 +136,6 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
- if (PageTail(page))
- get_huge_page_tail(page);
(*nr)++;
page++;
refs++;
@@ -158,18 +156,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
pmd_t pmd = *pmdp;
next = pmd_addr_end(addr, end);
- /*
- * The pmd_trans_splitting() check below explains why
- * pmdp_splitting_flush has to flush the tlb, to stop
- * this gup-fast code from running while we set the
- * splitting bit in the pmd. Returning zero will take
- * the slow path that will call wait_split_huge_page()
- * if the pmd is still in splitting state. gup-fast
- * can't because it has irq disabled and
- * wait_split_huge_page() would never return as the
- * tlb flush IPI wouldn't run.
- */
- if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+ if (pmd_none(pmd))
return 0;
if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) {
/*
@@ -212,8 +199,6 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
- if (PageTail(page))
- get_huge_page_tail(page);
(*nr)++;
page++;
refs++;
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 844b06d67df4..647fecfca59f 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -69,14 +69,14 @@ unsigned long arch_mmap_rnd(void)
{
unsigned long rnd;
- /*
- * 8 bits of randomness in 32bit mmaps, 20 address space bits
- * 28 bits of randomness in 64bit mmaps, 40 address space bits
- */
if (mmap_is_ia32())
- rnd = (unsigned long)get_random_int() % (1<<8);
+#ifdef CONFIG_COMPAT
+ rnd = (unsigned long)get_random_int() % (1 << mmap_rnd_compat_bits);
+#else
+ rnd = (unsigned long)get_random_int() % (1 << mmap_rnd_bits);
+#endif
else
- rnd = (unsigned long)get_random_int() % (1<<28);
+ rnd = (unsigned long)get_random_int() % (1 << mmap_rnd_bits);
return rnd << PAGE_SHIFT;
}
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index ee9c2e3a7199..4eb287e25043 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -505,19 +505,6 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
return young;
}
-
-void pmdp_splitting_flush(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmdp)
-{
- int set;
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
- (unsigned long *)pmdp);
- if (set) {
- /* need tlb flush only to serialize against gup-fast */
- flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
- }
-}
#endif
/**
diff --git a/arch/xtensa/mm/tlb.c b/arch/xtensa/mm/tlb.c
index 5ece856c5725..35c822286bbe 100644
--- a/arch/xtensa/mm/tlb.c
+++ b/arch/xtensa/mm/tlb.c
@@ -245,7 +245,7 @@ static int check_tlb_entry(unsigned w, unsigned e, bool dtlb)
page_mapcount(p));
if (!page_count(p))
rc |= TLB_INSANE;
- else if (page_mapped(p))
+ else if (page_mapcount(p))
rc |= TLB_SUSPICIOUS;
} else {
rc |= TLB_INSANE;
diff --git a/block/genhd.c b/block/genhd.c
index 78140b4eea4d..cd78712bab8e 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -852,7 +852,7 @@ static int show_partition(struct seq_file *seqf, void *v)
char buf[BDEVNAME_SIZE];
/* Don't show non-partitionable removeable devices or empty devices */
- if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
+ if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) &&
(sgp->flags & GENHD_FL_REMOVABLE)))
return 0;
if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c
index 6682c5daf742..6e6bc1059301 100644
--- a/drivers/acpi/apei/erst.c
+++ b/drivers/acpi/apei/erst.c
@@ -32,6 +32,7 @@
#include <linux/hardirq.h>
#include <linux/pstore.h>
#include <linux/vmalloc.h>
+#include <linux/mm.h> /* kvfree() */
#include <acpi/apei.h>
#include "apei-internal.h"
@@ -532,10 +533,7 @@ retry:
return -ENOMEM;
memcpy(new_entries, entries,
erst_record_id_cache.len * sizeof(entries[0]));
- if (erst_record_id_cache.size < PAGE_SIZE)
- kfree(entries);
- else
- vfree(entries);
+ kvfree(entries);
erst_record_id_cache.entries = entries = new_entries;
erst_record_id_cache.size = new_size;
}
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 91bbb1959d8d..691eeea2f19a 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -200,7 +200,7 @@ static const struct attribute_group *hotplugable_cpu_attr_groups[] = {
struct cpu_attr {
struct device_attribute attr;
- const struct cpumask *const * const map;
+ const struct cpumask *const map;
};
static ssize_t show_cpus_attr(struct device *dev,
@@ -209,7 +209,7 @@ static ssize_t show_cpus_attr(struct device *dev,
{
struct cpu_attr *ca = container_of(attr, struct cpu_attr, attr);
- return cpumap_print_to_pagebuf(true, buf, *ca->map);
+ return cpumap_print_to_pagebuf(true, buf, ca->map);
}
#define _CPU_ATTR(name, map) \
@@ -217,9 +217,9 @@ static ssize_t show_cpus_attr(struct device *dev,
/* Keep in sync with cpu_subsys_attrs */
static struct cpu_attr cpu_attrs[] = {
- _CPU_ATTR(online, &cpu_online_mask),
- _CPU_ATTR(possible, &cpu_possible_mask),
- _CPU_ATTR(present, &cpu_present_mask),
+ _CPU_ATTR(online, &__cpu_online_mask),
+ _CPU_ATTR(possible, &__cpu_possible_mask),
+ _CPU_ATTR(present, &__cpu_present_mask),
};
/*
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 0dabc9b93725..92d6fc020a65 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -364,12 +364,9 @@ static void bm_free_pages(struct page **pages, unsigned long number)
}
}
-static void bm_vk_free(void *ptr, int v)
+static inline void bm_vk_free(void *ptr)
{
- if (v)
- vfree(ptr);
- else
- kfree(ptr);
+ kvfree(ptr);
}
/*
@@ -379,7 +376,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
{
struct page **old_pages = b->bm_pages;
struct page **new_pages, *page;
- unsigned int i, bytes, vmalloced = 0;
+ unsigned int i, bytes;
unsigned long have = b->bm_number_of_pages;
BUG_ON(have == 0 && old_pages != NULL);
@@ -401,7 +398,6 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
PAGE_KERNEL);
if (!new_pages)
return NULL;
- vmalloced = 1;
}
if (want >= have) {
@@ -411,7 +407,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
if (!page) {
bm_free_pages(new_pages + have, i - have);
- bm_vk_free(new_pages, vmalloced);
+ bm_vk_free(new_pages);
return NULL;
}
/* we want to know which page it is
@@ -427,11 +423,6 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
*/
}
- if (vmalloced)
- b->bm_flags |= BM_P_VMALLOCED;
- else
- b->bm_flags &= ~BM_P_VMALLOCED;
-
return new_pages;
}
@@ -469,7 +460,7 @@ void drbd_bm_cleanup(struct drbd_device *device)
if (!expect(device->bitmap))
return;
bm_free_pages(device->bitmap->bm_pages, device->bitmap->bm_number_of_pages);
- bm_vk_free(device->bitmap->bm_pages, (BM_P_VMALLOCED & device->bitmap->bm_flags));
+ bm_vk_free(device->bitmap->bm_pages);
kfree(device->bitmap);
device->bitmap = NULL;
}
@@ -643,7 +634,6 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi
unsigned long want, have, onpages; /* number of pages */
struct page **npages, **opages = NULL;
int err = 0, growing;
- int opages_vmalloced;
if (!expect(b))
return -ENOMEM;
@@ -656,8 +646,6 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi
if (capacity == b->bm_dev_capacity)
goto out;
- opages_vmalloced = (BM_P_VMALLOCED & b->bm_flags);
-
if (capacity == 0) {
spin_lock_irq(&b->bm_lock);
opages = b->bm_pages;
@@ -671,7 +659,7 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi
b->bm_dev_capacity = 0;
spin_unlock_irq(&b->bm_lock);
bm_free_pages(opages, onpages);
- bm_vk_free(opages, opages_vmalloced);
+ bm_vk_free(opages);
goto out;
}
bits = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT));
@@ -744,7 +732,7 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi
spin_unlock_irq(&b->bm_lock);
if (opages != npages)
- bm_vk_free(opages, opages_vmalloced);
+ bm_vk_free(opages);
if (!growing)
b->bm_set = bm_count_bits(b);
drbd_info(device, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want);
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index b6844feb9f9b..34bc84efc29e 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -536,9 +536,6 @@ struct drbd_bitmap; /* opaque for drbd_device */
/* definition of bits in bm_flags to be used in drbd_bm_lock
* and drbd_bitmap_io and friends. */
enum bm_flag {
- /* do we need to kfree, or vfree bm_pages? */
- BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */
-
/* currently locked for bulk operation */
BM_LOCKED_MASK = 0xf,
diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c
index f1d7fa45c275..f3f92d5fcda0 100644
--- a/drivers/char/mspec.c
+++ b/drivers/char/mspec.c
@@ -93,14 +93,11 @@ struct vma_data {
spinlock_t lock; /* Serialize access to this structure. */
int count; /* Number of pages allocated. */
enum mspec_page_type type; /* Type of pages allocated. */
- int flags; /* See VMD_xxx below. */
unsigned long vm_start; /* Original (unsplit) base. */
unsigned long vm_end; /* Original (unsplit) end. */
unsigned long maddr[0]; /* Array of MSPEC addresses. */
};
-#define VMD_VMALLOCED 0x1 /* vmalloc'd rather than kmalloc'd */
-
/* used on shub2 to clear FOP cache in the HUB */
static unsigned long scratch_page[MAX_NUMNODES];
#define SH2_AMO_CACHE_ENTRIES 4
@@ -185,10 +182,7 @@ mspec_close(struct vm_area_struct *vma)
"failed to zero page %ld\n", my_page);
}
- if (vdata->flags & VMD_VMALLOCED)
- vfree(vdata);
- else
- kfree(vdata);
+ kvfree(vdata);
}
/*
@@ -256,7 +250,7 @@ mspec_mmap(struct file *file, struct vm_area_struct *vma,
enum mspec_page_type type)
{
struct vma_data *vdata;
- int pages, vdata_size, flags = 0;
+ int pages, vdata_size;
if (vma->vm_pgoff != 0)
return -EINVAL;
@@ -271,16 +265,13 @@ mspec_mmap(struct file *file, struct vm_area_struct *vma,
vdata_size = sizeof(struct vma_data) + pages * sizeof(long);
if (vdata_size <= PAGE_SIZE)
vdata = kzalloc(vdata_size, GFP_KERNEL);
- else {
+ else
vdata = vzalloc(vdata_size);
- flags = VMD_VMALLOCED;
- }
if (!vdata)
return -ENOMEM;
vdata->vm_start = vma->vm_start;
vdata->vm_end = vma->vm_end;
- vdata->flags = flags;
vdata->type = type;
spin_lock_init(&vdata->lock);
atomic_set(&vdata->refcnt, 1);
diff --git a/drivers/firmware/broadcom/bcm47xx_nvram.c b/drivers/firmware/broadcom/bcm47xx_nvram.c
index e41594510b97..0c2f0a61b0ea 100644
--- a/drivers/firmware/broadcom/bcm47xx_nvram.c
+++ b/drivers/firmware/broadcom/bcm47xx_nvram.c
@@ -56,9 +56,7 @@ static u32 find_nvram_size(void __iomem *end)
static int nvram_find_and_copy(void __iomem *iobase, u32 lim)
{
struct nvram_header __iomem *header;
- int i;
u32 off;
- u32 *src, *dst;
u32 size;
if (nvram_len) {
@@ -95,10 +93,7 @@ static int nvram_find_and_copy(void __iomem *iobase, u32 lim)
return -ENXIO;
found:
- src = (u32 *)header;
- dst = (u32 *)nvram_buf;
- for (i = 0; i < sizeof(struct nvram_header); i += 4)
- *dst++ = __raw_readl(src++);
+ __ioread32_copy(nvram_buf, header, sizeof(*header) / 4);
header = (struct nvram_header *)nvram_buf;
nvram_len = header->len;
if (nvram_len > size) {
@@ -111,8 +106,8 @@ found:
nvram_len = NVRAM_SPACE - 1;
}
/* proceed reading data after header */
- for (; i < nvram_len; i += 4)
- *dst++ = readl(src++);
+ __ioread32_copy(nvram_buf + sizeof(*header), header + 1,
+ DIV_ROUND_UP(nvram_len, 4));
nvram_buf[NVRAM_SPACE - 1] = '\0';
return 0;
diff --git a/drivers/gpu/drm/drm_hashtab.c b/drivers/gpu/drm/drm_hashtab.c
index c3b80fd65d62..7b30b307674b 100644
--- a/drivers/gpu/drm/drm_hashtab.c
+++ b/drivers/gpu/drm/drm_hashtab.c
@@ -198,10 +198,7 @@ EXPORT_SYMBOL(drm_ht_remove_item);
void drm_ht_remove(struct drm_open_hash *ht)
{
if (ht->table) {
- if ((PAGE_SIZE / sizeof(*ht->table)) >> ht->order)
- kfree(ht->table);
- else
- vfree(ht->table);
+ kvfree(ht->table);
ht->table = NULL;
}
}
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index 8966f85ca4cd..d42a8ff9e2e4 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -433,16 +433,15 @@ ssize_t iio_format_value(char *buf, unsigned int type, int size, int *vals)
scale_db = true;
case IIO_VAL_INT_PLUS_MICRO:
if (vals[1] < 0)
- return sprintf(buf, "-%ld.%06u%s\n", abs(vals[0]),
- -vals[1],
- scale_db ? " dB" : "");
+ return sprintf(buf, "-%d.%06u%s\n", abs(vals[0]),
+ -vals[1], scale_db ? " dB" : "");
else
return sprintf(buf, "%d.%06u%s\n", vals[0], vals[1],
scale_db ? " dB" : "");
case IIO_VAL_INT_PLUS_NANO:
if (vals[1] < 0)
- return sprintf(buf, "-%ld.%09u\n", abs(vals[0]),
- -vals[1]);
+ return sprintf(buf, "-%d.%09u\n", abs(vals[0]),
+ -vals[1]);
else
return sprintf(buf, "%d.%09u\n", vals[0], vals[1]);
case IIO_VAL_FRACTIONAL:
diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/calib.c b/drivers/net/wireless/intel/iwlwifi/dvm/calib.c
index 20e6aa910700..c148085742a0 100644
--- a/drivers/net/wireless/intel/iwlwifi/dvm/calib.c
+++ b/drivers/net/wireless/intel/iwlwifi/dvm/calib.c
@@ -901,7 +901,7 @@ static void iwlagn_gain_computation(struct iwl_priv *priv,
/* bound gain by 2 bits value max, 3rd bit is sign */
data->delta_gain_code[i] =
min(abs(delta_g),
- (long) CHAIN_NOISE_MAX_DELTA_GAIN_CODE);
+ (s32) CHAIN_NOISE_MAX_DELTA_GAIN_CODE);
if (delta_g < 0)
/*
diff --git a/drivers/soc/qcom/smd.c b/drivers/soc/qcom/smd.c
index 86b598cff91a..498fd0581a45 100644
--- a/drivers/soc/qcom/smd.c
+++ b/drivers/soc/qcom/smd.c
@@ -434,20 +434,15 @@ static void smd_copy_to_fifo(void __iomem *dst,
/*
* Copy count bytes of data using 32bit accesses, if that is required.
*/
-static void smd_copy_from_fifo(void *_dst,
- const void __iomem *_src,
+static void smd_copy_from_fifo(void *dst,
+ const void __iomem *src,
size_t count,
bool word_aligned)
{
- u32 *dst = (u32 *)_dst;
- u32 *src = (u32 *)_src;
-
if (word_aligned) {
- count /= sizeof(u32);
- while (count--)
- *dst++ = __raw_readl(src++);
+ __ioread32_copy(dst, src, count / sizeof(u32));
} else {
- memcpy_fromio(_dst, _src, count);
+ memcpy_fromio(dst, src, count);
}
}
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h
index d6273e143324..a80d993b882e 100644
--- a/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h
+++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h
@@ -151,16 +151,12 @@ do { \
#define LIBCFS_FREE(ptr, size) \
do { \
- int s = (size); \
if (unlikely((ptr) == NULL)) { \
CERROR("LIBCFS: free NULL '" #ptr "' (%d bytes) at " \
- "%s:%d\n", s, __FILE__, __LINE__); \
+ "%s:%d\n", (int)(size), __FILE__, __LINE__); \
break; \
} \
- if (unlikely(s > LIBCFS_VMALLOC_SIZE)) \
- vfree(ptr); \
- else \
- kfree(ptr); \
+ kvfree(ptr); \
} while (0)
/******************************************************************************/
diff --git a/drivers/staging/lustre/lustre/llite/super25.c b/drivers/staging/lustre/lustre/llite/super25.c
index 7a9fafc67693..86c371ef71ea 100644
--- a/drivers/staging/lustre/lustre/llite/super25.c
+++ b/drivers/staging/lustre/lustre/llite/super25.c
@@ -106,7 +106,8 @@ static int __init init_lustre_lite(void)
rc = -ENOMEM;
ll_inode_cachep = kmem_cache_create("lustre_inode_cache",
sizeof(struct ll_inode_info),
- 0, SLAB_HWCACHE_ALIGN, NULL);
+ 0, SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT,
+ NULL);
if (ll_inode_cachep == NULL)
goto out_cache;
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 6caca025019d..072e7599583a 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -575,7 +575,7 @@ static int v9fs_init_inode_cache(void)
v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache",
sizeof(struct v9fs_inode),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
v9fs_inode_init_once);
if (!v9fs_inode_cache)
return -ENOMEM;
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index ea4aba56f29d..fadf408bdd46 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -44,24 +44,24 @@ struct adfs_dir_ops;
*/
struct adfs_sb_info {
union { struct {
- struct adfs_discmap *s_map; /* bh list containing map */
- const struct adfs_dir_ops *s_dir; /* directory operations */
+ struct adfs_discmap *s_map; /* bh list containing map */
+ const struct adfs_dir_ops *s_dir; /* directory operations */
};
- struct rcu_head rcu; /* used only at shutdown time */
+ struct rcu_head rcu; /* used only at shutdown time */
};
- kuid_t s_uid; /* owner uid */
- kgid_t s_gid; /* owner gid */
- umode_t s_owner_mask; /* ADFS owner perm -> unix perm */
- umode_t s_other_mask; /* ADFS other perm -> unix perm */
+ kuid_t s_uid; /* owner uid */
+ kgid_t s_gid; /* owner gid */
+ umode_t s_owner_mask; /* ADFS owner perm -> unix perm */
+ umode_t s_other_mask; /* ADFS other perm -> unix perm */
int s_ftsuffix; /* ,xyz hex filetype suffix option */
- __u32 s_ids_per_zone; /* max. no ids in one zone */
- __u32 s_idlen; /* length of ID in map */
- __u32 s_map_size; /* sector size of a map */
- unsigned long s_size; /* total size (in blocks) of this fs */
- signed int s_map2blk; /* shift left by this for map->sector */
- unsigned int s_log2sharesize;/* log2 share size */
- __le32 s_version; /* disc format version */
+ __u32 s_ids_per_zone; /* max. no ids in one zone */
+ __u32 s_idlen; /* length of ID in map */
+ __u32 s_map_size; /* sector size of a map */
+ unsigned long s_size; /* total size (in blocks) of this fs */
+ signed int s_map2blk; /* shift left by this for map->sector*/
+ unsigned int s_log2sharesize;/* log2 share size */
+ __le32 s_version; /* disc format version */
unsigned int s_namelen; /* maximum number of characters in name */
};
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 4d4a0df8344f..c9fdfb112933 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -271,7 +271,7 @@ static int __init init_inodecache(void)
adfs_inode_cachep = kmem_cache_create("adfs_inode_cache",
sizeof(struct adfs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (adfs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 5b50c4ca43a7..84a84fcb5f5a 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -132,7 +132,7 @@ static int __init init_inodecache(void)
affs_inode_cachep = kmem_cache_create("affs_inode_cache",
sizeof(struct affs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (affs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 1fb4a5129f7d..81afefe7d8a6 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -91,7 +91,7 @@ int __init afs_fs_init(void)
afs_inode_cachep = kmem_cache_create("afs_inode_cache",
sizeof(struct afs_vnode),
0,
- SLAB_HWCACHE_ALIGN,
+ SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT,
afs_i_init_once);
if (!afs_inode_cachep) {
printk(KERN_NOTICE "kAFS: Failed to allocate inode cache\n");
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 46aedacfa6a8..2a23edf5703e 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -434,7 +434,7 @@ befs_init_inodecache(void)
befs_inode_cachep = kmem_cache_create("befs_inode_cache",
sizeof (struct befs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (befs_inode_cachep == NULL) {
pr_err("%s: Couldn't initialize inode slabcache\n", __func__);
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index fdcb4d69f430..1e5c896f6b79 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -270,7 +270,7 @@ static int __init init_inodecache(void)
bfs_inode_cachep = kmem_cache_create("bfs_inode_cache",
sizeof(struct bfs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (bfs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index c25639e907bd..c27362de0039 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -432,7 +432,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
if (!ops->rw_page || bdev_get_integrity(bdev))
return -EOPNOTSUPP;
- result = blk_queue_enter(bdev->bd_queue, GFP_KERNEL);
+ result = blk_queue_enter(bdev->bd_queue, GFP_NOIO);
if (result)
return result;
@@ -590,7 +590,7 @@ void __init bdev_cache_init(void)
bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_PANIC),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC),
init_once);
err = register_filesystem(&bd_type);
if (err)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a70c5790f8f5..2c858a80b906 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9160,7 +9160,8 @@ int btrfs_init_cachep(void)
{
btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
sizeof(struct btrfs_inode), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
+ init_once);
if (!btrfs_inode_cachep)
goto fail;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index f446afada328..ca4d5e8457f1 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -639,8 +639,8 @@ static int __init init_caches(void)
ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
sizeof(struct ceph_inode_info),
__alignof__(struct ceph_inode_info),
- (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
- ceph_inode_init_once);
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+ SLAB_ACCOUNT, ceph_inode_init_once);
if (ceph_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index cbc0f4bca0c0..c790423e98ee 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1032,7 +1032,7 @@ cifs_init_inodecache(void)
cifs_inode_cachep = kmem_cache_create("cifs_inode_cache",
sizeof(struct cifsInodeInfo),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
cifs_init_once);
if (cifs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 0068e82217c3..0a2752b79e72 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3391,13 +3391,13 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
* should have access to this page, we're safe to simply set
* PG_locked without checking it first.
*/
- __set_page_locked(page);
+ __SetPageLocked(page);
rc = add_to_page_cache_locked(page, mapping,
page->index, gfp);
/* give up if we can't stick it in the cache */
if (rc) {
- __clear_page_locked(page);
+ __ClearPageLocked(page);
return rc;
}
@@ -3418,9 +3418,9 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
if (*bytes + PAGE_CACHE_SIZE > rsize)
break;
- __set_page_locked(page);
+ __SetPageLocked(page);
if (add_to_page_cache_locked(page, mapping, page->index, gfp)) {
- __clear_page_locked(page);
+ __ClearPageLocked(page);
break;
}
list_move_tail(&page->lru, tmplist);
diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h
index f829fe963f5b..5104d84c4f64 100644
--- a/fs/coda/coda_linux.h
+++ b/fs/coda/coda_linux.h
@@ -72,8 +72,7 @@ void coda_sysctl_clean(void);
} while (0)
-#define CODA_FREE(ptr,size) \
- do { if (size < PAGE_SIZE) kfree((ptr)); else vfree((ptr)); } while (0)
+#define CODA_FREE(ptr, size) kvfree((ptr))
/* inode to cnode access functions */
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index cac1390b87a3..57e81cbba0fa 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -74,9 +74,9 @@ static void init_once(void *foo)
int __init coda_init_inodecache(void)
{
coda_inode_cachep = kmem_cache_create("coda_inode_cache",
- sizeof(struct coda_inode_info),
- 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
- init_once);
+ sizeof(struct coda_inode_info), 0,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+ SLAB_ACCOUNT, init_once);
if (coda_inode_cachep == NULL)
return -ENOMEM;
return 0;
diff --git a/fs/dcache.c b/fs/dcache.c
index 670f7896945b..98e4e254018d 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1571,7 +1571,8 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
dentry->d_iname[DNAME_INLINE_LEN-1] = 0;
if (name->len > DNAME_INLINE_LEN-1) {
size_t size = offsetof(struct external_name, name[1]);
- struct external_name *p = kmalloc(size + name->len, GFP_KERNEL);
+ struct external_name *p = kmalloc(size + name->len,
+ GFP_KERNEL_ACCOUNT);
if (!p) {
kmem_cache_free(dentry_cache, dentry);
return NULL;
@@ -3415,7 +3416,7 @@ static void __init dcache_init(void)
* of the dcache.
*/
dentry_cache = KMEM_CACHE(dentry,
- SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
+ SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT);
/* Hash may have been set up in dcache_init_early */
if (!hashdist)
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index e83f31cfd96c..83aa5aef93ee 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -663,6 +663,7 @@ static struct ecryptfs_cache_info {
struct kmem_cache **cache;
const char *name;
size_t size;
+ unsigned long flags;
void (*ctor)(void *obj);
} ecryptfs_cache_infos[] = {
{
@@ -684,6 +685,7 @@ static struct ecryptfs_cache_info {
.cache = &ecryptfs_inode_info_cache,
.name = "ecryptfs_inode_cache",
.size = sizeof(struct ecryptfs_inode_info),
+ .flags = SLAB_ACCOUNT,
.ctor = inode_info_init_once,
},
{
@@ -754,8 +756,8 @@ static int ecryptfs_init_kmem_caches(void)
struct ecryptfs_cache_info *info;
info = &ecryptfs_cache_infos[i];
- *(info->cache) = kmem_cache_create(info->name, info->size,
- 0, SLAB_HWCACHE_ALIGN, info->ctor);
+ *(info->cache) = kmem_cache_create(info->name, info->size, 0,
+ SLAB_HWCACHE_ALIGN | info->flags, info->ctor);
if (!*(info->cache)) {
ecryptfs_free_kmem_caches();
ecryptfs_printk(KERN_WARNING, "%s: "
diff --git a/fs/efs/super.c b/fs/efs/super.c
index c8411a30f7da..cb68dac4f9d3 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -94,9 +94,9 @@ static void init_once(void *foo)
static int __init init_inodecache(void)
{
efs_inode_cachep = kmem_cache_create("efs_inode_cache",
- sizeof(struct efs_inode_info),
- 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
- init_once);
+ sizeof(struct efs_inode_info), 0,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+ SLAB_ACCOUNT, init_once);
if (efs_inode_cachep == NULL)
return -ENOMEM;
return 0;
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 73c64daa0f55..60f03b78914e 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -592,10 +592,7 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
}
unlock_page(page);
}
- if (PageDirty(page) || PageWriteback(page))
- *uptodate = true;
- else
- *uptodate = PageUptodate(page);
+ *uptodate = PageUptodate(page);
EXOFS_DBGMSG2("index=0x%lx uptodate=%d\n", index, *uptodate);
return page;
} else {
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index b795c567b5e1..6658a50530a0 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -194,8 +194,8 @@ static int init_inodecache(void)
{
exofs_inode_cachep = kmem_cache_create("exofs_inode_cache",
sizeof(struct exofs_i_info), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
- exofs_init_once);
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
+ SLAB_ACCOUNT, exofs_init_once);
if (exofs_inode_cachep == NULL)
return -ENOMEM;
return 0;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 748d35afc902..2a188413a2b0 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -203,7 +203,7 @@ static int __init init_inodecache(void)
ext2_inode_cachep = kmem_cache_create("ext2_inode_cache",
sizeof(struct ext2_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (ext2_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 8850254136ae..7002467bfbac 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -106,7 +106,10 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
}
if (!journal) {
- ret = generic_file_fsync(file, start, end, datasync);
+ if (test_opt(inode->i_sb, BARRIER))
+ ret = generic_file_fsync(file, start, end, datasync);
+ else
+ ret = __generic_file_fsync(file, start, end, datasync);
if (!ret && !hlist_empty(&inode->i_dentry))
ret = ext4_sync_parent(inode);
goto out;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c9ab67da6e5a..f1b56ff01208 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -966,7 +966,7 @@ static int __init init_inodecache(void)
ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
sizeof(struct ext4_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (ext4_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 3a65e0132352..862916c7e3f8 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1424,8 +1424,9 @@ MODULE_ALIAS_FS("f2fs");
static int __init init_inodecache(void)
{
- f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache",
- sizeof(struct f2fs_inode_info));
+ f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache",
+ sizeof(struct f2fs_inode_info), 0,
+ SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, NULL);
if (!f2fs_inode_cachep)
return -ENOMEM;
return 0;
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 93fc62232ec2..5d384921524d 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -301,15 +301,59 @@ static int fat_bmap_cluster(struct inode *inode, int cluster)
return dclus;
}
-int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
- unsigned long *mapped_blocks, int create)
+int fat_get_mapped_cluster(struct inode *inode, sector_t sector,
+ sector_t last_block,
+ unsigned long *mapped_blocks, sector_t *bmap)
{
struct super_block *sb = inode->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ int cluster, offset;
+
+ cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits);
+ offset = sector & (sbi->sec_per_clus - 1);
+ cluster = fat_bmap_cluster(inode, cluster);
+ if (cluster < 0)
+ return cluster;
+ else if (cluster) {
+ *bmap = fat_clus_to_blknr(sbi, cluster) + offset;
+ *mapped_blocks = sbi->sec_per_clus - offset;
+ if (*mapped_blocks > last_block - sector)
+ *mapped_blocks = last_block - sector;
+ }
+
+ return 0;
+}
+
+static int is_exceed_eof(struct inode *inode, sector_t sector,
+ sector_t *last_block, int create)
+{
+ struct super_block *sb = inode->i_sb;
const unsigned long blocksize = sb->s_blocksize;
const unsigned char blocksize_bits = sb->s_blocksize_bits;
+
+ *last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits;
+ if (sector >= *last_block) {
+ if (!create)
+ return 1;
+
+ /*
+ * ->mmu_private can access on only allocation path.
+ * (caller must hold ->i_mutex)
+ */
+ *last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1))
+ >> blocksize_bits;
+ if (sector >= *last_block)
+ return 1;
+ }
+
+ return 0;
+}
+
+int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
+ unsigned long *mapped_blocks, int create, bool from_bmap)
+{
+ struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
sector_t last_block;
- int cluster, offset;
*phys = 0;
*mapped_blocks = 0;
@@ -321,31 +365,16 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
return 0;
}
- last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits;
- if (sector >= last_block) {
- if (!create)
+ if (!from_bmap) {
+ if (is_exceed_eof(inode, sector, &last_block, create))
return 0;
-
- /*
- * ->mmu_private can access on only allocation path.
- * (caller must hold ->i_mutex)
- */
- last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1))
- >> blocksize_bits;
+ } else {
+ last_block = inode->i_blocks >>
+ (inode->i_sb->s_blocksize_bits - 9);
if (sector >= last_block)
return 0;
}
- cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits);
- offset = sector & (sbi->sec_per_clus - 1);
- cluster = fat_bmap_cluster(inode, cluster);
- if (cluster < 0)
- return cluster;
- else if (cluster) {
- *phys = fat_clus_to_blknr(sbi, cluster) + offset;
- *mapped_blocks = sbi->sec_per_clus - offset;
- if (*mapped_blocks > last_block - sector)
- *mapped_blocks = last_block - sector;
- }
- return 0;
+ return fat_get_mapped_cluster(inode, sector, last_block, mapped_blocks,
+ phys);
}
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 8b2127ffb226..7def96caec5f 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -91,7 +91,7 @@ next:
*bh = NULL;
iblock = *pos >> sb->s_blocksize_bits;
- err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0);
+ err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0, false);
if (err || !phys)
return -1; /* beyond EOF or error */
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index be5e15323bab..4307cd4f8da0 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -285,8 +285,11 @@ static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len)
extern void fat_cache_inval_inode(struct inode *inode);
extern int fat_get_cluster(struct inode *inode, int cluster,
int *fclus, int *dclus);
+extern int fat_get_mapped_cluster(struct inode *inode, sector_t sector,
+ sector_t last_block,
+ unsigned long *mapped_blocks, sector_t *bmap);
extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
- unsigned long *mapped_blocks, int create);
+ unsigned long *mapped_blocks, int create, bool from_bmap);
/* fat/dir.c */
extern const struct file_operations fat_dir_operations;
@@ -384,6 +387,7 @@ static inline unsigned long fat_dir_hash(int logstart)
{
return hash_32(logstart, FAT_HASH_BITS);
}
+extern int fat_add_cluster(struct inode *inode);
/* fat/misc.c */
extern __printf(3, 4) __cold
diff --git a/fs/fat/file.c b/fs/fat/file.c
index a08f1039909a..43d3475da83a 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -14,8 +14,12 @@
#include <linux/backing-dev.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
+#include <linux/falloc.h>
#include "fat.h"
+static long fat_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t len);
+
static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)
{
u32 attr;
@@ -177,6 +181,7 @@ const struct file_operations fat_file_operations = {
#endif
.fsync = fat_file_fsync,
.splice_read = generic_file_splice_read,
+ .fallocate = fat_fallocate,
};
static int fat_cont_expand(struct inode *inode, loff_t size)
@@ -215,6 +220,62 @@ out:
return err;
}
+/*
+ * Preallocate space for a file. This implements fat's fallocate file
+ * operation, which gets called from sys_fallocate system call. User
+ * space requests len bytes at offset. If FALLOC_FL_KEEP_SIZE is set
+ * we just allocate clusters without zeroing them out. Otherwise we
+ * allocate and zero out clusters via an expanding truncate.
+ */
+static long fat_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t len)
+{
+ int nr_cluster; /* Number of clusters to be allocated */
+ loff_t mm_bytes; /* Number of bytes to be allocated for file */
+ loff_t ondisksize; /* block aligned on-disk size in bytes*/
+ struct inode *inode = file->f_mapping->host;
+ struct super_block *sb = inode->i_sb;
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ int err = 0;
+
+ /* No support for hole punch or other fallocate flags. */
+ if (mode & ~FALLOC_FL_KEEP_SIZE)
+ return -EOPNOTSUPP;
+
+ /* No support for dir */
+ if (!S_ISREG(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ mutex_lock(&inode->i_mutex);
+ if (mode & FALLOC_FL_KEEP_SIZE) {
+ ondisksize = inode->i_blocks << 9;
+ if ((offset + len) <= ondisksize)
+ goto error;
+
+ /* First compute the number of clusters to be allocated */
+ mm_bytes = offset + len - ondisksize;
+ nr_cluster = (mm_bytes + (sbi->cluster_size - 1)) >>
+ sbi->cluster_bits;
+
+ /* Start the allocation.We are not zeroing out the clusters */
+ while (nr_cluster-- > 0) {
+ err = fat_add_cluster(inode);
+ if (err)
+ goto error;
+ }
+ } else {
+ if ((offset + len) <= i_size_read(inode))
+ goto error;
+
+ /* This is just an expanding truncate */
+ err = fat_cont_expand(inode, (offset + len));
+ }
+
+error:
+ mutex_unlock(&inode->i_mutex);
+ return err;
+}
+
/* Free all clusters after the skip'th cluster. */
static int fat_free(struct inode *inode, int skip)
{
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 509411dd3698..94902b3c632c 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -93,7 +93,7 @@ static struct fat_floppy_defaults {
},
};
-static int fat_add_cluster(struct inode *inode)
+int fat_add_cluster(struct inode *inode)
{
int err, cluster;
@@ -115,10 +115,10 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
struct super_block *sb = inode->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
unsigned long mapped_blocks;
- sector_t phys;
+ sector_t phys, last_block;
int err, offset;
- err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
+ err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false);
if (err)
return err;
if (phys) {
@@ -135,8 +135,14 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
return -EIO;
}
+ last_block = inode->i_blocks >> (sb->s_blocksize_bits - 9);
offset = (unsigned long)iblock & (sbi->sec_per_clus - 1);
- if (!offset) {
+ /*
+ * allocate a cluster according to the following.
+ * 1) no more available blocks
+ * 2) not part of fallocate region
+ */
+ if (!offset && !(iblock < last_block)) {
/* TODO: multiple cluster allocation would be desirable. */
err = fat_add_cluster(inode);
if (err)
@@ -148,7 +154,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
*max_blocks = min(mapped_blocks, *max_blocks);
MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits;
- err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
+ err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false);
if (err)
return err;
@@ -273,13 +279,38 @@ static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
return ret;
}
+static int fat_get_block_bmap(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ struct super_block *sb = inode->i_sb;
+ unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
+ int err;
+ sector_t bmap;
+ unsigned long mapped_blocks;
+
+ BUG_ON(create != 0);
+
+ err = fat_bmap(inode, iblock, &bmap, &mapped_blocks, create, true);
+ if (err)
+ return err;
+
+ if (bmap) {
+ map_bh(bh_result, sb, bmap);
+ max_blocks = min(mapped_blocks, max_blocks);
+ }
+
+ bh_result->b_size = max_blocks << sb->s_blocksize_bits;
+
+ return 0;
+}
+
static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
{
sector_t blocknr;
/* fat_get_cluster() assumes the requested blocknr isn't truncated. */
down_read(&MSDOS_I(mapping->host)->truncate_lock);
- blocknr = generic_block_bmap(mapping, block, fat_get_block);
+ blocknr = generic_block_bmap(mapping, block, fat_get_block_bmap);
up_read(&MSDOS_I(mapping->host)->truncate_lock);
return blocknr;
@@ -553,13 +584,43 @@ out:
EXPORT_SYMBOL_GPL(fat_build_inode);
+static int __fat_write_inode(struct inode *inode, int wait);
+
+static void fat_free_eofblocks(struct inode *inode)
+{
+ /* Release unwritten fallocated blocks on inode eviction. */
+ if ((inode->i_blocks << 9) >
+ round_up(MSDOS_I(inode)->mmu_private,
+ MSDOS_SB(inode->i_sb)->cluster_size)) {
+ int err;
+
+ fat_truncate_blocks(inode, MSDOS_I(inode)->mmu_private);
+ /* Fallocate results in updating the i_start/iogstart
+ * for the zero byte file. So, make it return to
+ * original state during evict and commit it to avoid
+ * any corruption on the next access to the cluster
+ * chain for the file.
+ */
+ err = __fat_write_inode(inode, inode_needs_sync(inode));
+ if (err) {
+ fat_msg(inode->i_sb, KERN_WARNING, "Failed to "
+ "update on disk inode for unused "
+ "fallocated blocks, inode could be "
+ "corrupted. Please run fsck");
+ }
+
+ }
+}
+
static void fat_evict_inode(struct inode *inode)
{
truncate_inode_pages_final(&inode->i_data);
if (!inode->i_nlink) {
inode->i_size = 0;
fat_truncate_blocks(inode, 0);
- }
+ } else
+ fat_free_eofblocks(inode);
+
invalidate_inode_buffers(inode);
clear_inode(inode);
fat_cache_inval_inode(inode);
@@ -677,7 +738,7 @@ static int __init fat_init_inodecache(void)
fat_inode_cachep = kmem_cache_create("fat_inode_cache",
sizeof(struct msdos_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (fat_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/file.c b/fs/file.c
index 1aed0add16a2..1fbc5c0555a9 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -37,11 +37,12 @@ static void *alloc_fdmem(size_t size)
* vmalloc() if the allocation size will be considered "large" by the VM.
*/
if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
- void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY);
+ void *data = kmalloc(size, GFP_KERNEL_ACCOUNT |
+ __GFP_NOWARN | __GFP_NORETRY);
if (data != NULL)
return data;
}
- return vmalloc(size);
+ return __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM, PAGE_KERNEL);
}
static void __free_fdtable(struct fdtable *fdt)
@@ -126,7 +127,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
if (unlikely(nr > sysctl_nr_open))
nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;
- fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
+ fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
if (!fdt)
goto out;
fdt->max_fds = nr;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2913db2a5b99..4d69d5c0bedc 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1255,8 +1255,8 @@ static int __init fuse_fs_init(void)
int err;
fuse_inode_cachep = kmem_cache_create("fuse_inode",
- sizeof(struct fuse_inode),
- 0, SLAB_HWCACHE_ALIGN,
+ sizeof(struct fuse_inode), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT,
fuse_inode_init_once);
err = -ENOMEM;
if (!fuse_inode_cachep)
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 1d709d496364..f99f8e94de3f 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -114,7 +114,8 @@ static int __init init_gfs2_fs(void)
gfs2_inode_cachep = kmem_cache_create("gfs2_inode",
sizeof(struct gfs2_inode),
0, SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD,
+ SLAB_MEM_SPREAD|
+ SLAB_ACCOUNT,
gfs2_init_inode_once);
if (!gfs2_inode_cachep)
goto fail;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 4574fdd3d421..1ca95c232bb5 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -483,8 +483,8 @@ static int __init init_hfs_fs(void)
int err;
hfs_inode_cachep = kmem_cache_create("hfs_inode_cache",
- sizeof(struct hfs_inode_info), 0, SLAB_HWCACHE_ALIGN,
- hfs_init_once);
+ sizeof(struct hfs_inode_info), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, hfs_init_once);
if (!hfs_inode_cachep)
return -ENOMEM;
err = register_filesystem(&hfs_fs_type);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 7302d96ae8bf..5d54490a136d 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -663,7 +663,7 @@ static int __init init_hfsplus_fs(void)
int err;
hfsplus_inode_cachep = kmem_cache_create("hfsplus_icache",
- HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN,
+ HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT,
hfsplus_init_once);
if (!hfsplus_inode_cachep)
return -ENOMEM;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 2ac99db3750e..a4cf6b11a142 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -223,7 +223,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb)
{
struct hostfs_inode_info *hi;
- hi = kmalloc(sizeof(*hi), GFP_KERNEL);
+ hi = kmalloc(sizeof(*hi), GFP_KERNEL_ACCOUNT);
if (hi == NULL)
return NULL;
hi->fd = -1;
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index a561591896bd..458cf463047b 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -261,7 +261,7 @@ static int init_inodecache(void)
hpfs_inode_cachep = kmem_cache_create("hpfs_inode_cache",
sizeof(struct hpfs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (hpfs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index de4bdfac0cec..9005485919a1 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -738,7 +738,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
/*
* The policy is initialized here even if we are creating a
* private inode because initialization simply creates an
- * an empty rb tree and calls spin_lock_init(), later when we
+ * an empty rb tree and calls rwlock_init(), later when we
* call mpol_free_shared_policy() it will just return because
* the rb tree will still be empty.
*/
@@ -1321,7 +1321,7 @@ static int __init init_hugetlbfs_fs(void)
error = -ENOMEM;
hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
sizeof(struct hugetlbfs_inode_info),
- 0, 0, init_once);
+ 0, SLAB_ACCOUNT, init_once);
if (hugetlbfs_inode_cachep == NULL)
goto out2;
diff --git a/fs/inode.c b/fs/inode.c
index 1be5f9003eb3..59e55ee25d0a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1883,7 +1883,7 @@ void __init inode_init(void)
sizeof(struct inode),
0,
(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
/* Hash may have been set up in inode_init_early */
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index d67a16f2a45d..9bc2431d2df8 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -94,7 +94,7 @@ static int __init init_inodecache(void)
isofs_inode_cachep = kmem_cache_create("isofs_inode_cache",
sizeof(struct iso_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (isofs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index a3750f902adc..0ae91ad6df2d 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -17,6 +17,7 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mtd/mtd.h>
+#include <linux/mm.h> /* kvfree() */
#include "nodelist.h"
static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *,
@@ -383,12 +384,7 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c)
return 0;
out_free:
-#ifndef __ECOS
- if (jffs2_blocks_use_vmalloc(c))
- vfree(c->blocks);
- else
-#endif
- kfree(c->blocks);
+ kvfree(c->blocks);
return ret;
}
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 2caf1682036d..bead25ae8fe4 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -596,10 +596,7 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
out_root:
jffs2_free_ino_caches(c);
jffs2_free_raw_node_refs(c);
- if (jffs2_blocks_use_vmalloc(c))
- vfree(c->blocks);
- else
- kfree(c->blocks);
+ kvfree(c->blocks);
out_inohash:
jffs2_clear_xattr_subsystem(c);
kfree(c->inocache_list);
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index d86c5e3176a1..0a9a114bb9d1 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -331,10 +331,7 @@ static void jffs2_put_super (struct super_block *sb)
jffs2_free_ino_caches(c);
jffs2_free_raw_node_refs(c);
- if (jffs2_blocks_use_vmalloc(c))
- vfree(c->blocks);
- else
- kfree(c->blocks);
+ kvfree(c->blocks);
jffs2_flash_cleanup(c);
kfree(c->inocache_list);
jffs2_clear_xattr_subsystem(c);
@@ -387,7 +384,7 @@ static int __init init_jffs2_fs(void)
jffs2_inode_cachep = kmem_cache_create("jffs2_i",
sizeof(struct jffs2_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
jffs2_i_init_once);
if (!jffs2_inode_cachep) {
pr_err("error: Failed to initialise inode cache\n");
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 8f9176caf098..900925b5eb8c 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -898,7 +898,7 @@ static int __init init_jfs_fs(void)
jfs_inode_cachep =
kmem_cache_create("jfs_ip", sizeof(struct jfs_inode_info), 0,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
init_once);
if (jfs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 91e004518237..0239a0a76ed5 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -541,14 +541,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
if (!kn)
goto err_out1;
- /*
- * If the ino of the sysfs entry created for a kmem cache gets
- * allocated from an ida layer, which is accounted to the memcg that
- * owns the cache, the memcg will get pinned forever. So do not account
- * ino ida allocations.
- */
- ret = ida_simple_get(&root->ino_ida, 1, 0,
- GFP_KERNEL | __GFP_NOACCOUNT);
+ ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL);
if (ret < 0)
goto err_out2;
kn->ino = ret;
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index af49e2d6941a..5d65db2e03f4 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -408,7 +408,8 @@ const struct super_operations logfs_super_operations = {
int logfs_init_inode_cache(void)
{
logfs_inode_cache = kmem_cache_create("logfs_inode_cache",
- sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT,
+ sizeof(struct logfs_inode), 0,
+ SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
logfs_init_once);
if (!logfs_inode_cache)
return -ENOMEM;
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 086cd0a61e80..5942c3e10fa5 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -91,7 +91,7 @@ static int __init init_inodecache(void)
minix_inode_cachep = kmem_cache_create("minix_inode_cache",
sizeof(struct minix_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (minix_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 9605a2f63549..d80446e1a333 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -82,7 +82,7 @@ static int init_inodecache(void)
ncp_inode_cachep = kmem_cache_create("ncp_inode_cache",
sizeof(struct ncp_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (ncp_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 31b0a52223a7..354a0ae7f5f1 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1911,7 +1911,7 @@ static int __init nfs_init_inodecache(void)
nfs_inode_cachep = kmem_cache_create("nfs_inode_cache",
sizeof(struct nfs_inode),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (nfs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 5c0c6b58157f..9aebffb40505 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -476,10 +476,7 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
}
unlock_page(page);
}
- if (PageDirty(page) || PageWriteback(page))
- *uptodate = true;
- else
- *uptodate = PageUptodate(page);
+ *uptodate = PageUptodate(page);
dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate);
return page;
}
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 354013ea22ec..e97ed6be5ad9 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1418,7 +1418,8 @@ static int __init nilfs_init_cachep(void)
{
nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
sizeof(struct nilfs_inode_info), 0,
- SLAB_RECLAIM_ACCOUNT, nilfs_inode_init_once);
+ SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
+ nilfs_inode_init_once);
if (!nilfs_inode_cachep)
goto fail;
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index e785fd954c30..741077deef3b 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -199,8 +199,7 @@ void fsnotify_unmount_inodes(struct super_block *sb)
break;
}
spin_unlock(&next_i->i_lock);
- next_i = list_entry(next_i->i_sb_list.next,
- struct inode, i_sb_list);
+ next_i = list_next_entry(next_i, i_sb_list);
}
/*
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index fc0df4442f7b..cfcbf114676e 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -92,9 +92,6 @@
#include "fsnotify.h"
struct srcu_struct fsnotify_mark_srcu;
-static DEFINE_SPINLOCK(destroy_lock);
-static LIST_HEAD(destroy_list);
-static DECLARE_WAIT_QUEUE_HEAD(destroy_waitq);
void fsnotify_get_mark(struct fsnotify_mark *mark)
{
@@ -168,10 +165,19 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark)
atomic_dec(&group->num_marks);
}
+static void
+fsnotify_mark_free_rcu(struct rcu_head *rcu)
+{
+ struct fsnotify_mark *mark;
+
+ mark = container_of(rcu, struct fsnotify_mark, g_rcu);
+ fsnotify_put_mark(mark);
+}
+
/*
- * Free fsnotify mark. The freeing is actually happening from a kthread which
- * first waits for srcu period end. Caller must have a reference to the mark
- * or be protected by fsnotify_mark_srcu.
+ * Free fsnotify mark. The freeing is actually happening from a call_srcu
+ * callback. Caller must have a reference to the mark or be protected by
+ * fsnotify_mark_srcu.
*/
void fsnotify_free_mark(struct fsnotify_mark *mark)
{
@@ -186,10 +192,7 @@ void fsnotify_free_mark(struct fsnotify_mark *mark)
mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
spin_unlock(&mark->lock);
- spin_lock(&destroy_lock);
- list_add(&mark->g_list, &destroy_list);
- spin_unlock(&destroy_lock);
- wake_up(&destroy_waitq);
+ call_srcu(&fsnotify_mark_srcu, &mark->g_rcu, fsnotify_mark_free_rcu);
/*
* Some groups like to know that marks are being freed. This is a
@@ -385,11 +388,7 @@ err:
spin_unlock(&mark->lock);
- spin_lock(&destroy_lock);
- list_add(&mark->g_list, &destroy_list);
- spin_unlock(&destroy_lock);
- wake_up(&destroy_waitq);
-
+ call_srcu(&fsnotify_mark_srcu, &mark->g_rcu, fsnotify_mark_free_rcu);
return ret;
}
@@ -492,40 +491,3 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
atomic_set(&mark->refcnt, 1);
mark->free_mark = free_mark;
}
-
-static int fsnotify_mark_destroy(void *ignored)
-{
- struct fsnotify_mark *mark, *next;
- struct list_head private_destroy_list;
-
- for (;;) {
- spin_lock(&destroy_lock);
- /* exchange the list head */
- list_replace_init(&destroy_list, &private_destroy_list);
- spin_unlock(&destroy_lock);
-
- synchronize_srcu(&fsnotify_mark_srcu);
-
- list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) {
- list_del_init(&mark->g_list);
- fsnotify_put_mark(mark);
- }
-
- wait_event_interruptible(destroy_waitq, !list_empty(&destroy_list));
- }
-
- return 0;
-}
-
-static int __init fsnotify_mark_init(void)
-{
- struct task_struct *thread;
-
- thread = kthread_run(fsnotify_mark_destroy, NULL,
- "fsnotify_mark");
- if (IS_ERR(thread))
- panic("unable to start fsnotify mark destruction thread.");
-
- return 0;
-}
-device_initcall(fsnotify_mark_init);
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index d1a853585b53..2f77f8dfb861 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -3139,8 +3139,8 @@ static int __init init_ntfs_fs(void)
ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name,
sizeof(big_ntfs_inode), 0,
- SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
- ntfs_big_inode_init_once);
+ SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+ SLAB_ACCOUNT, ntfs_big_inode_init_once);
if (!ntfs_big_inode_cache) {
pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name);
goto big_inode_err_out;
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 86181d6526dc..c2cb51d4f6e9 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2516,21 +2516,6 @@ static int ocfs2_update_edge_lengths(handle_t *handle,
struct ocfs2_extent_block *eb;
u32 range;
- /*
- * In normal tree rotation process, we will never touch the
- * tree branch above subtree_index and ocfs2_extend_rotate_transaction
- * doesn't reserve the credits for them either.
- *
- * But we do have a special case here which will update the rightmost
- * records for all the bh in the path.
- * So we have to allocate extra credits and access them.
- */
- ret = ocfs2_extend_trans(handle, subtree_index);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
ret = ocfs2_journal_access_path(et->et_ci, handle, path);
if (ret) {
mlog_errno(ret);
@@ -2956,7 +2941,7 @@ static int __ocfs2_rotate_tree_left(handle_t *handle,
right_path->p_node[subtree_root].bh->b_blocknr,
right_path->p_tree_depth);
- ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
orig_credits, left_path);
if (ret) {
mlog_errno(ret);
@@ -3029,21 +3014,9 @@ static int ocfs2_remove_rightmost_path(handle_t *handle,
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
-
ret = ocfs2_et_sanity_check(et);
if (ret)
goto out;
- /*
- * There's two ways we handle this depending on
- * whether path is the only existing one.
- */
- ret = ocfs2_extend_rotate_transaction(handle, 0,
- handle->h_buffer_credits,
- path);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
ret = ocfs2_journal_access_path(et->et_ci, handle, path);
if (ret) {
@@ -3641,6 +3614,14 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
*/
if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
le16_to_cpu(el->l_next_free_rec) == 1) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ right_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
ret = ocfs2_remove_rightmost_path(handle, et,
right_path,
@@ -3679,6 +3660,14 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
/*
* The merge code will need to create an empty
* extent to take the place of the newly
@@ -3727,6 +3716,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
*/
BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
/* The merge left us with an empty extent, remove it. */
ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
if (ret) {
@@ -3748,6 +3746,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
goto out;
}
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
/*
* Error from this last rotate is not critical, so
@@ -3783,6 +3790,16 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
}
if (ctxt->c_split_covers_rec) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ ret = 0;
+ goto out;
+ }
+
/*
* The merge may have left an empty extent in
* our leaf. Try to rotate it away.
@@ -5342,6 +5359,15 @@ static int ocfs2_truncate_rec(handle_t *handle,
struct ocfs2_extent_block *eb;
if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
if (ret) {
mlog_errno(ret);
@@ -5928,16 +5954,6 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
ocfs2_journal_dirty(handle, tl_bh);
- /* TODO: Perhaps we can calculate the bulk of the
- * credits up front rather than extending like
- * this. */
- status = ocfs2_extend_trans(handle,
- OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
rec = tl->tl_recs[i];
start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
le32_to_cpu(rec.t_start));
@@ -5958,6 +5974,13 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
goto bail;
}
}
+
+ status = ocfs2_extend_trans(handle,
+ OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
i--;
}
@@ -6016,7 +6039,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
goto out_mutex;
}
- handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
+ handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
@@ -6079,7 +6102,7 @@ void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
if (cancel)
cancel_delayed_work(&osb->osb_truncate_log_wq);
- queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
+ queue_delayed_work(osb->ocfs2_wq, &osb->osb_truncate_log_wq,
OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
}
}
@@ -6254,7 +6277,7 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
if (tl_inode) {
cancel_delayed_work(&osb->osb_truncate_log_wq);
- flush_workqueue(ocfs2_wq);
+ flush_workqueue(osb->ocfs2_wq);
status = ocfs2_flush_truncate_log(osb);
if (status < 0)
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 7f604727f487..182f75449e08 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -499,153 +499,6 @@ bail:
return status;
}
-/*
- * TODO: Make this into a generic get_blocks function.
- *
- * From do_direct_io in direct-io.c:
- * "So what we do is to permit the ->get_blocks function to populate
- * bh.b_size with the size of IO which is permitted at this offset and
- * this i_blkbits."
- *
- * This function is called directly from get_more_blocks in direct-io.c.
- *
- * called like this: dio->get_blocks(dio->inode, fs_startblk,
- * fs_count, map_bh, dio->rw == WRITE);
- */
-static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
-{
- int ret;
- u32 cpos = 0;
- int alloc_locked = 0;
- u64 p_blkno, inode_blocks, contig_blocks;
- unsigned int ext_flags;
- unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
- unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
- unsigned long len = bh_result->b_size;
- unsigned int clusters_to_alloc = 0, contig_clusters = 0;
-
- cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock);
-
- /* This function won't even be called if the request isn't all
- * nicely aligned and of the right size, so there's no need
- * for us to check any of that. */
-
- inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
-
- down_read(&OCFS2_I(inode)->ip_alloc_sem);
-
- /* This figures out the size of the next contiguous block, and
- * our logical offset */
- ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
- &contig_blocks, &ext_flags);
- up_read(&OCFS2_I(inode)->ip_alloc_sem);
-
- if (ret) {
- mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
- (unsigned long long)iblock);
- ret = -EIO;
- goto bail;
- }
-
- /* We should already CoW the refcounted extent in case of create. */
- BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
-
- /* allocate blocks if no p_blkno is found, and create == 1 */
- if (!p_blkno && create) {
- ret = ocfs2_inode_lock(inode, NULL, 1);
- if (ret < 0) {
- mlog_errno(ret);
- goto bail;
- }
-
- alloc_locked = 1;
-
- down_write(&OCFS2_I(inode)->ip_alloc_sem);
-
- /* fill hole, allocate blocks can't be larger than the size
- * of the hole */
- clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
- contig_clusters = ocfs2_clusters_for_blocks(inode->i_sb,
- contig_blocks);
- if (clusters_to_alloc > contig_clusters)
- clusters_to_alloc = contig_clusters;
-
- /* allocate extent and insert them into the extent tree */
- ret = ocfs2_extend_allocation(inode, cpos,
- clusters_to_alloc, 0);
- if (ret < 0) {
- up_write(&OCFS2_I(inode)->ip_alloc_sem);
- mlog_errno(ret);
- goto bail;
- }
-
- ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
- &contig_blocks, &ext_flags);
- if (ret < 0) {
- up_write(&OCFS2_I(inode)->ip_alloc_sem);
- mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
- (unsigned long long)iblock);
- ret = -EIO;
- goto bail;
- }
- set_buffer_new(bh_result);
- up_write(&OCFS2_I(inode)->ip_alloc_sem);
- }
-
- /*
- * get_more_blocks() expects us to describe a hole by clearing
- * the mapped bit on bh_result().
- *
- * Consider an unwritten extent as a hole.
- */
- if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
- map_bh(bh_result, inode->i_sb, p_blkno);
- else
- clear_buffer_mapped(bh_result);
-
- /* make sure we don't map more than max_blocks blocks here as
- that's all the kernel will handle at this point. */
- if (max_blocks < contig_blocks)
- contig_blocks = max_blocks;
- bh_result->b_size = contig_blocks << blocksize_bits;
-bail:
- if (alloc_locked)
- ocfs2_inode_unlock(inode, 1);
- return ret;
-}
-
-/*
- * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
- * particularly interested in the aio/dio case. We use the rw_lock DLM lock
- * to protect io on one node from truncation on another.
- */
-static void ocfs2_dio_end_io(struct kiocb *iocb,
- loff_t offset,
- ssize_t bytes,
- void *private)
-{
- struct inode *inode = file_inode(iocb->ki_filp);
- int level;
-
- /* this io's submitter should not have unlocked this before we could */
- BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
-
- if (ocfs2_iocb_is_unaligned_aio(iocb)) {
- ocfs2_iocb_clear_unaligned_aio(iocb);
-
- mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
- }
-
- /* Let rw unlock to be done later to protect append direct io write */
- if (offset + bytes <= i_size_read(inode)) {
- ocfs2_iocb_clear_rw_locked(iocb);
-
- level = ocfs2_iocb_rw_locked_level(iocb);
- ocfs2_rw_unlock(inode, level);
- }
-}
-
static int ocfs2_releasepage(struct page *page, gfp_t wait)
{
if (!page_has_buffers(page))
@@ -653,362 +506,6 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait)
return try_to_free_buffers(page);
}
-static int ocfs2_is_overwrite(struct ocfs2_super *osb,
- struct inode *inode, loff_t offset)
-{
- int ret = 0;
- u32 v_cpos = 0;
- u32 p_cpos = 0;
- unsigned int num_clusters = 0;
- unsigned int ext_flags = 0;
-
- v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
- ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
- &num_clusters, &ext_flags);
- if (ret < 0) {
- mlog_errno(ret);
- return ret;
- }
-
- if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN))
- return 1;
-
- return 0;
-}
-
-static int ocfs2_direct_IO_zero_extend(struct ocfs2_super *osb,
- struct inode *inode, loff_t offset,
- u64 zero_len, int cluster_align)
-{
- u32 p_cpos = 0;
- u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
- unsigned int num_clusters = 0;
- unsigned int ext_flags = 0;
- int ret = 0;
-
- if (offset <= i_size_read(inode) || cluster_align)
- return 0;
-
- ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
- &ext_flags);
- if (ret < 0) {
- mlog_errno(ret);
- return ret;
- }
-
- if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
- u64 s = i_size_read(inode);
- sector_t sector = ((u64)p_cpos << (osb->s_clustersize_bits - 9)) +
- (do_div(s, osb->s_clustersize) >> 9);
-
- ret = blkdev_issue_zeroout(osb->sb->s_bdev, sector,
- zero_len >> 9, GFP_NOFS, false);
- if (ret < 0)
- mlog_errno(ret);
- }
-
- return ret;
-}
-
-static int ocfs2_direct_IO_extend_no_holes(struct ocfs2_super *osb,
- struct inode *inode, loff_t offset)
-{
- u64 zero_start, zero_len, total_zero_len;
- u32 p_cpos = 0, clusters_to_add;
- u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
- unsigned int num_clusters = 0;
- unsigned int ext_flags = 0;
- u32 size_div, offset_div;
- int ret = 0;
-
- {
- u64 o = offset;
- u64 s = i_size_read(inode);
-
- offset_div = do_div(o, osb->s_clustersize);
- size_div = do_div(s, osb->s_clustersize);
- }
-
- if (offset <= i_size_read(inode))
- return 0;
-
- clusters_to_add = ocfs2_bytes_to_clusters(inode->i_sb, offset) -
- ocfs2_bytes_to_clusters(inode->i_sb, i_size_read(inode));
- total_zero_len = offset - i_size_read(inode);
- if (clusters_to_add)
- total_zero_len -= offset_div;
-
- /* Allocate clusters to fill out holes, and this is only needed
- * when we add more than one clusters. Otherwise the cluster will
- * be allocated during direct IO */
- if (clusters_to_add > 1) {
- ret = ocfs2_extend_allocation(inode,
- OCFS2_I(inode)->ip_clusters,
- clusters_to_add - 1, 0);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
- }
-
- while (total_zero_len) {
- ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
- &ext_flags);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
-
- zero_start = ocfs2_clusters_to_bytes(osb->sb, p_cpos) +
- size_div;
- zero_len = ocfs2_clusters_to_bytes(osb->sb, num_clusters) -
- size_div;
- zero_len = min(total_zero_len, zero_len);
-
- if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
- ret = blkdev_issue_zeroout(osb->sb->s_bdev,
- zero_start >> 9, zero_len >> 9,
- GFP_NOFS, false);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
- }
-
- total_zero_len -= zero_len;
- v_cpos += ocfs2_bytes_to_clusters(osb->sb, zero_len + size_div);
-
- /* Only at first iteration can be cluster not aligned.
- * So set size_div to 0 for the rest */
- size_div = 0;
- }
-
-out:
- return ret;
-}
-
-static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
- struct iov_iter *iter,
- loff_t offset)
-{
- ssize_t ret = 0;
- ssize_t written = 0;
- bool orphaned = false;
- int is_overwrite = 0;
- struct file *file = iocb->ki_filp;
- struct inode *inode = file_inode(file)->i_mapping->host;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct buffer_head *di_bh = NULL;
- size_t count = iter->count;
- journal_t *journal = osb->journal->j_journal;
- u64 zero_len_head, zero_len_tail;
- int cluster_align_head, cluster_align_tail;
- loff_t final_size = offset + count;
- int append_write = offset >= i_size_read(inode) ? 1 : 0;
- unsigned int num_clusters = 0;
- unsigned int ext_flags = 0;
-
- {
- u64 o = offset;
- u64 s = i_size_read(inode);
-
- zero_len_head = do_div(o, 1 << osb->s_clustersize_bits);
- cluster_align_head = !zero_len_head;
-
- zero_len_tail = osb->s_clustersize -
- do_div(s, osb->s_clustersize);
- if ((offset - i_size_read(inode)) < zero_len_tail)
- zero_len_tail = offset - i_size_read(inode);
- cluster_align_tail = !zero_len_tail;
- }
-
- /*
- * when final_size > inode->i_size, inode->i_size will be
- * updated after direct write, so add the inode to orphan
- * dir first.
- */
- if (final_size > i_size_read(inode)) {
- ret = ocfs2_add_inode_to_orphan(osb, inode);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
- orphaned = true;
- }
-
- if (append_write) {
- ret = ocfs2_inode_lock(inode, NULL, 1);
- if (ret < 0) {
- mlog_errno(ret);
- goto clean_orphan;
- }
-
- /* zeroing out the previously allocated cluster tail
- * that but not zeroed */
- if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
- down_read(&OCFS2_I(inode)->ip_alloc_sem);
- ret = ocfs2_direct_IO_zero_extend(osb, inode, offset,
- zero_len_tail, cluster_align_tail);
- up_read(&OCFS2_I(inode)->ip_alloc_sem);
- } else {
- down_write(&OCFS2_I(inode)->ip_alloc_sem);
- ret = ocfs2_direct_IO_extend_no_holes(osb, inode,
- offset);
- up_write(&OCFS2_I(inode)->ip_alloc_sem);
- }
- if (ret < 0) {
- mlog_errno(ret);
- ocfs2_inode_unlock(inode, 1);
- goto clean_orphan;
- }
-
- is_overwrite = ocfs2_is_overwrite(osb, inode, offset);
- if (is_overwrite < 0) {
- mlog_errno(is_overwrite);
- ret = is_overwrite;
- ocfs2_inode_unlock(inode, 1);
- goto clean_orphan;
- }
-
- ocfs2_inode_unlock(inode, 1);
- }
-
- written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
- offset, ocfs2_direct_IO_get_blocks,
- ocfs2_dio_end_io, NULL, 0);
- /* overwrite aio may return -EIOCBQUEUED, and it is not an error */
- if ((written < 0) && (written != -EIOCBQUEUED)) {
- loff_t i_size = i_size_read(inode);
-
- if (offset + count > i_size) {
- ret = ocfs2_inode_lock(inode, &di_bh, 1);
- if (ret < 0) {
- mlog_errno(ret);
- goto clean_orphan;
- }
-
- if (i_size == i_size_read(inode)) {
- ret = ocfs2_truncate_file(inode, di_bh,
- i_size);
- if (ret < 0) {
- if (ret != -ENOSPC)
- mlog_errno(ret);
-
- ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
- di_bh = NULL;
- goto clean_orphan;
- }
- }
-
- ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
- di_bh = NULL;
-
- ret = jbd2_journal_force_commit(journal);
- if (ret < 0)
- mlog_errno(ret);
- }
- } else if (written > 0 && append_write && !is_overwrite &&
- !cluster_align_head) {
- /* zeroing out the allocated cluster head */
- u32 p_cpos = 0;
- u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
-
- ret = ocfs2_inode_lock(inode, NULL, 0);
- if (ret < 0) {
- mlog_errno(ret);
- goto clean_orphan;
- }
-
- ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
- &num_clusters, &ext_flags);
- if (ret < 0) {
- mlog_errno(ret);
- ocfs2_inode_unlock(inode, 0);
- goto clean_orphan;
- }
-
- BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN));
-
- ret = blkdev_issue_zeroout(osb->sb->s_bdev,
- (u64)p_cpos << (osb->s_clustersize_bits - 9),
- zero_len_head >> 9, GFP_NOFS, false);
- if (ret < 0)
- mlog_errno(ret);
-
- ocfs2_inode_unlock(inode, 0);
- }
-
-clean_orphan:
- if (orphaned) {
- int tmp_ret;
- int update_isize = written > 0 ? 1 : 0;
- loff_t end = update_isize ? offset + written : 0;
-
- tmp_ret = ocfs2_inode_lock(inode, &di_bh, 1);
- if (tmp_ret < 0) {
- ret = tmp_ret;
- mlog_errno(ret);
- goto out;
- }
-
- tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
- update_isize, end);
- if (tmp_ret < 0) {
- ret = tmp_ret;
- mlog_errno(ret);
- brelse(di_bh);
- goto out;
- }
-
- ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
-
- tmp_ret = jbd2_journal_force_commit(journal);
- if (tmp_ret < 0) {
- ret = tmp_ret;
- mlog_errno(tmp_ret);
- }
- }
-
-out:
- if (ret >= 0)
- ret = written;
- return ret;
-}
-
-static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- loff_t offset)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file_inode(file)->i_mapping->host;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- int full_coherency = !(osb->s_mount_opt &
- OCFS2_MOUNT_COHERENCY_BUFFERED);
-
- /*
- * Fallback to buffered I/O if we see an inode without
- * extents.
- */
- if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
- return 0;
-
- /* Fallback to buffered I/O if we are appending and
- * concurrent O_DIRECT writes are allowed.
- */
- if (i_size_read(inode) <= offset && !full_coherency)
- return 0;
-
- if (iov_iter_rw(iter) == READ)
- return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
- iter, offset,
- ocfs2_direct_IO_get_blocks,
- ocfs2_dio_end_io, NULL, 0);
- else
- return ocfs2_direct_IO_write(iocb, iter, offset);
-}
-
static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
u32 cpos,
unsigned int *start,
@@ -1195,6 +692,13 @@ next_bh:
#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
+struct ocfs2_unwritten_extent {
+ struct list_head ue_node;
+ struct list_head ue_ip_node;
+ u32 ue_cpos;
+ u32 ue_phys;
+};
+
/*
* Describe the state of a single cluster to be written to.
*/
@@ -1206,7 +710,7 @@ struct ocfs2_write_cluster_desc {
* filled.
*/
unsigned c_new;
- unsigned c_unwritten;
+ unsigned c_clear_unwritten;
unsigned c_needs_zero;
};
@@ -1218,6 +722,9 @@ struct ocfs2_write_ctxt {
/* First cluster allocated in a nonsparse extend */
u32 w_first_new_cpos;
+ /* Type of caller. Must be one of buffer, mmap, direct. */
+ ocfs2_write_type_t w_type;
+
struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
/*
@@ -1266,6 +773,8 @@ struct ocfs2_write_ctxt {
struct buffer_head *w_di_bh;
struct ocfs2_cached_dealloc_ctxt w_dealloc;
+
+ struct list_head w_unwritten_list;
};
void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
@@ -1304,8 +813,25 @@ static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc)
ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
}
-static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
+static void ocfs2_free_unwritten_list(struct inode *inode,
+ struct list_head *head)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_unwritten_extent *ue = NULL, *tmp = NULL;
+
+ list_for_each_entry_safe(ue, tmp, head, ue_node) {
+ list_del(&ue->ue_node);
+ spin_lock(&oi->ip_lock);
+ list_del(&ue->ue_ip_node);
+ spin_unlock(&oi->ip_lock);
+ kfree(ue);
+ }
+}
+
+static void ocfs2_free_write_ctxt(struct inode *inode,
+ struct ocfs2_write_ctxt *wc)
{
+ ocfs2_free_unwritten_list(inode, &wc->w_unwritten_list);
ocfs2_unlock_pages(wc);
brelse(wc->w_di_bh);
kfree(wc);
@@ -1313,7 +839,8 @@ static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
struct ocfs2_super *osb, loff_t pos,
- unsigned len, struct buffer_head *di_bh)
+ unsigned len, ocfs2_write_type_t type,
+ struct buffer_head *di_bh)
{
u32 cend;
struct ocfs2_write_ctxt *wc;
@@ -1328,6 +855,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
wc->w_clen = cend - wc->w_cpos + 1;
get_bh(di_bh);
wc->w_di_bh = di_bh;
+ wc->w_type = type;
if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
wc->w_large_pages = 1;
@@ -1335,6 +863,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
wc->w_large_pages = 0;
ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
+ INIT_LIST_HEAD(&wc->w_unwritten_list);
*wcp = wc;
@@ -1395,12 +924,13 @@ static void ocfs2_write_failure(struct inode *inode,
to = user_pos + user_len;
struct page *tmppage;
- ocfs2_zero_new_buffers(wc->w_target_page, from, to);
+ if (wc->w_target_page)
+ ocfs2_zero_new_buffers(wc->w_target_page, from, to);
for(i = 0; i < wc->w_num_pages; i++) {
tmppage = wc->w_pages[i];
- if (page_has_buffers(tmppage)) {
+ if (tmppage && page_has_buffers(tmppage)) {
if (ocfs2_should_order_data(inode))
ocfs2_jbd2_file_inode(wc->w_handle, inode);
@@ -1530,11 +1060,13 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
wc->w_num_pages = 1;
start = target_index;
}
+ end_index = (user_pos + user_len - 1) >> PAGE_CACHE_SHIFT;
for(i = 0; i < wc->w_num_pages; i++) {
index = start + i;
- if (index == target_index && mmap_page) {
+ if (index >= target_index && index <= end_index &&
+ wc->w_type == OCFS2_WRITE_MMAP) {
/*
* ocfs2_pagemkwrite() is a little different
* and wants us to directly use the page
@@ -1553,6 +1085,11 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
page_cache_get(mmap_page);
wc->w_pages[i] = mmap_page;
wc->w_target_locked = true;
+ } else if (index >= target_index && index <= end_index &&
+ wc->w_type == OCFS2_WRITE_DIRECT) {
+ /* Direct write has no mapping page. */
+ wc->w_pages[i] = NULL;
+ continue;
} else {
wc->w_pages[i] = find_or_create_page(mapping, index,
GFP_NOFS);
@@ -1577,19 +1114,20 @@ out:
* Prepare a single cluster for write one cluster into the file.
*/
static int ocfs2_write_cluster(struct address_space *mapping,
- u32 phys, unsigned int unwritten,
+ u32 *phys, unsigned int new,
+ unsigned int clear_unwritten,
unsigned int should_zero,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_write_ctxt *wc, u32 cpos,
loff_t user_pos, unsigned user_len)
{
- int ret, i, new;
- u64 v_blkno, p_blkno;
+ int ret, i;
+ u64 p_blkno;
struct inode *inode = mapping->host;
struct ocfs2_extent_tree et;
+ int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
- new = phys == 0 ? 1 : 0;
if (new) {
u32 tmp_pos;
@@ -1599,9 +1137,9 @@ static int ocfs2_write_cluster(struct address_space *mapping,
*/
tmp_pos = cpos;
ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode,
- &tmp_pos, 1, 0, wc->w_di_bh,
- wc->w_handle, data_ac,
- meta_ac, NULL);
+ &tmp_pos, 1, !clear_unwritten,
+ wc->w_di_bh, wc->w_handle,
+ data_ac, meta_ac, NULL);
/*
* This shouldn't happen because we must have already
* calculated the correct meta data allocation required. The
@@ -1618,11 +1156,11 @@ static int ocfs2_write_cluster(struct address_space *mapping,
mlog_errno(ret);
goto out;
}
- } else if (unwritten) {
+ } else if (clear_unwritten) {
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
wc->w_di_bh);
ret = ocfs2_mark_extent_written(inode, &et,
- wc->w_handle, cpos, 1, phys,
+ wc->w_handle, cpos, 1, *phys,
meta_ac, &wc->w_dealloc);
if (ret < 0) {
mlog_errno(ret);
@@ -1630,30 +1168,33 @@ static int ocfs2_write_cluster(struct address_space *mapping,
}
}
- if (should_zero)
- v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
- else
- v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
-
/*
* The only reason this should fail is due to an inability to
* find the extent added.
*/
- ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
- NULL);
+ ret = ocfs2_get_clusters(inode, cpos, phys, NULL, NULL);
if (ret < 0) {
mlog(ML_ERROR, "Get physical blkno failed for inode %llu, "
- "at logical block %llu",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (unsigned long long)v_blkno);
+ "at logical cluster %u",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
goto out;
}
- BUG_ON(p_blkno == 0);
+ BUG_ON(*phys == 0);
+
+ p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *phys);
+ if (!should_zero)
+ p_blkno += (user_pos >> inode->i_sb->s_blocksize_bits) & (u64)(bpc - 1);
for(i = 0; i < wc->w_num_pages; i++) {
int tmpret;
+ /* This is the direct io target page. */
+ if (wc->w_pages[i] == NULL) {
+ p_blkno++;
+ continue;
+ }
+
tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
wc->w_pages[i], cpos,
user_pos, user_len,
@@ -1700,8 +1241,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
if ((cluster_off + local_len) > osb->s_clustersize)
local_len = osb->s_clustersize - cluster_off;
- ret = ocfs2_write_cluster(mapping, desc->c_phys,
- desc->c_unwritten,
+ ret = ocfs2_write_cluster(mapping, &desc->c_phys,
+ desc->c_new,
+ desc->c_clear_unwritten,
desc->c_needs_zero,
data_ac, meta_ac,
wc, desc->c_cpos, pos, local_len);
@@ -1772,6 +1314,66 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
}
/*
+ * Check if this extent is marked UNWRITTEN by direct io. If so, we need not to
+ * do the zero work. And should not to clear UNWRITTEN since it will be cleared
+ * by the direct io procedure.
+ * If this is a new extent that allocated by direct io, we should mark it in
+ * the ip_unwritten_list.
+ */
+static int ocfs2_unwritten_check(struct inode *inode,
+ struct ocfs2_write_ctxt *wc,
+ struct ocfs2_write_cluster_desc *desc)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_unwritten_extent *ue = NULL, *new = NULL;
+ int ret = 0;
+
+ if (!desc->c_needs_zero)
+ return 0;
+
+retry:
+ spin_lock(&oi->ip_lock);
+ /* Needs not to zero no metter buffer or direct. The one who is zero
+ * the cluster is doing zero. And he will clear unwritten after all
+ * cluster io finished. */
+ list_for_each_entry(ue, &oi->ip_unwritten_list, ue_ip_node) {
+ if (desc->c_cpos == ue->ue_cpos) {
+ BUG_ON(desc->c_new);
+ desc->c_needs_zero = 0;
+ desc->c_clear_unwritten = 0;
+ goto unlock;
+ }
+ }
+
+ if (wc->w_type != OCFS2_WRITE_DIRECT)
+ goto unlock;
+
+ if (new == NULL) {
+ spin_unlock(&oi->ip_lock);
+ new = kmalloc(sizeof(struct ocfs2_unwritten_extent),
+ GFP_NOFS);
+ if (new == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ goto retry;
+ }
+ /* This direct write will doing zero. */
+ new->ue_cpos = desc->c_cpos;
+ new->ue_phys = desc->c_phys;
+ desc->c_clear_unwritten = 0;
+ list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
+ list_add_tail(&new->ue_node, &wc->w_unwritten_list);
+ new = NULL;
+unlock:
+ spin_unlock(&oi->ip_lock);
+out:
+ if (new)
+ kfree(new);
+ return ret;
+}
+
+/*
* Populate each single-cluster write descriptor in the write context
* with information about the i/o to be done.
*
@@ -1846,14 +1448,21 @@ static int ocfs2_populate_write_desc(struct inode *inode,
if (phys == 0) {
desc->c_new = 1;
desc->c_needs_zero = 1;
+ desc->c_clear_unwritten = 1;
*clusters_to_alloc = *clusters_to_alloc + 1;
}
if (ext_flags & OCFS2_EXT_UNWRITTEN) {
- desc->c_unwritten = 1;
+ desc->c_clear_unwritten = 1;
desc->c_needs_zero = 1;
}
+ ret = ocfs2_unwritten_check(inode, wc, desc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
num_clusters--;
}
@@ -2016,8 +1625,10 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode,
if (ret)
mlog_errno(ret);
- wc->w_first_new_cpos =
- ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
+ /* There is no wc if this is call from direct. */
+ if (wc)
+ wc->w_first_new_cpos =
+ ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
return ret;
}
@@ -2071,9 +1682,8 @@ out:
return ret;
}
-int ocfs2_write_begin_nolock(struct file *filp,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+int ocfs2_write_begin_nolock(struct address_space *mapping,
+ loff_t pos, unsigned len, ocfs2_write_type_t type,
struct page **pagep, void **fsdata,
struct buffer_head *di_bh, struct page *mmap_page)
{
@@ -2090,7 +1700,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
int try_free = 1, ret1;
try_again:
- ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
+ ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, type, di_bh);
if (ret) {
mlog_errno(ret);
return ret;
@@ -2109,14 +1719,17 @@ try_again:
}
}
- if (ocfs2_sparse_alloc(osb))
- ret = ocfs2_zero_tail(inode, di_bh, pos);
- else
- ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len,
- wc);
- if (ret) {
- mlog_errno(ret);
- goto out;
+ /* Direct io change i_size late, should not zero tail here. */
+ if (type != OCFS2_WRITE_DIRECT) {
+ if (ocfs2_sparse_alloc(osb))
+ ret = ocfs2_zero_tail(inode, di_bh, pos);
+ else
+ ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
+ len, wc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
}
ret = ocfs2_check_range_for_refcount(inode, pos, len);
@@ -2147,7 +1760,7 @@ try_again:
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(long long)i_size_read(inode),
le32_to_cpu(di->i_clusters),
- pos, len, flags, mmap_page,
+ pos, len, type, mmap_page,
clusters_to_alloc, extents_to_split);
/*
@@ -2177,17 +1790,17 @@ try_again:
credits = ocfs2_calc_extend_credits(inode->i_sb,
&di->id2.i_list);
-
- }
+ } else if (type == OCFS2_WRITE_DIRECT)
+ /* direct write needs not to start trans if no extents alloc. */
+ goto success;
/*
* We have to zero sparse allocated clusters, unwritten extent clusters,
* and non-sparse clusters we just extended. For non-sparse writes,
* we know zeros will only be needed in the first and/or last cluster.
*/
- if (clusters_to_alloc || extents_to_split ||
- (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
- wc->w_desc[wc->w_clen - 1].c_needs_zero)))
+ if (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
+ wc->w_desc[wc->w_clen - 1].c_needs_zero))
cluster_of_pages = 1;
else
cluster_of_pages = 0;
@@ -2254,7 +1867,8 @@ try_again:
ocfs2_free_alloc_context(meta_ac);
success:
- *pagep = wc->w_target_page;
+ if (pagep)
+ *pagep = wc->w_target_page;
*fsdata = wc;
return 0;
out_quota:
@@ -2265,7 +1879,7 @@ out_commit:
ocfs2_commit_trans(osb, handle);
out:
- ocfs2_free_write_ctxt(wc);
+ ocfs2_free_write_ctxt(inode, wc);
if (data_ac) {
ocfs2_free_alloc_context(data_ac);
@@ -2317,8 +1931,8 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
*/
down_write(&OCFS2_I(inode)->ip_alloc_sem);
- ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep,
- fsdata, di_bh, NULL);
+ ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_BUFFER,
+ pagep, fsdata, di_bh, NULL);
if (ret) {
mlog_errno(ret);
goto out_fail;
@@ -2375,12 +1989,16 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
handle_t *handle = wc->w_handle;
struct page *tmppage;
- ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret) {
- copied = ret;
- mlog_errno(ret);
- goto out;
+ BUG_ON(!list_empty(&wc->w_unwritten_list));
+
+ if (handle) {
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
+ wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ copied = ret;
+ mlog_errno(ret);
+ goto out;
+ }
}
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
@@ -2388,18 +2006,23 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
goto out_write_size;
}
- if (unlikely(copied < len)) {
+ if (unlikely(copied < len) && wc->w_target_page) {
if (!PageUptodate(wc->w_target_page))
copied = 0;
ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
start+len);
}
- flush_dcache_page(wc->w_target_page);
+ if (wc->w_target_page)
+ flush_dcache_page(wc->w_target_page);
for(i = 0; i < wc->w_num_pages; i++) {
tmppage = wc->w_pages[i];
+ /* This is the direct io target page. */
+ if (tmppage == NULL)
+ continue;
+
if (tmppage == wc->w_target_page) {
from = wc->w_target_from;
to = wc->w_target_to;
@@ -2418,25 +2041,29 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
}
if (page_has_buffers(tmppage)) {
- if (ocfs2_should_order_data(inode))
- ocfs2_jbd2_file_inode(wc->w_handle, inode);
+ if (handle && ocfs2_should_order_data(inode))
+ ocfs2_jbd2_file_inode(handle, inode);
block_commit_write(tmppage, from, to);
}
}
out_write_size:
- pos += copied;
- if (pos > i_size_read(inode)) {
- i_size_write(inode, pos);
- mark_inode_dirty(inode);
- }
- inode->i_blocks = ocfs2_inode_sector_count(inode);
- di->i_size = cpu_to_le64((u64)i_size_read(inode));
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
- di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
- ocfs2_update_inode_fsync_trans(handle, inode, 1);
- ocfs2_journal_dirty(handle, wc->w_di_bh);
+ /* Direct io do not update i_size here. */
+ if (wc->w_type != OCFS2_WRITE_DIRECT) {
+ pos += copied;
+ if (pos > i_size_read(inode)) {
+ i_size_write(inode, pos);
+ mark_inode_dirty(inode);
+ }
+ inode->i_blocks = ocfs2_inode_sector_count(inode);
+ di->i_size = cpu_to_le64((u64)i_size_read(inode));
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+ di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
+ }
+ if (handle)
+ ocfs2_journal_dirty(handle, wc->w_di_bh);
out:
/* unlock pages before dealloc since it needs acquiring j_trans_barrier
@@ -2446,7 +2073,8 @@ out:
*/
ocfs2_unlock_pages(wc);
- ocfs2_commit_trans(osb, handle);
+ if (handle)
+ ocfs2_commit_trans(osb, handle);
ocfs2_run_deallocs(osb, &wc->w_dealloc);
@@ -2471,6 +2099,356 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
return ret;
}
+struct ocfs2_dio_write_ctxt {
+ struct list_head dw_zero_list;
+ unsigned dw_zero_count;
+ int dw_orphaned;
+ pid_t dw_writer_pid;
+};
+
+static struct ocfs2_dio_write_ctxt *
+ocfs2_dio_alloc_write_ctx(struct buffer_head *bh, int *alloc)
+{
+ struct ocfs2_dio_write_ctxt *dwc = NULL;
+
+ if (bh->b_private)
+ return bh->b_private;
+
+ dwc = kmalloc(sizeof(struct ocfs2_dio_write_ctxt), GFP_NOFS);
+ if (dwc == NULL)
+ return NULL;
+ INIT_LIST_HEAD(&dwc->dw_zero_list);
+ dwc->dw_zero_count = 0;
+ dwc->dw_orphaned = 0;
+ dwc->dw_writer_pid = task_pid_nr(current);
+ bh->b_private = dwc;
+ *alloc = 1;
+
+ return dwc;
+}
+
+static void ocfs2_dio_free_write_ctx(struct inode *inode,
+ struct ocfs2_dio_write_ctxt *dwc)
+{
+ ocfs2_free_unwritten_list(inode, &dwc->dw_zero_list);
+ kfree(dwc);
+}
+
+/*
+ * TODO: Make this into a generic get_blocks function.
+ *
+ * From do_direct_io in direct-io.c:
+ * "So what we do is to permit the ->get_blocks function to populate
+ * bh.b_size with the size of IO which is permitted at this offset and
+ * this i_blkbits."
+ *
+ * This function is called directly from get_more_blocks in direct-io.c.
+ *
+ * called like this: dio->get_blocks(dio->inode, fs_startblk,
+ * fs_count, map_bh, dio->rw == WRITE);
+ */
+static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_write_ctxt *wc;
+ struct ocfs2_write_cluster_desc *desc = NULL;
+ struct ocfs2_dio_write_ctxt *dwc = NULL;
+ struct buffer_head *di_bh = NULL;
+ u64 p_blkno;
+ loff_t pos = iblock << inode->i_sb->s_blocksize_bits;
+ unsigned len, total_len = bh_result->b_size;
+ int ret = 0, first_get_block = 0;
+
+ len = osb->s_clustersize - (pos & (osb->s_clustersize - 1));
+ len = min(total_len, len);
+
+ mlog(0, "get block of %lu at %llu:%u req %u\n",
+ inode->i_ino, pos, len, total_len);
+
+ /*
+ * Because we need to change file size in ocfs2_dio_end_io_write(), or
+ * we may need to add it to orphan dir. So can not fall to fast path
+ * while file size will be changed.
+ */
+ if (pos + total_len <= i_size_read(inode)) {
+ down_read(&oi->ip_alloc_sem);
+ /* This is the fast path for re-write. */
+ ret = ocfs2_get_block(inode, iblock, bh_result, create);
+
+ up_read(&oi->ip_alloc_sem);
+
+ if (buffer_mapped(bh_result) &&
+ !buffer_new(bh_result) &&
+ ret == 0)
+ goto out;
+
+ /* Clear state set by ocfs2_get_block. */
+ bh_result->b_state = 0;
+ }
+
+ dwc = ocfs2_dio_alloc_write_ctx(bh_result, &first_get_block);
+ if (unlikely(dwc == NULL)) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (ocfs2_clusters_for_bytes(inode->i_sb, pos + total_len) >
+ ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)) &&
+ !dwc->dw_orphaned) {
+ /*
+ * when we are going to alloc extents beyond file size, add the
+ * inode to orphan dir, so we can recall those spaces when
+ * system crashed during write.
+ */
+ ret = ocfs2_add_inode_to_orphan(osb, inode);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ dwc->dw_orphaned = 1;
+ }
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ down_write(&oi->ip_alloc_sem);
+
+ if (first_get_block) {
+ if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+ ret = ocfs2_zero_tail(inode, di_bh, pos);
+ else
+ ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
+ total_len, NULL);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto unlock;
+ }
+ }
+
+ ret = ocfs2_write_begin_nolock(inode->i_mapping, pos, len,
+ OCFS2_WRITE_DIRECT, NULL,
+ (void **)&wc, di_bh, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto unlock;
+ }
+
+ desc = &wc->w_desc[0];
+
+ p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, desc->c_phys);
+ BUG_ON(p_blkno == 0);
+ p_blkno += iblock & (u64)(ocfs2_clusters_to_blocks(inode->i_sb, 1) - 1);
+
+ map_bh(bh_result, inode->i_sb, p_blkno);
+ bh_result->b_size = len;
+ if (desc->c_needs_zero)
+ set_buffer_new(bh_result);
+
+ /* May sleep in end_io. It should not happen in a irq context. So defer
+ * it to dio work queue. */
+ set_buffer_defer_completion(bh_result);
+
+ if (!list_empty(&wc->w_unwritten_list)) {
+ struct ocfs2_unwritten_extent *ue = NULL;
+
+ ue = list_first_entry(&wc->w_unwritten_list,
+ struct ocfs2_unwritten_extent,
+ ue_node);
+ BUG_ON(ue->ue_cpos != desc->c_cpos);
+ /* The physical address may be 0, fill it. */
+ ue->ue_phys = desc->c_phys;
+
+ list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list);
+ dwc->dw_zero_count++;
+ }
+
+ ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, NULL, wc);
+ BUG_ON(ret != len);
+ ret = 0;
+unlock:
+ up_write(&oi->ip_alloc_sem);
+ ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+out:
+ if (ret < 0)
+ ret = -EIO;
+ return ret;
+}
+
+static void ocfs2_dio_end_io_write(struct inode *inode,
+ struct ocfs2_dio_write_ctxt *dwc,
+ loff_t offset,
+ ssize_t bytes)
+{
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+ struct ocfs2_extent_tree et;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_unwritten_extent *ue = NULL;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_dinode *di;
+ struct ocfs2_alloc_context *data_ac = NULL;
+ struct ocfs2_alloc_context *meta_ac = NULL;
+ handle_t *handle = NULL;
+ loff_t end = offset + bytes;
+ int ret = 0, credits = 0, locked = 0;
+
+ ocfs2_init_dealloc_ctxt(&dealloc);
+
+ /* We do clear unwritten, delete orphan, change i_size here. If neither
+ * of these happen, we can skip all this. */
+ if (list_empty(&dwc->dw_zero_list) &&
+ end <= i_size_read(inode) &&
+ !dwc->dw_orphaned)
+ goto out;
+
+ /* ocfs2_file_write_iter will get i_mutex, so we need not lock if we
+ * are in that context. */
+ if (dwc->dw_writer_pid != task_pid_nr(current)) {
+ mutex_lock(&inode->i_mutex);
+ locked = 1;
+ }
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ down_write(&oi->ip_alloc_sem);
+
+ /* Delete orphan before acquire i_mutex. */
+ if (dwc->dw_orphaned) {
+ BUG_ON(dwc->dw_writer_pid != task_pid_nr(current));
+
+ end = end > i_size_read(inode) ? end : 0;
+
+ ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
+ !!end, end);
+ if (ret < 0)
+ mlog_errno(ret);
+ }
+
+ di = (struct ocfs2_dinode *)di_bh;
+
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
+
+ ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
+ &data_ac, &meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto unlock;
+ }
+
+ credits = ocfs2_calc_extend_credits(inode->i_sb, &di->id2.i_list);
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto unlock;
+ }
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto commit;
+ }
+
+ list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) {
+ ret = ocfs2_mark_extent_written(inode, &et, handle,
+ ue->ue_cpos, 1,
+ ue->ue_phys,
+ meta_ac, &dealloc);
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+ }
+
+ if (end > i_size_read(inode)) {
+ ret = ocfs2_set_inode_size(handle, inode, di_bh, end);
+ if (ret < 0)
+ mlog_errno(ret);
+ }
+commit:
+ ocfs2_commit_trans(osb, handle);
+unlock:
+ up_write(&oi->ip_alloc_sem);
+ ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+out:
+ if (data_ac)
+ ocfs2_free_alloc_context(data_ac);
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+ ocfs2_run_deallocs(osb, &dealloc);
+ if (locked)
+ mutex_unlock(&inode->i_mutex);
+ ocfs2_dio_free_write_ctx(inode, dwc);
+}
+
+/*
+ * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
+ * particularly interested in the aio/dio case. We use the rw_lock DLM lock
+ * to protect io on one node from truncation on another.
+ */
+static void ocfs2_dio_end_io(struct kiocb *iocb,
+ loff_t offset,
+ ssize_t bytes,
+ void *private)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ int level;
+
+ /* this io's submitter should not have unlocked this before we could */
+ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
+
+ if (private)
+ ocfs2_dio_end_io_write(inode, private, offset, bytes);
+
+ ocfs2_iocb_clear_rw_locked(iocb);
+
+ level = ocfs2_iocb_rw_locked_level(iocb);
+ ocfs2_rw_unlock(inode, level);
+}
+
+static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t offset)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file)->i_mapping->host;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ loff_t end = offset + iter->count;
+ get_block_t *get_block;
+
+ /*
+ * Fallback to buffered I/O if we see an inode without
+ * extents.
+ */
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ return 0;
+
+ /* Fallback to buffered I/O if we do not support append dio. */
+ if (end > i_size_read(inode) && !ocfs2_supports_append_dio(osb))
+ return 0;
+
+ if (iov_iter_rw(iter) == READ)
+ get_block = ocfs2_get_block;
+ else
+ get_block = ocfs2_dio_get_block;
+
+ return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
+ iter, offset, get_block,
+ ocfs2_dio_end_io, NULL, 0);
+}
+
const struct address_space_operations ocfs2_aops = {
.readpage = ocfs2_readpage,
.readpages = ocfs2_readpages,
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 24e496d6bdcd..7fc11896c440 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -47,9 +47,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
-int ocfs2_write_begin_nolock(struct file *filp,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+typedef enum {
+ OCFS2_WRITE_BUFFER = 0,
+ OCFS2_WRITE_DIRECT,
+ OCFS2_WRITE_MMAP,
+} ocfs2_write_type_t;
+
+int ocfs2_write_begin_nolock(struct address_space *mapping,
+ loff_t pos, unsigned len, ocfs2_write_type_t type,
struct page **pagep, void **fsdata,
struct buffer_head *di_bh, struct page *mmap_page);
@@ -88,11 +93,4 @@ enum ocfs2_iocb_lock_bits {
#define ocfs2_iocb_rw_locked_level(iocb) \
test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_set_unaligned_aio(iocb) \
- set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_clear_unaligned_aio(iocb) \
- clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_is_unaligned_aio(iocb) \
- test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
-
#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 709fbbd44c65..a3cc6d2fc896 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1780,8 +1780,8 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
}
++live_threshold;
atomic_set(&reg->hr_steady_iterations, live_threshold);
- /* unsteady_iterations is double the steady_iterations */
- atomic_set(&reg->hr_unsteady_iterations, (live_threshold << 1));
+ /* unsteady_iterations is triple the steady_iterations */
+ atomic_set(&reg->hr_unsteady_iterations, (live_threshold * 3));
hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s",
reg->hr_item.ci_name);
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index e36d63ff1783..f90931335c6b 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -262,6 +262,7 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
struct dlm_lock *lock, int flags, int type)
{
enum dlm_status status;
+ u8 old_owner = res->owner;
mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type,
lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS);
@@ -287,6 +288,19 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
status = DLM_DENIED;
goto bail;
}
+
+ if (lock->ml.type == type && lock->ml.convert_type == LKM_IVMODE) {
+ mlog(0, "last convert request returned DLM_RECOVERING, but "
+ "owner has already queued and sent ast to me. res %.*s, "
+ "(cookie=%u:%llu, type=%d, conv=%d)\n",
+ res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+ lock->ml.type, lock->ml.convert_type);
+ status = DLM_NORMAL;
+ goto bail;
+ }
+
res->state |= DLM_LOCK_RES_IN_PROGRESS;
/* move lock to local convert queue */
/* do not alter lock refcount. switching lists. */
@@ -316,11 +330,19 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
spin_lock(&res->spinlock);
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
lock->convert_pending = 0;
- /* if it failed, move it back to granted queue */
+ /* if it failed, move it back to granted queue.
+ * if master returns DLM_NORMAL and then down before sending ast,
+ * it may have already been moved to granted queue, reset to
+ * DLM_RECOVERING and retry convert */
if (status != DLM_NORMAL) {
if (status != DLM_NOTQUEUED)
dlm_error(status);
dlm_revert_pending_convert(res, lock);
+ } else if ((res->state & DLM_LOCK_RES_RECOVERING) ||
+ (old_owner != res->owner)) {
+ mlog(0, "res %.*s is in recovering or has been recovered.\n",
+ res->lockname.len, res->lockname.name);
+ status = DLM_RECOVERING;
}
bail:
spin_unlock(&res->spinlock);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 9e4f862d20fe..8caf88160942 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2064,7 +2064,6 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
dlm_lock_get(lock);
if (lock->convert_pending) {
/* move converting lock back to granted */
- BUG_ON(i != DLM_CONVERTING_LIST);
mlog(0, "node died with convert pending "
"on %.*s. move back to granted list.\n",
res->lockname.len, res->lockname.name);
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index b5cf27dcb18a..03768bb3aab1 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -638,7 +638,7 @@ static int __init init_dlmfs_fs(void)
dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
sizeof(struct dlmfs_inode_private),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
dlmfs_init_once);
if (!dlmfs_inode_cache) {
status = -ENOMEM;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 0e5b4515f92e..f2a9c71d59c6 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1302,6 +1302,14 @@ int ocfs2_getattr(struct vfsmount *mnt,
}
generic_fillattr(inode, stat);
+ /*
+ * If there is inline data in the inode, the inode will normally not
+ * have data blocks allocated (it may have an external xattr block).
+ * Report at least one sector for such files, so tools like tar, rsync,
+ * others don't incorrectly think the file is completely sparse.
+ */
+ if (unlikely(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
+ stat->blocks += (stat->size + 511)>>9;
/* We set the blksize from the cluster size for performance */
stat->blksize = osb->s_clustersize;
@@ -1373,44 +1381,6 @@ out:
return ret;
}
-/*
- * Will look for holes and unwritten extents in the range starting at
- * pos for count bytes (inclusive).
- */
-static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
- size_t count)
-{
- int ret = 0;
- unsigned int extent_flags;
- u32 cpos, clusters, extent_len, phys_cpos;
- struct super_block *sb = inode->i_sb;
-
- cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
- clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
-
- while (clusters) {
- ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
- &extent_flags);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
-
- if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
- ret = 1;
- break;
- }
-
- if (extent_len > clusters)
- extent_len = clusters;
-
- clusters -= extent_len;
- cpos += extent_len;
- }
-out:
- return ret;
-}
-
static int ocfs2_write_remove_suid(struct inode *inode)
{
int ret;
@@ -2121,18 +2091,12 @@ out:
static int ocfs2_prepare_inode_for_write(struct file *file,
loff_t pos,
- size_t count,
- int appending,
- int *direct_io,
- int *has_refcount)
+ size_t count)
{
int ret = 0, meta_level = 0;
struct dentry *dentry = file->f_path.dentry;
struct inode *inode = d_inode(dentry);
loff_t end;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- int full_coherency = !(osb->s_mount_opt &
- OCFS2_MOUNT_COHERENCY_BUFFERED);
/*
* We start with a read level meta lock and only jump to an ex
@@ -2181,10 +2145,6 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
pos,
count,
&meta_level);
- if (has_refcount)
- *has_refcount = 1;
- if (direct_io)
- *direct_io = 0;
}
if (ret < 0) {
@@ -2192,67 +2152,12 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
goto out_unlock;
}
- /*
- * Skip the O_DIRECT checks if we don't need
- * them.
- */
- if (!direct_io || !(*direct_io))
- break;
-
- /*
- * There's no sane way to do direct writes to an inode
- * with inline data.
- */
- if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
- *direct_io = 0;
- break;
- }
-
- /*
- * Allowing concurrent direct writes means
- * i_size changes wouldn't be synchronized, so
- * one node could wind up truncating another
- * nodes writes.
- */
- if (end > i_size_read(inode) && !full_coherency) {
- *direct_io = 0;
- break;
- }
-
- /*
- * Fallback to old way if the feature bit is not set.
- */
- if (end > i_size_read(inode) &&
- !ocfs2_supports_append_dio(osb)) {
- *direct_io = 0;
- break;
- }
-
- /*
- * We don't fill holes during direct io, so
- * check for them here. If any are found, the
- * caller will have to retake some cluster
- * locks and initiate the io as buffered.
- */
- ret = ocfs2_check_range_for_holes(inode, pos, count);
- if (ret == 1) {
- /*
- * Fallback to old way if the feature bit is not set.
- * Otherwise try dio first and then complete the rest
- * request through buffer io.
- */
- if (!ocfs2_supports_append_dio(osb))
- *direct_io = 0;
- ret = 0;
- } else if (ret < 0)
- mlog_errno(ret);
break;
}
out_unlock:
trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
- pos, appending, count,
- direct_io, has_refcount);
+ pos, count);
if (meta_level >= 0)
ocfs2_inode_unlock(inode, meta_level);
@@ -2264,18 +2169,16 @@ out:
static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
struct iov_iter *from)
{
- int direct_io, appending, rw_level;
- int can_do_direct, has_refcount = 0;
+ int direct_io, rw_level;
ssize_t written = 0;
ssize_t ret;
- size_t count = iov_iter_count(from), orig_count;
+ size_t count = iov_iter_count(from);
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
int full_coherency = !(osb->s_mount_opt &
OCFS2_MOUNT_COHERENCY_BUFFERED);
- int unaligned_dio = 0;
- int dropped_dio = 0;
+ void *saved_ki_complete = NULL;
int append_write = ((iocb->ki_pos + count) >=
i_size_read(inode) ? 1 : 0);
@@ -2288,12 +2191,10 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
if (count == 0)
return 0;
- appending = iocb->ki_flags & IOCB_APPEND ? 1 : 0;
direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
mutex_lock(&inode->i_mutex);
-relock:
/*
* Concurrent O_DIRECT writes are allowed with
* mount_option "coherency=buffered".
@@ -2326,7 +2227,6 @@ relock:
ocfs2_inode_unlock(inode, 1);
}
- orig_count = iov_iter_count(from);
ret = generic_write_checks(iocb, from);
if (ret <= 0) {
if (ret)
@@ -2335,41 +2235,18 @@ relock:
}
count = ret;
- can_do_direct = direct_io;
- ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, appending,
- &can_do_direct, &has_refcount);
+ ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
- if (direct_io && !is_sync_kiocb(iocb))
- unaligned_dio = ocfs2_is_io_unaligned(inode, count, iocb->ki_pos);
-
- /*
- * We can't complete the direct I/O as requested, fall back to
- * buffered I/O.
- */
- if (direct_io && !can_do_direct) {
- ocfs2_rw_unlock(inode, rw_level);
-
- rw_level = -1;
-
- direct_io = 0;
- iocb->ki_flags &= ~IOCB_DIRECT;
- iov_iter_reexpand(from, orig_count);
- dropped_dio = 1;
- goto relock;
- }
-
- if (unaligned_dio) {
+ if (direct_io && !is_sync_kiocb(iocb) &&
+ ocfs2_is_io_unaligned(inode, count, iocb->ki_pos)) {
/*
- * Wait on previous unaligned aio to complete before
- * proceeding.
+ * Make it a sync io if it's an unaligned aio.
*/
- mutex_lock(&OCFS2_I(inode)->ip_unaligned_aio);
- /* Mark the iocb as needing an unlock in ocfs2_dio_end_io */
- ocfs2_iocb_set_unaligned_aio(iocb);
+ saved_ki_complete = xchg(&iocb->ki_complete, NULL);
}
/* communicate with ocfs2_dio_end_io */
@@ -2390,14 +2267,13 @@ relock:
*/
if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
rw_level = -1;
- unaligned_dio = 0;
}
if (unlikely(written <= 0))
- goto no_sync;
+ goto out;
if (((file->f_flags & O_DSYNC) && !direct_io) ||
- IS_SYNC(inode) || dropped_dio) {
+ IS_SYNC(inode)) {
ret = filemap_fdatawrite_range(file->f_mapping,
iocb->ki_pos - written,
iocb->ki_pos - 1);
@@ -2416,13 +2292,10 @@ relock:
iocb->ki_pos - 1);
}
-no_sync:
- if (unaligned_dio && ocfs2_iocb_is_unaligned_aio(iocb)) {
- ocfs2_iocb_clear_unaligned_aio(iocb);
- mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
- }
-
out:
+ if (saved_ki_complete)
+ xchg(&iocb->ki_complete, saved_ki_complete);
+
if (rw_level != -1)
ocfs2_rw_unlock(inode, rw_level);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 8f87e05ee25d..0fd9ebdd3ed8 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1125,6 +1125,9 @@ static void ocfs2_clear_inode(struct inode *inode)
mlog_bug_on_msg(!list_empty(&oi->ip_io_markers),
"Clear inode of %llu, inode has io markers\n",
(unsigned long long)oi->ip_blkno);
+ mlog_bug_on_msg(!list_empty(&oi->ip_unwritten_list),
+ "Clear inode of %llu, inode has unwritten extents\n",
+ (unsigned long long)oi->ip_blkno);
ocfs2_extent_map_trunc(inode, 0);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index aac8b86f312e..6e0633eef36f 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -43,9 +43,6 @@ struct ocfs2_inode_info
/* protects extended attribute changes on this inode */
struct rw_semaphore ip_xattr_sem;
- /* Number of outstanding AIO's which are not page aligned */
- struct mutex ip_unaligned_aio;
-
/* These fields are protected by ip_lock */
spinlock_t ip_lock;
u32 ip_open_count;
@@ -57,6 +54,9 @@ struct ocfs2_inode_info
u32 ip_flags; /* see below */
u32 ip_attr; /* inode attributes */
+ /* Record unwritten extents during direct io. */
+ struct list_head ip_unwritten_list;
+
/* protected by recovery_lock. */
struct inode *ip_next_orphan;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 13534f4fe5b5..af0c6b36cc66 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -231,7 +231,7 @@ void ocfs2_recovery_exit(struct ocfs2_super *osb)
/* At this point, we know that no more recovery threads can be
* launched, so wait for any recovery completion work to
* complete. */
- flush_workqueue(ocfs2_wq);
+ flush_workqueue(osb->ocfs2_wq);
/*
* Now that recovery is shut down, and the osb is about to be
@@ -1327,7 +1327,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
spin_lock(&journal->j_lock);
list_add_tail(&item->lri_list, &journal->j_la_cleanups);
- queue_work(ocfs2_wq, &journal->j_recovery_work);
+ queue_work(journal->j_osb->ocfs2_wq, &journal->j_recovery_work);
spin_unlock(&journal->j_lock);
}
@@ -1972,7 +1972,7 @@ static void ocfs2_orphan_scan_work(struct work_struct *work)
mutex_lock(&os->os_lock);
ocfs2_queue_orphan_scan(osb);
if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE)
- queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
+ queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work,
ocfs2_orphan_scan_timeout());
mutex_unlock(&os->os_lock);
}
@@ -2012,7 +2012,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
else {
atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE);
- queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
+ queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work,
ocfs2_orphan_scan_timeout());
}
}
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 0a4457fb0711..0fd767bd9b2f 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -281,7 +281,7 @@ bail:
return ret;
}
-int ocfs2_load_local_alloc(struct ocfs2_super *osb)
+int ocfs2_load_local_alloc(struct ocfs2_super *osb, int check, int *recovery)
{
int status = 0;
struct ocfs2_dinode *alloc = NULL;
@@ -345,21 +345,26 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
if (num_used
|| alloc->id1.bitmap1.i_used
|| alloc->id1.bitmap1.i_total
- || la->la_bm_off)
+ || la->la_bm_off) {
mlog(ML_ERROR, "Local alloc hasn't been recovered!\n"
"found = %u, set = %u, taken = %u, off = %u\n",
num_used, le32_to_cpu(alloc->id1.bitmap1.i_used),
le32_to_cpu(alloc->id1.bitmap1.i_total),
OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
+ status = -EINVAL;
+ *recovery = 1;
+ goto bail;
+ }
- osb->local_alloc_bh = alloc_bh;
- osb->local_alloc_state = OCFS2_LA_ENABLED;
+ if (!check) {
+ osb->local_alloc_bh = alloc_bh;
+ osb->local_alloc_state = OCFS2_LA_ENABLED;
+ }
bail:
- if (status < 0)
+ if (status < 0 || check)
brelse(alloc_bh);
- if (inode)
- iput(inode);
+ iput(inode);
trace_ocfs2_load_local_alloc(osb->local_alloc_bits);
@@ -387,7 +392,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
struct ocfs2_dinode *alloc = NULL;
cancel_delayed_work(&osb->la_enable_wq);
- flush_workqueue(ocfs2_wq);
+ flush_workqueue(osb->ocfs2_wq);
if (osb->local_alloc_state == OCFS2_LA_UNUSED)
goto out;
@@ -1087,7 +1092,7 @@ static int ocfs2_recalc_la_window(struct ocfs2_super *osb,
} else {
osb->local_alloc_state = OCFS2_LA_DISABLED;
}
- queue_delayed_work(ocfs2_wq, &osb->la_enable_wq,
+ queue_delayed_work(osb->ocfs2_wq, &osb->la_enable_wq,
OCFS2_LA_ENABLE_INTERVAL);
goto out_unlock;
}
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index 44a7d1fb2dec..a9138414842a 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -26,7 +26,7 @@
#ifndef OCFS2_LOCALALLOC_H
#define OCFS2_LOCALALLOC_H
-int ocfs2_load_local_alloc(struct ocfs2_super *osb);
+int ocfs2_load_local_alloc(struct ocfs2_super *osb, int check, int *recovery);
void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb);
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 9581d190f6e1..a88707a0f4da 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -104,8 +104,8 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
if (page->index == last_index)
len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
- ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page,
- &fsdata, di_bh, page);
+ ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP,
+ &locked_page, &fsdata, di_bh, page);
if (ret) {
if (ret != -ENOSPC)
mlog_errno(ret);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 7a0126267847..6cf6538a0651 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -464,6 +464,14 @@ struct ocfs2_super
struct ocfs2_refcount_tree *osb_ref_tree_lru;
struct mutex system_file_mutex;
+
+ /*
+ * OCFS2 needs to schedule several different types of work which
+ * require cluster locking, disk I/O, recovery waits, etc. Since these
+ * types of work tend to be heavy we avoid using the kernel events
+ * workqueue and schedule on our own.
+ */
+ struct workqueue_struct *ocfs2_wq;
};
#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 6cb019b7c6a8..09d0c89a9daf 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -1450,28 +1450,20 @@ DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range);
TRACE_EVENT(ocfs2_prepare_inode_for_write,
TP_PROTO(unsigned long long ino, unsigned long long saved_pos,
- int appending, unsigned long count,
- int *direct_io, int *has_refcount),
- TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount),
+ unsigned long count),
+ TP_ARGS(ino, saved_pos, count),
TP_STRUCT__entry(
__field(unsigned long long, ino)
__field(unsigned long long, saved_pos)
- __field(int, appending)
__field(unsigned long, count)
- __field(int, direct_io)
- __field(int, has_refcount)
),
TP_fast_assign(
__entry->ino = ino;
__entry->saved_pos = saved_pos;
- __entry->appending = appending;
__entry->count = count;
- __entry->direct_io = direct_io ? *direct_io : -1;
- __entry->has_refcount = has_refcount ? *has_refcount : -1;
),
- TP_printk("%llu %llu %d %lu %d %d", __entry->ino,
- __entry->saved_pos, __entry->appending, __entry->count,
- __entry->direct_io, __entry->has_refcount)
+ TP_printk("%llu %llu %lu", __entry->ino,
+ __entry->saved_pos, __entry->count)
);
DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret);
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index c93d67220887..44df24b62fef 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -726,7 +726,7 @@ static int ocfs2_release_dquot(struct dquot *dquot)
dqgrab(dquot);
/* First entry on list -> queue work */
if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list))
- queue_work(ocfs2_wq, &osb->dquot_drop_work);
+ queue_work(osb->ocfs2_wq, &osb->dquot_drop_work);
goto out;
}
status = ocfs2_lock_global_qf(oinfo, 1);
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index d5da6f624142..10ad87ba02e0 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -187,7 +187,7 @@ static int update_backups(struct inode * inode, u32 clusters, char *data)
for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
- if (cluster > clusters)
+ if (cluster >= clusters)
break;
ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 2de4c8a9340c..b7e2b16dedac 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -79,12 +79,6 @@ static struct kmem_cache *ocfs2_inode_cachep;
struct kmem_cache *ocfs2_dquot_cachep;
struct kmem_cache *ocfs2_qf_chunk_cachep;
-/* OCFS2 needs to schedule several different types of work which
- * require cluster locking, disk I/O, recovery waits, etc. Since these
- * types of work tend to be heavy we avoid using the kernel events
- * workqueue and schedule on our own. */
-struct workqueue_struct *ocfs2_wq = NULL;
-
static struct dentry *ocfs2_debugfs_root;
MODULE_AUTHOR("Oracle");
@@ -1280,6 +1274,8 @@ static int ocfs2_parse_options(struct super_block *sb,
int status, user_stack = 0;
char *p;
u32 tmp;
+ int token, option;
+ substring_t args[MAX_OPT_ARGS];
trace_ocfs2_parse_options(is_remount, options ? options : "(none)");
@@ -1298,9 +1294,6 @@ static int ocfs2_parse_options(struct super_block *sb,
}
while ((p = strsep(&options, ",")) != NULL) {
- int token, option;
- substring_t args[MAX_OPT_ARGS];
-
if (!*p)
continue;
@@ -1367,7 +1360,6 @@ static int ocfs2_parse_options(struct super_block *sb,
mopt->atime_quantum = option;
break;
case Opt_slot:
- option = 0;
if (match_int(&args[0], &option)) {
status = 0;
goto bail;
@@ -1376,7 +1368,6 @@ static int ocfs2_parse_options(struct super_block *sb,
mopt->slot = (s16)option;
break;
case Opt_commit:
- option = 0;
if (match_int(&args[0], &option)) {
status = 0;
goto bail;
@@ -1388,7 +1379,6 @@ static int ocfs2_parse_options(struct super_block *sb,
mopt->commit_interval = HZ * option;
break;
case Opt_localalloc:
- option = 0;
if (match_int(&args[0], &option)) {
status = 0;
goto bail;
@@ -1612,33 +1602,25 @@ static int __init ocfs2_init(void)
if (status < 0)
goto out2;
- ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
- if (!ocfs2_wq) {
- status = -ENOMEM;
- goto out3;
- }
-
ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
if (!ocfs2_debugfs_root) {
status = -ENOMEM;
mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
- goto out4;
+ goto out3;
}
ocfs2_set_locking_protocol();
status = register_quota_format(&ocfs2_quota_format);
if (status < 0)
- goto out4;
+ goto out3;
status = register_filesystem(&ocfs2_fs_type);
if (!status)
return 0;
unregister_quota_format(&ocfs2_quota_format);
-out4:
- destroy_workqueue(ocfs2_wq);
- debugfs_remove(ocfs2_debugfs_root);
out3:
+ debugfs_remove(ocfs2_debugfs_root);
ocfs2_free_mem_caches();
out2:
exit_ocfs2_uptodate_cache();
@@ -1649,11 +1631,6 @@ out1:
static void __exit ocfs2_exit(void)
{
- if (ocfs2_wq) {
- flush_workqueue(ocfs2_wq);
- destroy_workqueue(ocfs2_wq);
- }
-
unregister_quota_format(&ocfs2_quota_format);
debugfs_remove(ocfs2_debugfs_root);
@@ -1744,8 +1721,8 @@ static void ocfs2_inode_init_once(void *data)
spin_lock_init(&oi->ip_lock);
ocfs2_extent_map_init(&oi->vfs_inode);
INIT_LIST_HEAD(&oi->ip_io_markers);
+ INIT_LIST_HEAD(&oi->ip_unwritten_list);
oi->ip_dir_start_lookup = 0;
- mutex_init(&oi->ip_unaligned_aio);
init_rwsem(&oi->ip_alloc_sem);
init_rwsem(&oi->ip_xattr_sem);
mutex_init(&oi->ip_io_mutex);
@@ -1771,7 +1748,7 @@ static int ocfs2_initialize_mem_caches(void)
sizeof(struct ocfs2_inode_info),
0,
(SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
ocfs2_inode_init_once);
ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache",
sizeof(struct ocfs2_dquot),
@@ -2348,6 +2325,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
}
cleancache_init_shared_fs(sb);
+ osb->ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
+ if (!osb->ocfs2_wq) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ }
+
bail:
return status;
}
@@ -2428,6 +2411,7 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
int status;
int dirty;
int local;
+ int la_dirty = 0, recovery = 0;
struct ocfs2_dinode *local_alloc = NULL; /* only used if we
* recover
* ourselves. */
@@ -2449,6 +2433,16 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
* recover anything. Otherwise, journal_load will do that
* dirty work for us :) */
if (!dirty) {
+ /* It may happen that local alloc is unclean shutdown, but
+ * journal has been marked clean, so check it here and do
+ * recovery if needed */
+ status = ocfs2_load_local_alloc(osb, 1, &recovery);
+ if (recovery) {
+ printk(KERN_NOTICE "ocfs2: local alloc needs recovery "
+ "on device (%s).\n", osb->dev_str);
+ la_dirty = 1;
+ }
+
status = ocfs2_journal_wipe(osb->journal, 0);
if (status < 0) {
mlog_errno(status);
@@ -2477,7 +2471,7 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
JBD2_FEATURE_COMPAT_CHECKSUM, 0,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
- if (dirty) {
+ if (dirty || la_dirty) {
/* recover my local alloc if we didn't unmount cleanly. */
status = ocfs2_begin_local_alloc_recovery(osb,
osb->slot_num,
@@ -2490,13 +2484,13 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
* ourselves as mounted. */
}
- status = ocfs2_load_local_alloc(osb);
+ status = ocfs2_load_local_alloc(osb, 0, &recovery);
if (status < 0) {
mlog_errno(status);
goto finally;
}
- if (dirty) {
+ if (dirty || la_dirty) {
/* Recovery will be completed after we've mounted the
* rest of the volume. */
osb->dirty = 1;
@@ -2535,6 +2529,12 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
{
/* This function assumes that the caller has the main osb resource */
+ /* ocfs2_initializer_super have already created this workqueue */
+ if (osb->ocfs2_wq) {
+ flush_workqueue(osb->ocfs2_wq);
+ destroy_workqueue(osb->ocfs2_wq);
+ }
+
ocfs2_free_slot_info(osb);
kfree(osb->osb_orphan_wipes);
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index b477d0b1c7b6..b023e4f3d740 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -26,8 +26,6 @@
#ifndef OCFS2_SUPER_H
#define OCFS2_SUPER_H
-extern struct workqueue_struct *ocfs2_wq;
-
int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
int node_num);
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 15e4500cda3e..b61b883c8ff8 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -443,7 +443,7 @@ static int __init init_openprom_fs(void)
sizeof(struct op_inode_info),
0,
(SLAB_RECLAIM_ACCOUNT |
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD | SLAB_ACCOUNT),
op_inode_init_once);
if (!op_inode_cachep)
return -ENOMEM;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index bd95b9fdebb0..561557122dea 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -95,7 +95,8 @@ void __init proc_init_inodecache(void)
proc_inode_cachep = kmem_cache_create("proc_inode_cache",
sizeof(struct proc_inode),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_PANIC),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT|
+ SLAB_PANIC),
init_once);
}
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 93484034a03d..b2855eea5405 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -103,9 +103,9 @@ u64 stable_page_flags(struct page *page)
* pseudo flags for the well known (anonymous) memory mapped pages
*
* Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the
- * simple test in page_mapped() is not enough.
+ * simple test in page_mapcount() is not enough.
*/
- if (!PageSlab(page) && page_mapped(page))
+ if (!PageSlab(page) && page_mapcount(page))
u |= 1 << KPF_MMAP;
if (PageAnon(page))
u |= 1 << KPF_ANON;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 187b3b5f242e..67aaaad4081e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -14,6 +14,7 @@
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
+#include <linux/shmem_fs.h>
#include <asm/elf.h>
#include <asm/uaccess.h>
@@ -22,9 +23,13 @@
void task_mem(struct seq_file *m, struct mm_struct *mm)
{
- unsigned long data, text, lib, swap, ptes, pmds;
+ unsigned long data, text, lib, swap, ptes, pmds, anon, file, shmem;
unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
+ anon = get_mm_counter(mm, MM_ANONPAGES);
+ file = get_mm_counter(mm, MM_FILEPAGES);
+ shmem = get_mm_counter(mm, MM_SHMEMPAGES);
+
/*
* Note: to minimize their overhead, mm maintains hiwater_vm and
* hiwater_rss only when about to *lower* total_vm or rss. Any
@@ -35,7 +40,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
hiwater_vm = total_vm = mm->total_vm;
if (hiwater_vm < mm->hiwater_vm)
hiwater_vm = mm->hiwater_vm;
- hiwater_rss = total_rss = get_mm_rss(mm);
+ hiwater_rss = total_rss = anon + file + shmem;
if (hiwater_rss < mm->hiwater_rss)
hiwater_rss = mm->hiwater_rss;
@@ -52,6 +57,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
"VmPin:\t%8lu kB\n"
"VmHWM:\t%8lu kB\n"
"VmRSS:\t%8lu kB\n"
+ "RssAnon:\t%8lu kB\n"
+ "RssFile:\t%8lu kB\n"
+ "RssShmem:\t%8lu kB\n"
"VmData:\t%8lu kB\n"
"VmStk:\t%8lu kB\n"
"VmExe:\t%8lu kB\n"
@@ -65,6 +73,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
mm->pinned_vm << (PAGE_SHIFT-10),
hiwater_rss << (PAGE_SHIFT-10),
total_rss << (PAGE_SHIFT-10),
+ anon << (PAGE_SHIFT-10),
+ file << (PAGE_SHIFT-10),
+ shmem << (PAGE_SHIFT-10),
data << (PAGE_SHIFT-10),
mm->stack_vm << (PAGE_SHIFT-10), text, lib,
ptes >> 10,
@@ -82,7 +93,8 @@ unsigned long task_statm(struct mm_struct *mm,
unsigned long *shared, unsigned long *text,
unsigned long *data, unsigned long *resident)
{
- *shared = get_mm_counter(mm, MM_FILEPAGES);
+ *shared = get_mm_counter(mm, MM_FILEPAGES) +
+ get_mm_counter(mm, MM_SHMEMPAGES);
*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
>> PAGE_SHIFT;
*data = mm->total_vm - mm->shared_vm;
@@ -451,12 +463,14 @@ struct mem_size_stats {
unsigned long private_hugetlb;
u64 pss;
u64 swap_pss;
+ bool check_shmem_swap;
};
static void smaps_account(struct mem_size_stats *mss, struct page *page,
- unsigned long size, bool young, bool dirty)
+ bool compound, bool young, bool dirty)
{
- int mapcount;
+ int i, nr = compound ? HPAGE_PMD_NR : 1;
+ unsigned long size = nr * PAGE_SIZE;
if (PageAnon(page))
mss->anonymous += size;
@@ -465,26 +479,53 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
/* Accumulate the size in pages that have been accessed. */
if (young || page_is_young(page) || PageReferenced(page))
mss->referenced += size;
- mapcount = page_mapcount(page);
- if (mapcount >= 2) {
- u64 pss_delta;
- if (dirty || PageDirty(page))
- mss->shared_dirty += size;
- else
- mss->shared_clean += size;
- pss_delta = (u64)size << PSS_SHIFT;
- do_div(pss_delta, mapcount);
- mss->pss += pss_delta;
- } else {
+ /*
+ * page_count(page) == 1 guarantees the page is mapped exactly once.
+ * If any subpage of the compound page mapped with PTE it would elevate
+ * page_count().
+ */
+ if (page_count(page) == 1) {
if (dirty || PageDirty(page))
mss->private_dirty += size;
else
mss->private_clean += size;
mss->pss += (u64)size << PSS_SHIFT;
+ return;
+ }
+
+ for (i = 0; i < nr; i++, page++) {
+ int mapcount = page_mapcount(page);
+
+ if (mapcount >= 2) {
+ if (dirty || PageDirty(page))
+ mss->shared_dirty += PAGE_SIZE;
+ else
+ mss->shared_clean += PAGE_SIZE;
+ mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
+ } else {
+ if (dirty || PageDirty(page))
+ mss->private_dirty += PAGE_SIZE;
+ else
+ mss->private_clean += PAGE_SIZE;
+ mss->pss += PAGE_SIZE << PSS_SHIFT;
+ }
}
}
+#ifdef CONFIG_SHMEM
+static int smaps_pte_hole(unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct mem_size_stats *mss = walk->private;
+
+ mss->swap += shmem_partial_swap_usage(
+ walk->vma->vm_file->f_mapping, addr, end);
+
+ return 0;
+}
+#endif
+
static void smaps_pte_entry(pte_t *pte, unsigned long addr,
struct mm_walk *walk)
{
@@ -512,11 +553,25 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
}
} else if (is_migration_entry(swpent))
page = migration_entry_to_page(swpent);
+ } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
+ && pte_none(*pte))) {
+ page = find_get_entry(vma->vm_file->f_mapping,
+ linear_page_index(vma, addr));
+ if (!page)
+ return;
+
+ if (radix_tree_exceptional_entry(page))
+ mss->swap += PAGE_SIZE;
+ else
+ page_cache_release(page);
+
+ return;
}
if (!page)
return;
- smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte));
+
+ smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte));
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -532,8 +587,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
if (IS_ERR_OR_NULL(page))
return;
mss->anonymous_thp += HPAGE_PMD_SIZE;
- smaps_account(mss, page, HPAGE_PMD_SIZE,
- pmd_young(*pmd), pmd_dirty(*pmd));
+ smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd));
}
#else
static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -549,7 +603,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t *pte;
spinlock_t *ptl;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
smaps_pmd_entry(pmd, addr, walk);
spin_unlock(ptl);
return 0;
@@ -671,6 +725,31 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
};
memset(&mss, 0, sizeof mss);
+
+#ifdef CONFIG_SHMEM
+ if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
+ /*
+ * For shared or readonly shmem mappings we know that all
+ * swapped out pages belong to the shmem object, and we can
+ * obtain the swap value much more efficiently. For private
+ * writable mappings, we might have COW pages that are
+ * not affected by the parent swapped out pages of the shmem
+ * object, so we have to distinguish them during the page walk.
+ * Unless we know that the shmem object (or the part mapped by
+ * our VMA) has no swapped out pages at all.
+ */
+ unsigned long shmem_swapped = shmem_swap_usage(vma);
+
+ if (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
+ !(vma->vm_flags & VM_WRITE)) {
+ mss.swap = shmem_swapped;
+ } else {
+ mss.check_shmem_swap = true;
+ smaps_walk.pte_hole = smaps_pte_hole;
+ }
+ }
+#endif
+
/* mmap_sem is held in m_start */
walk_page_vma(vma, &smaps_walk);
@@ -838,7 +917,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
spinlock_t *ptl;
struct page *page;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
clear_soft_dirty_pmd(vma, addr, pmd);
goto out;
@@ -1112,7 +1191,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
int err = 0;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmdp, vma, &ptl)) {
u64 flags = 0, frame = 0;
pmd_t pmd = *pmdp;
@@ -1444,7 +1523,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
pte_t *orig_pte;
pte_t *pte;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
pte_t huge_pte = *(pte_t *)pmd;
struct page *page;
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index c4bcb778886e..f761acdd5a7a 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -364,7 +364,7 @@ static int init_inodecache(void)
qnx4_inode_cachep = kmem_cache_create("qnx4_inode_cache",
sizeof(struct qnx4_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (qnx4_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 32d2e1a9774c..4f04f00a7e5e 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -624,7 +624,7 @@ static int init_inodecache(void)
qnx6_inode_cachep = kmem_cache_create("qnx6_inode_cache",
sizeof(struct qnx6_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (!qnx6_inode_cachep)
return -ENOMEM;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 4a62fe8cc3bf..05db7473bcb5 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -626,7 +626,8 @@ static int __init init_inodecache(void)
sizeof(struct
reiserfs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|
+ SLAB_ACCOUNT),
init_once);
if (reiserfs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 268733cda397..e1113399a6b4 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -618,8 +618,8 @@ static int __init init_romfs_fs(void)
romfs_inode_cachep =
kmem_cache_create("romfs_i",
sizeof(struct romfs_inode_info), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
- romfs_i_init_once);
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
+ SLAB_ACCOUNT, romfs_i_init_once);
if (!romfs_inode_cachep) {
pr_err("Failed to initialise inode cache\n");
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 5056babe00df..ea59b475663c 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -420,7 +420,8 @@ static int __init init_inodecache(void)
{
squashfs_inode_cachep = kmem_cache_create("squashfs_inode_cache",
sizeof(struct squashfs_inode_info), 0,
- SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, init_once);
+ SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
+ init_once);
return squashfs_inode_cachep ? 0 : -ENOMEM;
}
diff --git a/fs/stat.c b/fs/stat.c
index d4a61d8dc021..bc045c7994e1 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -219,7 +219,7 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat
# define choose_32_64(a,b) b
#endif
-#define valid_dev(x) choose_32_64(old_valid_dev,new_valid_dev)(x)
+#define valid_dev(x) choose_32_64(old_valid_dev(x),true)
#define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x)
#ifndef INIT_STRUCT_STAT_PADDING
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 02fa1dcc5969..463b5ff25c65 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -346,7 +346,7 @@ int __init sysv_init_icache(void)
{
sysv_inode_cachep = kmem_cache_create("sysv_inode_cache",
sizeof(struct sysv_inode_info), 0,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
init_once);
if (!sysv_inode_cachep)
return -ENOMEM;
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 1fd90c079537..a233ba913be4 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2248,8 +2248,8 @@ static int __init ubifs_init(void)
ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab",
sizeof(struct ubifs_inode), 0,
- SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT,
- &inode_slab_ctor);
+ SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT |
+ SLAB_ACCOUNT, &inode_slab_ctor);
if (!ubifs_inode_slab)
return -ENOMEM;
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 81155b9b445b..115136f55191 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -179,7 +179,8 @@ static int __init init_inodecache(void)
udf_inode_cachep = kmem_cache_create("udf_inode_cache",
sizeof(struct udf_inode_info),
0, (SLAB_RECLAIM_ACCOUNT |
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD |
+ SLAB_ACCOUNT),
init_once);
if (!udf_inode_cachep)
return -ENOMEM;
@@ -278,17 +279,12 @@ static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
{
int i;
int nr_groups = bitmap->s_nr_groups;
- int size = sizeof(struct udf_bitmap) + (sizeof(struct buffer_head *) *
- nr_groups);
for (i = 0; i < nr_groups; i++)
if (bitmap->s_block_bitmap[i])
brelse(bitmap->s_block_bitmap[i]);
- if (size <= PAGE_SIZE)
- kfree(bitmap);
- else
- vfree(bitmap);
+ kvfree(bitmap);
}
static void udf_free_partition(struct udf_part_map *map)
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index f6390eec02ca..442fd52ebffe 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1427,7 +1427,7 @@ static int __init init_inodecache(void)
ufs_inode_cachep = kmem_cache_create("ufs_inode_cache",
sizeof(struct ufs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (ufs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/xattr.c b/fs/xattr.c
index 9b932b95d74e..e71ba1f3957f 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -324,7 +324,6 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
{
int error;
void *kvalue = NULL;
- void *vvalue = NULL; /* If non-NULL, we used vmalloc() */
char kname[XATTR_NAME_MAX + 1];
if (flags & ~(XATTR_CREATE|XATTR_REPLACE))
@@ -341,10 +340,9 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
return -E2BIG;
kvalue = kmalloc(size, GFP_KERNEL | __GFP_NOWARN);
if (!kvalue) {
- vvalue = vmalloc(size);
- if (!vvalue)
+ kvalue = vmalloc(size);
+ if (!kvalue)
return -ENOMEM;
- kvalue = vvalue;
}
if (copy_from_user(kvalue, value, size)) {
error = -EFAULT;
@@ -357,10 +355,7 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
error = vfs_setxattr(d, kname, kvalue, size, flags);
out:
- if (vvalue)
- vfree(vvalue);
- else
- kfree(kvalue);
+ kvfree(kvalue);
return error;
}
@@ -428,7 +423,6 @@ getxattr(struct dentry *d, const char __user *name, void __user *value,
{
ssize_t error;
void *kvalue = NULL;
- void *vvalue = NULL;
char kname[XATTR_NAME_MAX + 1];
error = strncpy_from_user(kname, name, sizeof(kname));
@@ -442,10 +436,9 @@ getxattr(struct dentry *d, const char __user *name, void __user *value,
size = XATTR_SIZE_MAX;
kvalue = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
if (!kvalue) {
- vvalue = vmalloc(size);
- if (!vvalue)
+ kvalue = vmalloc(size);
+ if (!kvalue)
return -ENOMEM;
- kvalue = vvalue;
}
}
@@ -461,10 +454,7 @@ getxattr(struct dentry *d, const char __user *name, void __user *value,
than XATTR_SIZE_MAX bytes. Not possible. */
error = -E2BIG;
}
- if (vvalue)
- vfree(vvalue);
- else
- kfree(kvalue);
+ kvfree(kvalue);
return error;
}
@@ -521,17 +511,15 @@ listxattr(struct dentry *d, char __user *list, size_t size)
{
ssize_t error;
char *klist = NULL;
- char *vlist = NULL; /* If non-NULL, we used vmalloc() */
if (size) {
if (size > XATTR_LIST_MAX)
size = XATTR_LIST_MAX;
klist = kmalloc(size, __GFP_NOWARN | GFP_KERNEL);
if (!klist) {
- vlist = vmalloc(size);
- if (!vlist)
+ klist = vmalloc(size);
+ if (!klist)
return -ENOMEM;
- klist = vlist;
}
}
@@ -544,10 +532,7 @@ listxattr(struct dentry *d, char __user *list, size_t size)
than XATTR_LIST_MAX bytes. Not possible. */
error = -E2BIG;
}
- if (vlist)
- vfree(vlist);
- else
- kfree(klist);
+ kvfree(klist);
return error;
}
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index cc6b768fc068..d1c66e465ca5 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -84,6 +84,7 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags)
#define KM_ZONE_HWALIGN SLAB_HWCACHE_ALIGN
#define KM_ZONE_RECLAIM SLAB_RECLAIM_ACCOUNT
#define KM_ZONE_SPREAD SLAB_MEM_SPREAD
+#define KM_ZONE_ACCOUNT SLAB_ACCOUNT
#define kmem_zone kmem_cache
#define kmem_zone_t struct kmem_cache
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 36bd8825bfb0..436b6ef6e4a7 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1714,8 +1714,8 @@ xfs_init_zones(void)
xfs_inode_zone =
kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode",
- KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD,
- xfs_fs_inode_init_once);
+ KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD |
+ KM_ZONE_ACCOUNT, xfs_fs_inode_init_once);
if (!xfs_inode_zone)
goto out_destroy_efi_zone;
diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h
index 4b4b056a6eb0..5148150cc80b 100644
--- a/include/asm-generic/memory_model.h
+++ b/include/asm-generic/memory_model.h
@@ -1,6 +1,8 @@
#ifndef __ASM_MEMORY_MODEL_H
#define __ASM_MEMORY_MODEL_H
+#include <linux/pfn.h>
+
#ifndef __ASSEMBLY__
#if defined(CONFIG_FLATMEM)
@@ -72,7 +74,7 @@
/*
* Convert a physical address to a Page Frame Number and back
*/
-#define __phys_to_pfn(paddr) ((unsigned long)((paddr) >> PAGE_SHIFT))
+#define __phys_to_pfn(paddr) PHYS_PFN(paddr)
#define __pfn_to_phys(pfn) PFN_PHYS(pfn)
#define page_to_pfn __page_to_pfn
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 14b0ff32fb9f..63abda1ac06d 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -207,11 +207,6 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
-#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-extern void pmdp_splitting_flush(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmdp);
-#endif
-
#ifndef pmdp_collapse_flush
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
@@ -619,10 +614,6 @@ static inline int pmd_trans_huge(pmd_t pmd)
{
return 0;
}
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
- return 0;
-}
#ifndef __HAVE_ARCH_PMD_WRITE
static inline int pmd_write(pmd_t pmd)
{
diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index b58fd667f87b..af0254c09424 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -4,6 +4,7 @@
/* References to section boundaries */
#include <linux/compiler.h>
+#include <linux/types.h>
/*
* Usage guidelines:
@@ -63,4 +64,68 @@ static inline int arch_is_kernel_data(unsigned long addr)
}
#endif
+/**
+ * memory_contains - checks if an object is contained within a memory region
+ * @begin: virtual address of the beginning of the memory region
+ * @end: virtual address of the end of the memory region
+ * @virt: virtual address of the memory object
+ * @size: size of the memory object
+ *
+ * Returns: true if the object specified by @virt and @size is entirely
+ * contained within the memory region defined by @begin and @end, false
+ * otherwise.
+ */
+static inline bool memory_contains(void *begin, void *end, void *virt,
+ size_t size)
+{
+ return virt >= begin && virt + size <= end;
+}
+
+/**
+ * memory_intersects - checks if the region occupied by an object intersects
+ * with another memory region
+ * @begin: virtual address of the beginning of the memory regien
+ * @end: virtual address of the end of the memory region
+ * @virt: virtual address of the memory object
+ * @size: size of the memory object
+ *
+ * Returns: true if an object's memory region, specified by @virt and @size,
+ * intersects with the region specified by @begin and @end, false otherwise.
+ */
+static inline bool memory_intersects(void *begin, void *end, void *virt,
+ size_t size)
+{
+ void *vend = virt + size;
+
+ return (virt >= begin && virt < end) || (vend >= begin && vend < end);
+}
+
+/**
+ * init_section_contains - checks if an object is contained within the init
+ * section
+ * @virt: virtual address of the memory object
+ * @size: size of the memory object
+ *
+ * Returns: true if the object specified by @virt and @size is entirely
+ * contained within the init section, false otherwise.
+ */
+static inline bool init_section_contains(void *virt, size_t size)
+{
+ return memory_contains(__init_begin, __init_end, virt, size);
+}
+
+/**
+ * init_section_intersects - checks if the region occupied by an object
+ * intersects with the init section
+ * @virt: virtual address of the memory object
+ * @size: size of the memory object
+ *
+ * Returns: true if an object's memory region, specified by @virt and @size,
+ * intersects with the init section, false otherwise.
+ */
+static inline bool init_section_intersects(void *virt, size_t size)
+{
+ return memory_intersects(__init_begin, __init_end, virt, size);
+}
+
#endif /* _ASM_GENERIC_SECTIONS_H_ */
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 59915ea5373c..fc14275ff34e 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -85,10 +85,14 @@ extern int nr_cpu_ids;
* only one CPU.
*/
-extern const struct cpumask *const cpu_possible_mask;
-extern const struct cpumask *const cpu_online_mask;
-extern const struct cpumask *const cpu_present_mask;
-extern const struct cpumask *const cpu_active_mask;
+extern struct cpumask __cpu_possible_mask;
+extern struct cpumask __cpu_online_mask;
+extern struct cpumask __cpu_present_mask;
+extern struct cpumask __cpu_active_mask;
+#define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask)
+#define cpu_online_mask ((const struct cpumask *)&__cpu_online_mask)
+#define cpu_present_mask ((const struct cpumask *)&__cpu_present_mask)
+#define cpu_active_mask ((const struct cpumask *)&__cpu_active_mask)
#if NR_CPUS > 1
#define num_online_cpus() cpumask_weight(cpu_online_mask)
@@ -716,14 +720,49 @@ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS);
#define for_each_present_cpu(cpu) for_each_cpu((cpu), cpu_present_mask)
/* Wrappers for arch boot code to manipulate normally-constant masks */
-void set_cpu_possible(unsigned int cpu, bool possible);
-void set_cpu_present(unsigned int cpu, bool present);
-void set_cpu_online(unsigned int cpu, bool online);
-void set_cpu_active(unsigned int cpu, bool active);
void init_cpu_present(const struct cpumask *src);
void init_cpu_possible(const struct cpumask *src);
void init_cpu_online(const struct cpumask *src);
+static inline void
+set_cpu_possible(unsigned int cpu, bool possible)
+{
+ if (possible)
+ cpumask_set_cpu(cpu, &__cpu_possible_mask);
+ else
+ cpumask_clear_cpu(cpu, &__cpu_possible_mask);
+}
+
+static inline void
+set_cpu_present(unsigned int cpu, bool present)
+{
+ if (present)
+ cpumask_set_cpu(cpu, &__cpu_present_mask);
+ else
+ cpumask_clear_cpu(cpu, &__cpu_present_mask);
+}
+
+static inline void
+set_cpu_online(unsigned int cpu, bool online)
+{
+ if (online) {
+ cpumask_set_cpu(cpu, &__cpu_online_mask);
+ cpumask_set_cpu(cpu, &__cpu_active_mask);
+ } else {
+ cpumask_clear_cpu(cpu, &__cpu_online_mask);
+ }
+}
+
+static inline void
+set_cpu_active(unsigned int cpu, bool active)
+{
+ if (active)
+ cpumask_set_cpu(cpu, &__cpu_active_mask);
+ else
+ cpumask_clear_cpu(cpu, &__cpu_active_mask);
+}
+
+
/**
* to_cpumask - convert an NR_CPUS bitmap to a struct cpumask *
* @bitmap: the bitmap
diff --git a/include/linux/crc64_ecma.h b/include/linux/crc64_ecma.h
new file mode 100644
index 000000000000..bba7a4d692b3
--- /dev/null
+++ b/include/linux/crc64_ecma.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __CRC64_ECMA_H_
+#define __CRC64_ECMA_H_
+
+#include <linux/types.h>
+
+
+#define CRC64_DEFAULT_INITVAL 0xFFFFFFFFFFFFFFFFULL
+
+
+/*
+ * crc64_ecma_seed - Initializes the CRC64 ECMA seed.
+ */
+u64 crc64_ecma_seed(void);
+
+/*
+ * crc64_ecma - Computes the 64 bit ECMA CRC.
+ *
+ * @pdata: pointer to the data to compute checksum for.
+ * @nbytes: number of bytes in data buffer.
+ * @seed: CRC seed.
+ */
+u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed);
+
+#endif /* __CRC64_ECMA_H_ */
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 533c4408529a..6b7e89f45aa4 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -220,7 +220,10 @@ struct fsnotify_mark {
/* List of marks by group->i_fsnotify_marks. Also reused for queueing
* mark into destroy_list when it's waiting for the end of SRCU period
* before it can be freed. [group->mark_mutex] */
- struct list_head g_list;
+ union {
+ struct list_head g_list;
+ struct rcu_head g_rcu;
+ };
/* Protects inode / mnt pointers, flags, masks */
spinlock_t lock;
/* List of marks for inode / vfsmount [obj_lock] */
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 8942af0813e3..91f74e741aa2 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -30,7 +30,7 @@ struct vm_area_struct;
#define ___GFP_HARDWALL 0x20000u
#define ___GFP_THISNODE 0x40000u
#define ___GFP_ATOMIC 0x80000u
-#define ___GFP_NOACCOUNT 0x100000u
+#define ___GFP_ACCOUNT 0x100000u
#define ___GFP_NOTRACK 0x200000u
#define ___GFP_DIRECT_RECLAIM 0x400000u
#define ___GFP_OTHER_NODE 0x800000u
@@ -73,11 +73,15 @@ struct vm_area_struct;
*
* __GFP_THISNODE forces the allocation to be satisified from the requested
* node with no fallbacks or placement policy enforcements.
+ *
+ * __GFP_ACCOUNT causes the allocation to be accounted to kmemcg (only relevant
+ * to kmem allocations).
*/
#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE)
#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE)
#define __GFP_HARDWALL ((__force gfp_t)___GFP_HARDWALL)
#define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE)
+#define __GFP_ACCOUNT ((__force gfp_t)___GFP_ACCOUNT)
/*
* Watermark modifiers -- controls access to emergency reserves
@@ -104,7 +108,6 @@ struct vm_area_struct;
#define __GFP_HIGH ((__force gfp_t)___GFP_HIGH)
#define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC)
#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC)
-#define __GFP_NOACCOUNT ((__force gfp_t)___GFP_NOACCOUNT)
/*
* Reclaim modifiers
@@ -197,6 +200,9 @@ struct vm_area_struct;
* GFP_KERNEL is typical for kernel-internal allocations. The caller requires
* ZONE_NORMAL or a lower zone for direct access but can direct reclaim.
*
+ * GFP_KERNEL_ACCOUNT is the same as GFP_KERNEL, except the allocation is
+ * accounted to kmemcg.
+ *
* GFP_NOWAIT is for kernel allocations that should not stall for direct
* reclaim, start physical IO or use any filesystem callback.
*
@@ -236,6 +242,7 @@ struct vm_area_struct;
*/
#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS)
+#define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT)
#define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM)
#define GFP_NOIO (__GFP_RECLAIM)
#define GFP_NOFS (__GFP_RECLAIM | __GFP_IO)
@@ -377,10 +384,11 @@ static inline enum zone_type gfp_zone(gfp_t flags)
static inline int gfp_zonelist(gfp_t flags)
{
- if (IS_ENABLED(CONFIG_NUMA) && unlikely(flags & __GFP_THISNODE))
- return 1;
-
- return 0;
+#ifdef CONFIG_NUMA
+ if (unlikely(flags & __GFP_THISNODE))
+ return ZONELIST_NOFALLBACK;
+#endif
+ return ZONELIST_FALLBACK;
}
/*
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index ecb080d6ff42..72cd942edb22 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -25,7 +25,7 @@ extern int zap_huge_pmd(struct mmu_gather *tlb,
extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
unsigned char *vec);
-extern int move_huge_pmd(struct vm_area_struct *vma,
+extern bool move_huge_pmd(struct vm_area_struct *vma,
struct vm_area_struct *new_vma,
unsigned long old_addr,
unsigned long new_addr, unsigned long old_end,
@@ -48,17 +48,6 @@ enum transparent_hugepage_flag {
#endif
};
-enum page_check_address_pmd_flag {
- PAGE_CHECK_ADDRESS_PMD_FLAG,
- PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG,
- PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG,
-};
-extern pmd_t *page_check_address_pmd(struct page *page,
- struct mm_struct *mm,
- unsigned long address,
- enum page_check_address_pmd_flag flag,
- spinlock_t **ptl);
-
#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
@@ -95,30 +84,27 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
#endif /* CONFIG_DEBUG_VM */
extern unsigned long transparent_hugepage_flags;
-extern int split_huge_page_to_list(struct page *page, struct list_head *list);
+
+extern void prep_transhuge_page(struct page *page);
+extern void free_transhuge_page(struct page *page);
+
+int split_huge_page_to_list(struct page *page, struct list_head *list);
static inline int split_huge_page(struct page *page)
{
return split_huge_page_to_list(page, NULL);
}
-extern void __split_huge_page_pmd(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd);
-#define split_huge_page_pmd(__vma, __address, __pmd) \
+void deferred_split_huge_page(struct page *page);
+
+void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long address);
+
+#define split_huge_pmd(__vma, __pmd, __address) \
do { \
pmd_t *____pmd = (__pmd); \
- if (unlikely(pmd_trans_huge(*____pmd))) \
- __split_huge_page_pmd(__vma, __address, \
- ____pmd); \
+ if (pmd_trans_huge(*____pmd)) \
+ __split_huge_pmd(__vma, __pmd, __address); \
} while (0)
-#define wait_split_huge_page(__anon_vma, __pmd) \
- do { \
- pmd_t *____pmd = (__pmd); \
- anon_vma_lock_write(__anon_vma); \
- anon_vma_unlock_write(__anon_vma); \
- BUG_ON(pmd_trans_splitting(*____pmd) || \
- pmd_trans_huge(*____pmd)); \
- } while (0)
-extern void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
- pmd_t *pmd);
+
#if HPAGE_PMD_ORDER >= MAX_ORDER
#error "hugepages can't be allocated by the buddy allocator"
#endif
@@ -128,17 +114,17 @@ extern void vma_adjust_trans_huge(struct vm_area_struct *vma,
unsigned long start,
unsigned long end,
long adjust_next);
-extern int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+extern bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
spinlock_t **ptl);
/* mmap_sem must be held on entry */
-static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+static inline bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
spinlock_t **ptl)
{
VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma);
if (pmd_trans_huge(*pmd))
return __pmd_trans_huge_lock(pmd, vma, ptl);
else
- return 0;
+ return false;
}
static inline int hpage_nr_pages(struct page *page)
{
@@ -183,11 +169,8 @@ static inline int split_huge_page(struct page *page)
{
return 0;
}
-#define split_huge_page_pmd(__vma, __address, __pmd) \
- do { } while (0)
-#define wait_split_huge_page(__anon_vma, __pmd) \
- do { } while (0)
-#define split_huge_page_pmd_mm(__mm, __address, __pmd) \
+static inline void deferred_split_huge_page(struct page *page) {}
+#define split_huge_pmd(__vma, __pmd, __address) \
do { } while (0)
static inline int hugepage_madvise(struct vm_area_struct *vma,
unsigned long *vm_flags, int advice)
@@ -201,10 +184,10 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
long adjust_next)
{
}
-static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+static inline bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
spinlock_t **ptl)
{
- return 0;
+ return false;
}
static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 685c262e0be8..204c7f56f35a 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -265,20 +265,18 @@ struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
struct user_struct **user, int creat_flags,
int page_size_log);
-static inline int is_file_hugepages(struct file *file)
+static inline bool is_file_hugepages(struct file *file)
{
if (file->f_op == &hugetlbfs_file_operations)
- return 1;
- if (is_file_shm_hugepages(file))
- return 1;
+ return true;
- return 0;
+ return is_file_shm_hugepages(file);
}
#else /* !CONFIG_HUGETLBFS */
-#define is_file_hugepages(file) 0
+#define is_file_hugepages(file) false
static inline struct file *
hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag,
struct user_struct **user, int creat_flags,
diff --git a/include/linux/io.h b/include/linux/io.h
index de64c1e53612..72c35e0a41d1 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -29,6 +29,7 @@ struct device;
struct resource;
__visible void __iowrite32_copy(void __iomem *to, const void *from, size_t count);
+void __ioread32_copy(void *to, const void __iomem *from, size_t count);
void __iowrite64_copy(void __iomem *to, const void *from, size_t count);
#ifdef CONFIG_MMU
diff --git a/include/linux/kdev_t.h b/include/linux/kdev_t.h
index 052c7b32cc91..8e9e288b08c1 100644
--- a/include/linux/kdev_t.h
+++ b/include/linux/kdev_t.h
@@ -35,11 +35,6 @@ static inline dev_t old_decode_dev(u16 val)
return MKDEV((val >> 8) & 255, val & 255);
}
-static inline bool new_valid_dev(dev_t dev)
-{
- return 1;
-}
-
static inline u32 new_encode_dev(dev_t dev)
{
unsigned major = MAJOR(dev);
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 350dfb08aee3..ae6adf894be9 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -202,26 +202,26 @@ extern int _cond_resched(void);
/**
* abs - return absolute value of an argument
- * @x: the value. If it is unsigned type, it is converted to signed type first
- * (s64, long or int depending on its size).
+ * @x: the value. If it is unsigned type, it is converted to signed type first.
+ * char is treated as if it was signed (regardless of whether it really is)
+ * but the macro's return type is preserved as char.
*
- * Return: an absolute value of x. If x is 64-bit, macro's return type is s64,
- * otherwise it is signed long.
+ * Return: an absolute value of x.
*/
-#define abs(x) __builtin_choose_expr(sizeof(x) == sizeof(s64), ({ \
- s64 __x = (x); \
- (__x < 0) ? -__x : __x; \
- }), ({ \
- long ret; \
- if (sizeof(x) == sizeof(long)) { \
- long __x = (x); \
- ret = (__x < 0) ? -__x : __x; \
- } else { \
- int __x = (x); \
- ret = (__x < 0) ? -__x : __x; \
- } \
- ret; \
- }))
+#define abs(x) __abs_choose_expr(x, long long, \
+ __abs_choose_expr(x, long, \
+ __abs_choose_expr(x, int, \
+ __abs_choose_expr(x, short, \
+ __abs_choose_expr(x, char, \
+ __builtin_choose_expr( \
+ __builtin_types_compatible_p(typeof(x), char), \
+ (char)({ signed char __x = (x); __x<0?-__x:__x; }), \
+ ((void)0)))))))
+
+#define __abs_choose_expr(x, type, other) __builtin_choose_expr( \
+ __builtin_types_compatible_p(typeof(x), signed type) || \
+ __builtin_types_compatible_p(typeof(x), unsigned type), \
+ ({ signed type __x = (x); __x < 0 ? -__x : __x; }), other)
/**
* reciprocal_scale - "scale" a value into range [0, ep_ro)
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index d140b1e9faa7..2da38f093391 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -269,6 +269,8 @@ unsigned long paddr_vmcoreinfo_note(void);
vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name)
#define VMCOREINFO_CONFIG(name) \
vmcoreinfo_append_str("CONFIG_%s=y\n", #name)
+#define VMCOREINFO_PHYS_BASE(value) \
+ vmcoreinfo_append_str("PHYS_BASE=%lx\n", (unsigned long)value)
extern struct kimage *kexec_image;
extern struct kimage *kexec_crash_image;
diff --git a/include/linux/lz4.h b/include/linux/lz4.h
index 4356686b0a39..6b784c59f321 100644
--- a/include/linux/lz4.h
+++ b/include/linux/lz4.h
@@ -9,8 +9,8 @@
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
-#define LZ4_MEM_COMPRESS (4096 * sizeof(unsigned char *))
-#define LZ4HC_MEM_COMPRESS (65538 * sizeof(unsigned char *))
+#define LZ4_MEM_COMPRESS (16384)
+#define LZ4HC_MEM_COMPRESS (262144 + (2 * sizeof(unsigned char *)))
/*
* lz4_compressbound()
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 24daf8fc4d7c..a25cce949f9b 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -318,9 +318,9 @@ phys_addr_t memblock_mem_size(unsigned long limit_pfn);
phys_addr_t memblock_start_of_DRAM(void);
phys_addr_t memblock_end_of_DRAM(void);
void memblock_enforce_memory_limit(phys_addr_t memory_limit);
-int memblock_is_memory(phys_addr_t addr);
+bool memblock_is_memory(phys_addr_t addr);
int memblock_is_region_memory(phys_addr_t base, phys_addr_t size);
-int memblock_is_reserved(phys_addr_t addr);
+bool memblock_is_reserved(phys_addr_t addr);
bool memblock_is_region_reserved(phys_addr_t base, phys_addr_t size);
extern void __memblock_dump_all(void);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index cd0e2413c358..9d5472be551b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -85,22 +85,12 @@ enum mem_cgroup_events_target {
MEM_CGROUP_NTARGETS,
};
-/*
- * Bits in struct cg_proto.flags
- */
-enum cg_proto_flags {
- /* Currently active and new sockets should be assigned to cgroups */
- MEMCG_SOCK_ACTIVE,
- /* It was ever activated; we must disarm static keys on destruction */
- MEMCG_SOCK_ACTIVATED,
-};
-
struct cg_proto {
struct page_counter memory_allocated; /* Current allocated memory. */
struct percpu_counter sockets_allocated; /* Current number of sockets. */
int memory_pressure;
+ bool active;
long sysctl_mem[3];
- unsigned long flags;
/*
* memcg field is used to find which memcg we belong directly
* Each memcg struct can hold more than one cg_proto, so container_of
@@ -294,10 +284,12 @@ static inline void mem_cgroup_events(struct mem_cgroup *memcg,
bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcgp);
+ gfp_t gfp_mask, struct mem_cgroup **memcgp,
+ bool compound);
void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
- bool lrucare);
-void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg);
+ bool lrucare, bool compound);
+void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
+ bool compound);
void mem_cgroup_uncharge(struct page *page);
void mem_cgroup_uncharge_list(struct list_head *page_list);
@@ -513,7 +505,8 @@ static inline bool mem_cgroup_low(struct mem_cgroup *root,
static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask,
- struct mem_cgroup **memcgp)
+ struct mem_cgroup **memcgp,
+ bool compound)
{
*memcgp = NULL;
return 0;
@@ -521,12 +514,13 @@ static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
static inline void mem_cgroup_commit_charge(struct page *page,
struct mem_cgroup *memcg,
- bool lrucare)
+ bool lrucare, bool compound)
{
}
static inline void mem_cgroup_cancel_charge(struct page *page,
- struct mem_cgroup *memcg)
+ struct mem_cgroup *memcg,
+ bool compound)
{
}
@@ -766,15 +760,13 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg)
return memcg ? memcg->kmemcg_id : -1;
}
-struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep);
+struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp);
void __memcg_kmem_put_cache(struct kmem_cache *cachep);
-static inline bool __memcg_kmem_bypass(gfp_t gfp)
+static inline bool __memcg_kmem_bypass(void)
{
if (!memcg_kmem_enabled())
return true;
- if (gfp & __GFP_NOACCOUNT)
- return true;
if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
return true;
return false;
@@ -791,7 +783,9 @@ static inline bool __memcg_kmem_bypass(gfp_t gfp)
static __always_inline int memcg_kmem_charge(struct page *page,
gfp_t gfp, int order)
{
- if (__memcg_kmem_bypass(gfp))
+ if (__memcg_kmem_bypass())
+ return 0;
+ if (!(gfp & __GFP_ACCOUNT))
return 0;
return __memcg_kmem_charge(page, gfp, order);
}
@@ -810,16 +804,15 @@ static __always_inline void memcg_kmem_uncharge(struct page *page, int order)
/**
* memcg_kmem_get_cache: selects the correct per-memcg cache for allocation
* @cachep: the original global kmem cache
- * @gfp: allocation flags.
*
* All memory allocated from a per-memcg cache is charged to the owner memcg.
*/
static __always_inline struct kmem_cache *
memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
{
- if (__memcg_kmem_bypass(gfp))
+ if (__memcg_kmem_bypass())
return cachep;
- return __memcg_kmem_get_cache(cachep);
+ return __memcg_kmem_get_cache(cachep, gfp);
}
static __always_inline void memcg_kmem_put_cache(struct kmem_cache *cachep)
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 3d385c81c153..ed57b8b8ee78 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -9,7 +9,7 @@
#include <linux/mmzone.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
-#include <linux/spinlock.h>
+#include <linux/rwlock.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <uapi/linux/mempolicy.h>
@@ -122,7 +122,7 @@ struct sp_node {
struct shared_policy {
struct rb_root root;
- spinlock_t lock;
+ rwlock_t lock;
};
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst);
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index cac1c0904d5f..ab92a8cf4c93 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -23,9 +23,13 @@ enum migrate_reason {
MR_SYSCALL, /* also applies to cpusets */
MR_MEMPOLICY_MBIND,
MR_NUMA_MISPLACED,
- MR_CMA
+ MR_CMA,
+ MR_TYPES
};
+/* In mm/migrate.c; also keep sync with include/trace/events/migrate.h */
+extern char * migrate_reason_names[MR_TYPES];
+
#ifdef CONFIG_MIGRATION
extern void putback_movable_pages(struct list_head *l);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 00bad7793788..d988dd725b5c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -51,6 +51,17 @@ extern int sysctl_legacy_va_layout;
#define sysctl_legacy_va_layout 0
#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
+extern int mmap_rnd_bits_min;
+extern int mmap_rnd_bits_max;
+extern int mmap_rnd_bits;
+#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
+extern int mmap_rnd_compat_bits_min;
+extern int mmap_rnd_compat_bits_max;
+extern int mmap_rnd_compat_bits;
+#endif
+
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
@@ -395,39 +406,17 @@ static inline int is_vmalloc_or_module_addr(const void *x)
extern void kvfree(const void *addr);
-static inline void compound_lock(struct page *page)
+static inline atomic_t *compound_mapcount_ptr(struct page *page)
{
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- VM_BUG_ON_PAGE(PageSlab(page), page);
- bit_spin_lock(PG_compound_lock, &page->flags);
-#endif
+ return &page[1].compound_mapcount;
}
-static inline void compound_unlock(struct page *page)
+static inline int compound_mapcount(struct page *page)
{
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- VM_BUG_ON_PAGE(PageSlab(page), page);
- bit_spin_unlock(PG_compound_lock, &page->flags);
-#endif
-}
-
-static inline unsigned long compound_lock_irqsave(struct page *page)
-{
- unsigned long uninitialized_var(flags);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- local_irq_save(flags);
- compound_lock(page);
-#endif
- return flags;
-}
-
-static inline void compound_unlock_irqrestore(struct page *page,
- unsigned long flags)
-{
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- compound_unlock(page);
- local_irq_restore(flags);
-#endif
+ if (!PageCompound(page))
+ return 0;
+ page = compound_head(page);
+ return atomic_read(compound_mapcount_ptr(page)) + 1;
}
/*
@@ -440,55 +429,34 @@ static inline void page_mapcount_reset(struct page *page)
atomic_set(&(page)->_mapcount, -1);
}
+int __page_mapcount(struct page *page);
+
static inline int page_mapcount(struct page *page)
{
VM_BUG_ON_PAGE(PageSlab(page), page);
- return atomic_read(&page->_mapcount) + 1;
-}
-static inline int page_count(struct page *page)
-{
- return atomic_read(&compound_head(page)->_count);
-}
-
-static inline bool __compound_tail_refcounted(struct page *page)
-{
- return PageAnon(page) && !PageSlab(page) && !PageHeadHuge(page);
+ if (unlikely(PageCompound(page)))
+ return __page_mapcount(page);
+ return atomic_read(&page->_mapcount) + 1;
}
-/*
- * This takes a head page as parameter and tells if the
- * tail page reference counting can be skipped.
- *
- * For this to be safe, PageSlab and PageHeadHuge must remain true on
- * any given page where they return true here, until all tail pins
- * have been released.
- */
-static inline bool compound_tail_refcounted(struct page *page)
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int total_mapcount(struct page *page);
+#else
+static inline int total_mapcount(struct page *page)
{
- VM_BUG_ON_PAGE(!PageHead(page), page);
- return __compound_tail_refcounted(page);
+ return page_mapcount(page);
}
+#endif
-static inline void get_huge_page_tail(struct page *page)
+static inline int page_count(struct page *page)
{
- /*
- * __split_huge_page_refcount() cannot run from under us.
- */
- VM_BUG_ON_PAGE(!PageTail(page), page);
- VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
- VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
- if (compound_tail_refcounted(compound_head(page)))
- atomic_inc(&page->_mapcount);
+ return atomic_read(&compound_head(page)->_count);
}
-extern bool __get_page_tail(struct page *page);
-
static inline void get_page(struct page *page)
{
- if (unlikely(PageTail(page)))
- if (likely(__get_page_tail(page)))
- return;
+ page = compound_head(page);
/*
* Getting a normal page or the head of a compound page
* requires to already have an elevated page->_count.
@@ -513,7 +481,15 @@ static inline void init_page_count(struct page *page)
atomic_set(&page->_count, 1);
}
-void put_page(struct page *page);
+void __put_page(struct page *page);
+
+static inline void put_page(struct page *page)
+{
+ page = compound_head(page);
+ if (put_page_testzero(page))
+ __put_page(page);
+}
+
void put_pages_list(struct list_head *pages);
void split_page(struct page *page, unsigned int order);
@@ -533,6 +509,9 @@ enum compound_dtor_id {
#ifdef CONFIG_HUGETLB_PAGE
HUGETLB_PAGE_DTOR,
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ TRANSHUGE_PAGE_DTOR,
+#endif
NR_COMPOUND_DTORS,
};
extern compound_page_dtor * const compound_page_dtors[];
@@ -562,6 +541,8 @@ static inline void set_compound_order(struct page *page, unsigned int order)
page[1].compound_order = order;
}
+void free_compound_page(struct page *page);
+
#ifdef CONFIG_MMU
/*
* Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
@@ -978,10 +959,21 @@ static inline pgoff_t page_file_index(struct page *page)
/*
* Return true if this page is mapped into pagetables.
+ * For compound page it returns true if any subpage of compound page is mapped.
*/
-static inline int page_mapped(struct page *page)
-{
- return atomic_read(&(page)->_mapcount) >= 0;
+static inline bool page_mapped(struct page *page)
+{
+ int i;
+ if (likely(!PageCompound(page)))
+ return atomic_read(&page->_mapcount) >= 0;
+ page = compound_head(page);
+ if (atomic_read(compound_mapcount_ptr(page)) >= 0)
+ return true;
+ for (i = 0; i < hpage_nr_pages(page); i++) {
+ if (atomic_read(&page[i]._mapcount) >= 0)
+ return true;
+ }
+ return false;
}
/*
@@ -1361,10 +1353,26 @@ static inline void dec_mm_counter(struct mm_struct *mm, int member)
atomic_long_dec(&mm->rss_stat.count[member]);
}
+/* Optimized variant when page is already known not to be PageAnon */
+static inline int mm_counter_file(struct page *page)
+{
+ if (PageSwapBacked(page))
+ return MM_SHMEMPAGES;
+ return MM_FILEPAGES;
+}
+
+static inline int mm_counter(struct page *page)
+{
+ if (PageAnon(page))
+ return MM_ANONPAGES;
+ return mm_counter_file(page);
+}
+
static inline unsigned long get_mm_rss(struct mm_struct *mm)
{
return get_mm_counter(mm, MM_FILEPAGES) +
- get_mm_counter(mm, MM_ANONPAGES);
+ get_mm_counter(mm, MM_ANONPAGES) +
+ get_mm_counter(mm, MM_SHMEMPAGES);
}
static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
@@ -2222,7 +2230,7 @@ extern int memory_failure(unsigned long pfn, int trapno, int flags);
extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
extern int unpoison_memory(unsigned long pfn);
extern int get_hwpoison_page(struct page *page);
-extern void put_hwpoison_page(struct page *page);
+#define put_hwpoison_page(page) put_page(page)
extern int sysctl_memory_failure_early_kill;
extern int sysctl_memory_failure_recovery;
extern void shake_page(struct page *p, int access);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index f8d1492a114f..22beb225e88f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -54,6 +54,8 @@ struct page {
* see PAGE_MAPPING_ANON below.
*/
void *s_mem; /* slab first object */
+ atomic_t compound_mapcount; /* first tail page */
+ /* page_deferred_list().next -- second tail page */
};
/* Second double word */
@@ -61,6 +63,7 @@ struct page {
union {
pgoff_t index; /* Our offset within mapping. */
void *freelist; /* sl[aou]b first free object */
+ /* page_deferred_list().prev -- second tail page */
};
union {
@@ -81,20 +84,9 @@ struct page {
union {
/*
- * Count of ptes mapped in
- * mms, to show when page is
- * mapped & limit reverse map
- * searches.
- *
- * Used also for tail pages
- * refcounting instead of
- * _count. Tail pages cannot
- * be mapped and keeping the
- * tail page _count zero at
- * all times guarantees
- * get_page_unless_zero() will
- * never succeed on tail
- * pages.
+ * Count of ptes mapped in mms, to show
+ * when page is mapped & limit reverse
+ * map searches.
*/
atomic_t _mapcount;
@@ -369,9 +361,10 @@ struct core_state {
};
enum {
- MM_FILEPAGES,
- MM_ANONPAGES,
- MM_SWAPENTS,
+ MM_FILEPAGES, /* Resident file mapping pages */
+ MM_ANONPAGES, /* Resident anonymous pages */
+ MM_SWAPENTS, /* Anonymous swap entries */
+ MM_SHMEMPAGES, /* Resident shared memory pages */
NR_MM_COUNTERS
};
diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h
index 877ef226f90f..3b77fab7ad28 100644
--- a/include/linux/mmdebug.h
+++ b/include/linux/mmdebug.h
@@ -10,6 +10,7 @@ struct mm_struct;
extern void dump_page(struct page *page, const char *reason);
extern void dump_page_badflags(struct page *page, const char *reason,
unsigned long badflags);
+extern void dump_gfpflag_names(unsigned long gfp_flags);
void dump_vma(const struct vm_area_struct *vma);
void dump_mm(const struct mm_struct *mm);
@@ -55,4 +56,10 @@ void dump_mm(const struct mm_struct *mm);
#define VIRTUAL_BUG_ON(cond) do { } while (0)
#endif
+#ifdef CONFIG_DEBUG_VM_PGFLAGS
+#define VM_BUG_ON_PGFLAGS(cond, page) VM_BUG_ON_PAGE(cond, page)
+#else
+#define VM_BUG_ON_PGFLAGS(cond, page) BUILD_BUG_ON_INVALID(cond)
+#endif
+
#endif
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e23a9e704536..2bfad1803a65 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -63,6 +63,9 @@ enum {
MIGRATE_TYPES
};
+/* In mm/page_alloc.c; keep in sync also with show_migration_types() there */
+extern char * const migratetype_names[MIGRATE_TYPES];
+
#ifdef CONFIG_CMA
# define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
#else
@@ -195,11 +198,6 @@ static inline int is_active_lru(enum lru_list lru)
return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE);
}
-static inline int is_unevictable_lru(enum lru_list lru)
-{
- return (lru == LRU_UNEVICTABLE);
-}
-
struct zone_reclaim_stat {
/*
* The pageout code in vmscan.c keeps track of how many of the
@@ -576,19 +574,17 @@ static inline bool zone_is_empty(struct zone *zone)
/* Maximum number of zones on a zonelist */
#define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)
+enum {
+ ZONELIST_FALLBACK, /* zonelist with fallback */
#ifdef CONFIG_NUMA
-
-/*
- * The NUMA zonelists are doubled because we need zonelists that restrict the
- * allocations to a single node for __GFP_THISNODE.
- *
- * [0] : Zonelist with fallback
- * [1] : No fallback (__GFP_THISNODE)
- */
-#define MAX_ZONELISTS 2
-#else
-#define MAX_ZONELISTS 1
+ /*
+ * The NUMA zonelists are doubled because we need zonelists that
+ * restrict the allocations to a single node for __GFP_THISNODE.
+ */
+ ZONELIST_NOFALLBACK, /* zonelist without fallback (__GFP_THISNODE) */
#endif
+ MAX_ZONELISTS
+};
/*
* This struct contains information about a zone in a zonelist. It is stored
@@ -1207,13 +1203,13 @@ unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long);
* the zone and PFN linkages are still valid. This is expensive, but walkers
* of the full memmap are extremely rare.
*/
-int memmap_valid_within(unsigned long pfn,
+bool memmap_valid_within(unsigned long pfn,
struct page *page, struct zone *zone);
#else
-static inline int memmap_valid_within(unsigned long pfn,
+static inline bool memmap_valid_within(unsigned long pfn,
struct page *page, struct zone *zone)
{
- return 1;
+ return true;
}
#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index bb53c7b86315..19724e6ebd26 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -101,9 +101,6 @@ enum pageflags {
#ifdef CONFIG_MEMORY_FAILURE
PG_hwpoison, /* hardware poisoned page. Don't touch */
#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- PG_compound_lock,
-#endif
#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
PG_young,
PG_idle,
@@ -129,53 +126,104 @@ enum pageflags {
/* SLOB */
PG_slob_free = PG_private,
+
+ /* Compound pages. Stored in first tail page's flags */
+ PG_double_map = PG_private_2,
};
#ifndef __GENERATING_BOUNDS_H
+struct page; /* forward declaration */
+
+static inline struct page *compound_head(struct page *page)
+{
+ unsigned long head = READ_ONCE(page->compound_head);
+
+ if (unlikely(head & 1))
+ return (struct page *) (head - 1);
+ return page;
+}
+
+static inline int PageTail(struct page *page)
+{
+ return READ_ONCE(page->compound_head) & 1;
+}
+
+static inline int PageCompound(struct page *page)
+{
+ return test_bit(PG_head, &page->flags) || PageTail(page);
+}
+
+/*
+ * Page flags policies wrt compound pages
+ *
+ * PF_ANY:
+ * the page flag is relevant for small, head and tail pages.
+ *
+ * PF_HEAD:
+ * for compound page all operations related to the page flag applied to
+ * head page.
+ *
+ * PF_NO_TAIL:
+ * modifications of the page flag must be done on small or head pages,
+ * checks can be done on tail pages too.
+ *
+ * PF_NO_COMPOUND:
+ * the page flag is not relevant for compound pages.
+ */
+#define PF_ANY(page, enforce) page
+#define PF_HEAD(page, enforce) compound_head(page)
+#define PF_NO_TAIL(page, enforce) ({ \
+ VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page); \
+ compound_head(page);})
+#define PF_NO_COMPOUND(page, enforce) ({ \
+ VM_BUG_ON_PGFLAGS(enforce && PageCompound(page), page); \
+ page;})
+
/*
* Macros to create function definitions for page flags
*/
-#define TESTPAGEFLAG(uname, lname) \
-static inline int Page##uname(const struct page *page) \
- { return test_bit(PG_##lname, &page->flags); }
+#define TESTPAGEFLAG(uname, lname, policy) \
+static inline int Page##uname(struct page *page) \
+ { return test_bit(PG_##lname, &policy(page, 0)->flags); }
-#define SETPAGEFLAG(uname, lname) \
+#define SETPAGEFLAG(uname, lname, policy) \
static inline void SetPage##uname(struct page *page) \
- { set_bit(PG_##lname, &page->flags); }
+ { set_bit(PG_##lname, &policy(page, 1)->flags); }
-#define CLEARPAGEFLAG(uname, lname) \
+#define CLEARPAGEFLAG(uname, lname, policy) \
static inline void ClearPage##uname(struct page *page) \
- { clear_bit(PG_##lname, &page->flags); }
+ { clear_bit(PG_##lname, &policy(page, 1)->flags); }
-#define __SETPAGEFLAG(uname, lname) \
+#define __SETPAGEFLAG(uname, lname, policy) \
static inline void __SetPage##uname(struct page *page) \
- { __set_bit(PG_##lname, &page->flags); }
+ { __set_bit(PG_##lname, &policy(page, 1)->flags); }
-#define __CLEARPAGEFLAG(uname, lname) \
+#define __CLEARPAGEFLAG(uname, lname, policy) \
static inline void __ClearPage##uname(struct page *page) \
- { __clear_bit(PG_##lname, &page->flags); }
+ { __clear_bit(PG_##lname, &policy(page, 1)->flags); }
-#define TESTSETFLAG(uname, lname) \
+#define TESTSETFLAG(uname, lname, policy) \
static inline int TestSetPage##uname(struct page *page) \
- { return test_and_set_bit(PG_##lname, &page->flags); }
+ { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); }
-#define TESTCLEARFLAG(uname, lname) \
+#define TESTCLEARFLAG(uname, lname, policy) \
static inline int TestClearPage##uname(struct page *page) \
- { return test_and_clear_bit(PG_##lname, &page->flags); }
-
-#define __TESTCLEARFLAG(uname, lname) \
-static inline int __TestClearPage##uname(struct page *page) \
- { return __test_and_clear_bit(PG_##lname, &page->flags); }
+ { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
-#define PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \
- SETPAGEFLAG(uname, lname) CLEARPAGEFLAG(uname, lname)
+#define PAGEFLAG(uname, lname, policy) \
+ TESTPAGEFLAG(uname, lname, policy) \
+ SETPAGEFLAG(uname, lname, policy) \
+ CLEARPAGEFLAG(uname, lname, policy)
-#define __PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \
- __SETPAGEFLAG(uname, lname) __CLEARPAGEFLAG(uname, lname)
+#define __PAGEFLAG(uname, lname, policy) \
+ TESTPAGEFLAG(uname, lname, policy) \
+ __SETPAGEFLAG(uname, lname, policy) \
+ __CLEARPAGEFLAG(uname, lname, policy)
-#define TESTSCFLAG(uname, lname) \
- TESTSETFLAG(uname, lname) TESTCLEARFLAG(uname, lname)
+#define TESTSCFLAG(uname, lname, policy) \
+ TESTSETFLAG(uname, lname, policy) \
+ TESTCLEARFLAG(uname, lname, policy)
#define TESTPAGEFLAG_FALSE(uname) \
static inline int Page##uname(const struct page *page) { return 0; }
@@ -195,56 +243,62 @@ static inline int TestSetPage##uname(struct page *page) { return 0; }
#define TESTCLEARFLAG_FALSE(uname) \
static inline int TestClearPage##uname(struct page *page) { return 0; }
-#define __TESTCLEARFLAG_FALSE(uname) \
-static inline int __TestClearPage##uname(struct page *page) { return 0; }
-
#define PAGEFLAG_FALSE(uname) TESTPAGEFLAG_FALSE(uname) \
SETPAGEFLAG_NOOP(uname) CLEARPAGEFLAG_NOOP(uname)
#define TESTSCFLAG_FALSE(uname) \
TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname)
-struct page; /* forward declaration */
-
-TESTPAGEFLAG(Locked, locked)
-PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error)
-PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced)
- __SETPAGEFLAG(Referenced, referenced)
-PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty)
-PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru)
-PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active)
- TESTCLEARFLAG(Active, active)
-__PAGEFLAG(Slab, slab)
-PAGEFLAG(Checked, checked) /* Used by some filesystems */
-PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */
-PAGEFLAG(SavePinned, savepinned); /* Xen */
-PAGEFLAG(Foreign, foreign); /* Xen */
-PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
-PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked)
- __SETPAGEFLAG(SwapBacked, swapbacked)
-
-__PAGEFLAG(SlobFree, slob_free)
+__PAGEFLAG(Locked, locked, PF_NO_TAIL)
+PAGEFLAG(Error, error, PF_NO_COMPOUND) TESTCLEARFLAG(Error, error, PF_NO_COMPOUND)
+PAGEFLAG(Referenced, referenced, PF_HEAD)
+ TESTCLEARFLAG(Referenced, referenced, PF_HEAD)
+ __SETPAGEFLAG(Referenced, referenced, PF_HEAD)
+PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)
+ __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD)
+PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)
+PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
+ TESTCLEARFLAG(Active, active, PF_HEAD)
+__PAGEFLAG(Slab, slab, PF_NO_TAIL)
+__PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL)
+PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */
+
+/* Xen */
+PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND)
+ TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND)
+PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND);
+PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND);
+
+PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
+ __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
+PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
+ __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
+ __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
/*
* Private page markings that may be used by the filesystem that owns the page
* for its own purposes.
* - PG_private and PG_private_2 cause releasepage() and co to be invoked
*/
-PAGEFLAG(Private, private) __SETPAGEFLAG(Private, private)
- __CLEARPAGEFLAG(Private, private)
-PAGEFLAG(Private2, private_2) TESTSCFLAG(Private2, private_2)
-PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1)
+PAGEFLAG(Private, private, PF_ANY) __SETPAGEFLAG(Private, private, PF_ANY)
+ __CLEARPAGEFLAG(Private, private, PF_ANY)
+PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY)
+PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
+ TESTCLEARFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
/*
* Only test-and-set exist for PG_writeback. The unconditional operators are
* risky: they bypass page accounting.
*/
-TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback)
-PAGEFLAG(MappedToDisk, mappedtodisk)
+TESTPAGEFLAG(Writeback, writeback, PF_NO_COMPOUND)
+ TESTSCFLAG(Writeback, writeback, PF_NO_COMPOUND)
+PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_COMPOUND)
/* PG_readahead is only used for reads; PG_reclaim is only for writes */
-PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim)
-PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim)
+PAGEFLAG(Reclaim, reclaim, PF_NO_COMPOUND)
+ TESTCLEARFLAG(Reclaim, reclaim, PF_NO_COMPOUND)
+PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND)
+ TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND)
#ifdef CONFIG_HIGHMEM
/*
@@ -257,31 +311,33 @@ PAGEFLAG_FALSE(HighMem)
#endif
#ifdef CONFIG_SWAP
-PAGEFLAG(SwapCache, swapcache)
+PAGEFLAG(SwapCache, swapcache, PF_NO_COMPOUND)
#else
PAGEFLAG_FALSE(SwapCache)
#endif
-PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable)
- TESTCLEARFLAG(Unevictable, unevictable)
+PAGEFLAG(Unevictable, unevictable, PF_HEAD)
+ __CLEARPAGEFLAG(Unevictable, unevictable, PF_HEAD)
+ TESTCLEARFLAG(Unevictable, unevictable, PF_HEAD)
#ifdef CONFIG_MMU
-PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked)
- TESTSCFLAG(Mlocked, mlocked) __TESTCLEARFLAG(Mlocked, mlocked)
+PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
+ __CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
+ TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL)
#else
PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked)
- TESTSCFLAG_FALSE(Mlocked) __TESTCLEARFLAG_FALSE(Mlocked)
+ TESTSCFLAG_FALSE(Mlocked)
#endif
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
-PAGEFLAG(Uncached, uncached)
+PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND)
#else
PAGEFLAG_FALSE(Uncached)
#endif
#ifdef CONFIG_MEMORY_FAILURE
-PAGEFLAG(HWPoison, hwpoison)
-TESTSCFLAG(HWPoison, hwpoison)
+PAGEFLAG(HWPoison, hwpoison, PF_ANY)
+TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
#define __PG_HWPOISON (1UL << PG_hwpoison)
#else
PAGEFLAG_FALSE(HWPoison)
@@ -289,10 +345,10 @@ PAGEFLAG_FALSE(HWPoison)
#endif
#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
-TESTPAGEFLAG(Young, young)
-SETPAGEFLAG(Young, young)
-TESTCLEARFLAG(Young, young)
-PAGEFLAG(Idle, idle)
+TESTPAGEFLAG(Young, young, PF_ANY)
+SETPAGEFLAG(Young, young, PF_ANY)
+TESTCLEARFLAG(Young, young, PF_ANY)
+PAGEFLAG(Idle, idle, PF_ANY)
#endif
/*
@@ -317,6 +373,7 @@ PAGEFLAG(Idle, idle)
static inline int PageAnon(struct page *page)
{
+ page = compound_head(page);
return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
}
@@ -329,6 +386,7 @@ static inline int PageAnon(struct page *page)
*/
static inline int PageKsm(struct page *page)
{
+ page = compound_head(page);
return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
}
@@ -340,8 +398,9 @@ u64 stable_page_flags(struct page *page);
static inline int PageUptodate(struct page *page)
{
- int ret = test_bit(PG_uptodate, &(page)->flags);
-
+ int ret;
+ page = compound_head(page);
+ ret = test_bit(PG_uptodate, &(page)->flags);
/*
* Must ensure that the data we read out of the page is loaded
* _after_ we've loaded page->flags to check for PageUptodate.
@@ -358,22 +417,24 @@ static inline int PageUptodate(struct page *page)
static inline void __SetPageUptodate(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
smp_wmb();
- __set_bit(PG_uptodate, &(page)->flags);
+ __set_bit(PG_uptodate, &page->flags);
}
static inline void SetPageUptodate(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
/*
* Memory barrier must be issued before setting the PG_uptodate bit,
* so that all previous stores issued in order to bring the page
* uptodate are actually visible before PageUptodate becomes true.
*/
smp_wmb();
- set_bit(PG_uptodate, &(page)->flags);
+ set_bit(PG_uptodate, &page->flags);
}
-CLEARPAGEFLAG(Uptodate, uptodate)
+CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL)
int test_clear_page_writeback(struct page *page);
int __test_set_page_writeback(struct page *page, bool keep_write);
@@ -393,12 +454,7 @@ static inline void set_page_writeback_keepwrite(struct page *page)
test_set_page_writeback_keepwrite(page);
}
-__PAGEFLAG(Head, head) CLEARPAGEFLAG(Head, head)
-
-static inline int PageTail(struct page *page)
-{
- return READ_ONCE(page->compound_head) & 1;
-}
+__PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY)
static inline void set_compound_head(struct page *page, struct page *head)
{
@@ -410,20 +466,6 @@ static inline void clear_compound_head(struct page *page)
WRITE_ONCE(page->compound_head, 0);
}
-static inline struct page *compound_head(struct page *page)
-{
- unsigned long head = READ_ONCE(page->compound_head);
-
- if (unlikely(head & 1))
- return (struct page *) (head - 1);
- return page;
-}
-
-static inline int PageCompound(struct page *page)
-{
- return PageHead(page) || PageTail(page);
-
-}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline void ClearPageCompound(struct page *page)
{
@@ -484,22 +526,43 @@ static inline int PageTransTail(struct page *page)
return PageTail(page);
}
-#else
-
-static inline int PageTransHuge(struct page *page)
+/*
+ * PageDoubleMap indicates that the compound page is mapped with PTEs as well
+ * as PMDs.
+ *
+ * This is required for optimization of rmap operations for THP: we can postpone
+ * per small page mapcount accounting (and its overhead from atomic operations)
+ * until the first PMD split.
+ *
+ * For the page PageDoubleMap means ->_mapcount in all sub-pages is offset up
+ * by one. This reference will go away with last compound_mapcount.
+ *
+ * See also __split_huge_pmd_locked() and page_remove_anon_compound_rmap().
+ */
+static inline int PageDoubleMap(struct page *page)
{
- return 0;
+ return PageHead(page) && test_bit(PG_double_map, &page[1].flags);
}
-static inline int PageTransCompound(struct page *page)
+static inline int TestSetPageDoubleMap(struct page *page)
{
- return 0;
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+ return test_and_set_bit(PG_double_map, &page[1].flags);
}
-static inline int PageTransTail(struct page *page)
+static inline int TestClearPageDoubleMap(struct page *page)
{
- return 0;
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+ return test_and_clear_bit(PG_double_map, &page[1].flags);
}
+
+#else
+TESTPAGEFLAG_FALSE(TransHuge)
+TESTPAGEFLAG_FALSE(TransCompound)
+TESTPAGEFLAG_FALSE(TransTail)
+TESTPAGEFLAG_FALSE(DoubleMap)
+ TESTSETFLAG_FALSE(DoubleMap)
+ TESTCLEARFLAG_FALSE(DoubleMap)
#endif
/*
@@ -583,12 +646,6 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
#define __PG_MLOCKED 0
#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define __PG_COMPOUND_LOCK (1 << PG_compound_lock)
-#else
-#define __PG_COMPOUND_LOCK 0
-#endif
-
/*
* Flags checked when a page is freed. Pages being freed should not have
* these flags set. It they are, there is a problem.
@@ -598,8 +655,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
1 << PG_private | 1 << PG_private_2 | \
1 << PG_writeback | 1 << PG_reserved | \
1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \
- 1 << PG_unevictable | __PG_MLOCKED | \
- __PG_COMPOUND_LOCK)
+ 1 << PG_unevictable | __PG_MLOCKED)
/*
* Flags checked when a page is prepped for return by the page allocator.
@@ -626,6 +682,10 @@ static inline int page_has_private(struct page *page)
return !!(page->flags & PAGE_FLAGS_PRIVATE);
}
+#undef PF_ANY
+#undef PF_HEAD
+#undef PF_NO_TAIL
+#undef PF_NO_COMPOUND
#endif /* !__GENERATING_BOUNDS_H */
#endif /* PAGE_FLAGS_H */
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index 17f118a82854..e1fe7cf5bddf 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -45,6 +45,7 @@ struct page_ext {
unsigned int order;
gfp_t gfp_mask;
unsigned int nr_entries;
+ int last_migrate_reason;
unsigned long trace_entries[8];
#endif
};
diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h
index cacaabea8a09..46f1b939948c 100644
--- a/include/linux/page_owner.h
+++ b/include/linux/page_owner.h
@@ -1,38 +1,54 @@
#ifndef __LINUX_PAGE_OWNER_H
#define __LINUX_PAGE_OWNER_H
+#include <linux/jump_label.h>
+
#ifdef CONFIG_PAGE_OWNER
-extern bool page_owner_inited;
+extern struct static_key_false page_owner_inited;
extern struct page_ext_operations page_owner_ops;
extern void __reset_page_owner(struct page *page, unsigned int order);
extern void __set_page_owner(struct page *page,
unsigned int order, gfp_t gfp_mask);
extern gfp_t __get_page_owner_gfp(struct page *page);
+extern void __copy_page_owner(struct page *oldpage, struct page *newpage);
+extern void __set_page_owner_migrate_reason(struct page *page, int reason);
+extern void __dump_page_owner(struct page *page);
static inline void reset_page_owner(struct page *page, unsigned int order)
{
- if (likely(!page_owner_inited))
- return;
-
- __reset_page_owner(page, order);
+ if (static_branch_unlikely(&page_owner_inited))
+ __reset_page_owner(page, order);
}
static inline void set_page_owner(struct page *page,
unsigned int order, gfp_t gfp_mask)
{
- if (likely(!page_owner_inited))
- return;
-
- __set_page_owner(page, order, gfp_mask);
+ if (static_branch_unlikely(&page_owner_inited))
+ __set_page_owner(page, order, gfp_mask);
}
static inline gfp_t get_page_owner_gfp(struct page *page)
{
- if (likely(!page_owner_inited))
+ if (static_branch_unlikely(&page_owner_inited))
+ return __get_page_owner_gfp(page);
+ else
return 0;
-
- return __get_page_owner_gfp(page);
+}
+static inline void copy_page_owner(struct page *oldpage, struct page *newpage)
+{
+ if (static_branch_unlikely(&page_owner_inited))
+ __copy_page_owner(oldpage, newpage);
+}
+static inline void set_page_owner_migrate_reason(struct page *page, int reason)
+{
+ if (static_branch_unlikely(&page_owner_inited))
+ __set_page_owner_migrate_reason(page, reason);
+}
+static inline void dump_page_owner(struct page *page)
+{
+ if (static_branch_unlikely(&page_owner_inited))
+ __dump_page_owner(page);
}
#else
static inline void reset_page_owner(struct page *page, unsigned int order)
@@ -46,6 +62,14 @@ static inline gfp_t get_page_owner_gfp(struct page *page)
{
return 0;
}
-
+static inline void copy_page_owner(struct page *oldpage, struct page *newpage)
+{
+}
+static inline void set_page_owner_migrate_reason(struct page *page, int reason)
+{
+}
+static inline void dump_page_owner(struct page *page)
+{
+}
#endif /* CONFIG_PAGE_OWNER */
#endif /* __LINUX_PAGE_OWNER_H */
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 26eabf5ec718..4d08b6c33557 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -394,10 +394,21 @@ static inline struct page *read_mapping_page(struct address_space *mapping,
*/
static inline pgoff_t page_to_pgoff(struct page *page)
{
+ pgoff_t pgoff;
+
if (unlikely(PageHeadHuge(page)))
return page->index << compound_order(page);
- else
+
+ if (likely(!PageTransTail(page)))
return page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+
+ /*
+ * We don't initialize ->index for tail pages: calculate based on
+ * head page
+ */
+ pgoff = compound_head(page)->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ pgoff += page - compound_head(page);
+ return pgoff;
}
/*
@@ -433,18 +444,9 @@ extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
unsigned int flags);
extern void unlock_page(struct page *page);
-static inline void __set_page_locked(struct page *page)
-{
- __set_bit(PG_locked, &page->flags);
-}
-
-static inline void __clear_page_locked(struct page *page)
-{
- __clear_bit(PG_locked, &page->flags);
-}
-
static inline int trylock_page(struct page *page)
{
+ page = compound_head(page);
return (likely(!test_and_set_bit_lock(PG_locked, &page->flags)));
}
@@ -497,9 +499,9 @@ extern int wait_on_page_bit_killable_timeout(struct page *page,
static inline int wait_on_page_locked_killable(struct page *page)
{
- if (PageLocked(page))
- return wait_on_page_bit_killable(page, PG_locked);
- return 0;
+ if (!PageLocked(page))
+ return 0;
+ return wait_on_page_bit_killable(compound_head(page), PG_locked);
}
extern wait_queue_head_t *page_waitqueue(struct page *page);
@@ -518,7 +520,7 @@ static inline void wake_up_page(struct page *page, int bit)
static inline void wait_on_page_locked(struct page *page)
{
if (PageLocked(page))
- wait_on_page_bit(page, PG_locked);
+ wait_on_page_bit(compound_head(page), PG_locked);
}
/*
@@ -664,17 +666,17 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
/*
* Like add_to_page_cache_locked, but used to add newly allocated pages:
- * the page is new, so we can just run __set_page_locked() against it.
+ * the page is new, so we can just run __SetPageLocked() against it.
*/
static inline int add_to_page_cache(struct page *page,
struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
{
int error;
- __set_page_locked(page);
+ __SetPageLocked(page);
error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
if (unlikely(error))
- __clear_page_locked(page);
+ __ClearPageLocked(page);
return error;
}
diff --git a/include/linux/pfn.h b/include/linux/pfn.h
index 7646637221f3..97f3e88aead4 100644
--- a/include/linux/pfn.h
+++ b/include/linux/pfn.h
@@ -9,5 +9,6 @@
#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)
#define PFN_PHYS(x) ((phys_addr_t)(x) << PAGE_SHIFT)
+#define PHYS_PFN(x) ((unsigned long)((x) >> PAGE_SHIFT))
#endif
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 317e16de09e5..4a27153574e2 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -27,11 +27,15 @@
* Magic number "tsta" to indicate a static timer initializer
* for the object debugging code.
*/
-#define TIMER_ENTRY_STATIC ((void *) 0x74737461)
+#define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA)
/********** mm/debug-pagealloc.c **********/
#define PAGE_POISON 0xaa
+/********** mm/page_alloc.c ************/
+
+#define TAIL_MAPPING ((void *) 0x400 + POISON_POINTER_DELTA)
+
/********** mm/slab.c **********/
/*
* Magic nums for obj red zoning.
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 29446aeef36e..77d1ba57d495 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -161,25 +161,31 @@ static inline void anon_vma_merge(struct vm_area_struct *vma,
struct anon_vma *page_get_anon_vma(struct page *page);
+/* bitflags for do_page_add_anon_rmap() */
+#define RMAP_EXCLUSIVE 0x01
+#define RMAP_COMPOUND 0x02
+
/*
* rmap interfaces called when adding or removing pte of page
*/
void page_move_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
-void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
+void page_add_anon_rmap(struct page *, struct vm_area_struct *,
+ unsigned long, bool);
void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long, int);
-void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
+void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
+ unsigned long, bool);
void page_add_file_rmap(struct page *);
-void page_remove_rmap(struct page *);
+void page_remove_rmap(struct page *, bool);
void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long);
void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long);
-static inline void page_dup_rmap(struct page *page)
+static inline void page_dup_rmap(struct page *page, bool compound)
{
- atomic_inc(&page->_mapcount);
+ atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount);
}
/*
@@ -210,6 +216,25 @@ static inline pte_t *page_check_address(struct page *page, struct mm_struct *mm,
}
/*
+ * Used by idle page tracking to check if a page was referenced via page
+ * tables.
+ */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+bool page_check_address_transhuge(struct page *page, struct mm_struct *mm,
+ unsigned long address, pmd_t **pmdp,
+ pte_t **ptep, spinlock_t **ptlp);
+#else
+static inline bool page_check_address_transhuge(struct page *page,
+ struct mm_struct *mm, unsigned long address,
+ pmd_t **pmdp, pte_t **ptep, spinlock_t **ptlp)
+{
+ *ptep = page_check_address(page, mm, address, ptlp, 0);
+ *pmdp = NULL;
+ return !!*ptep;
+}
+#endif
+
+/*
* Used by swapoff to help locate where page is expected in vma.
*/
unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
diff --git a/include/linux/shm.h b/include/linux/shm.h
index 6fb801686ad6..04e881829625 100644
--- a/include/linux/shm.h
+++ b/include/linux/shm.h
@@ -52,7 +52,7 @@ struct sysv_shm {
long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr,
unsigned long shmlba);
-int is_file_shm_hugepages(struct file *file);
+bool is_file_shm_hugepages(struct file *file);
void exit_shm(struct task_struct *task);
#define shm_init_task(task) INIT_LIST_HEAD(&(task)->sysvshm.shm_clist)
#else
@@ -66,9 +66,9 @@ static inline long do_shmat(int shmid, char __user *shmaddr,
{
return -ENOSYS;
}
-static inline int is_file_shm_hugepages(struct file *file)
+static inline bool is_file_shm_hugepages(struct file *file)
{
- return 0;
+ return false;
}
static inline void exit_shm(struct task_struct *task)
{
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 50777b5b1e4c..a43f41cb3c43 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -60,6 +60,10 @@ extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
extern int shmem_unuse(swp_entry_t entry, struct page *page);
+extern unsigned long shmem_swap_usage(struct vm_area_struct *vma);
+extern unsigned long shmem_partial_swap_usage(struct address_space *mapping,
+ pgoff_t start, pgoff_t end);
+
static inline struct page *shmem_read_mapping_page(
struct address_space *mapping, pgoff_t index)
{
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 2037a861e367..3ffee7422012 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -86,6 +86,11 @@
#else
# define SLAB_FAILSLAB 0x00000000UL
#endif
+#ifdef CONFIG_MEMCG_KMEM
+# define SLAB_ACCOUNT 0x04000000UL /* Account to memcg */
+#else
+# define SLAB_ACCOUNT 0x00000000UL
+#endif
/* The following flags affect the page allocator grouping pages by mobility */
#define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 9ef42e1f0b3a..3cc9632dcc2a 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -102,7 +102,7 @@ static inline int try_stop_cpus(const struct cpumask *cpumask,
* grabbing every spinlock (and more). So the "read" side to such a
* lock is anything which disables preemption.
*/
-#if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP)
+#if defined(CONFIG_SMP) || defined(CONFIG_HOTPLUG_CPU)
/**
* stop_machine: freeze the machine on all CPUs and run this function
@@ -121,7 +121,7 @@ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus);
int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
const struct cpumask *cpus);
-#else /* CONFIG_STOP_MACHINE && CONFIG_SMP */
+#else /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */
static inline int stop_machine(cpu_stop_fn_t fn, void *data,
const struct cpumask *cpus)
@@ -140,5 +140,5 @@ static inline int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
return stop_machine(fn, data, cpus);
}
-#endif /* CONFIG_STOP_MACHINE && CONFIG_SMP */
+#endif /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */
#endif /* _LINUX_STOP_MACHINE */
diff --git a/include/linux/string.h b/include/linux/string.h
index 9ef7795e65e4..cc77d6477fc2 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -121,6 +121,7 @@ extern void kfree_const(const void *x);
extern char *kstrdup(const char *s, gfp_t gfp);
extern const char *kstrdup_const(const char *s, gfp_t gfp);
extern char *kstrndup(const char *s, size_t len, gfp_t gfp);
+extern char *kstrimdup(const char *s, gfp_t gfp);
extern void *kmemdup(const void *src, size_t len, gfp_t gfp);
extern char **argv_split(gfp_t gfp, const char *str, int *argcp);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7ba7dccaf0e7..457181844b6e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -154,7 +154,7 @@ enum {
SWP_SCANNING = (1 << 10), /* refcount in scan_swap_map */
};
-#define SWAP_CLUSTER_MAX 32UL
+#define SWAP_CLUSTER_MAX 256UL
#define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
/*
@@ -539,7 +539,8 @@ static inline int swp_swapcount(swp_entry_t entry)
return 0;
}
-#define reuse_swap_page(page) (page_mapcount(page) == 1)
+#define reuse_swap_page(page) \
+ (!PageTransCompound(page) && page_mapcount(page) == 1)
static inline int try_to_free_swap(struct page *page)
{
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index ff307b548ed3..b4c2a485b28a 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -56,9 +56,10 @@ extern long do_no_restart_syscall(struct restart_block *parm);
#ifdef __KERNEL__
#ifdef CONFIG_DEBUG_STACK_USAGE
-# define THREADINFO_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
+# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | \
+ __GFP_ZERO)
#else
-# define THREADINFO_GFP (GFP_KERNEL | __GFP_NOTRACK)
+# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK)
#endif
/*
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index e623d392db0c..e1f8c993e73b 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -68,7 +68,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
THP_FAULT_FALLBACK,
THP_COLLAPSE_ALLOC,
THP_COLLAPSE_ALLOC_FAILED,
- THP_SPLIT,
+ THP_SPLIT_PAGE,
+ THP_SPLIT_PAGE_FAILED,
+ THP_SPLIT_PMD,
THP_ZERO_PAGE_ALLOC,
THP_ZERO_PAGE_ALLOC_FAILED,
#endif
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 3bff87a25a42..d1f1d338af20 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -14,7 +14,6 @@ struct vm_area_struct; /* vma defining user mapping in mm_types.h */
#define VM_ALLOC 0x00000002 /* vmalloc() */
#define VM_MAP 0x00000004 /* vmap()ed pages */
#define VM_USERMAP 0x00000008 /* suitable for remap_vmalloc_range */
-#define VM_VPAGES 0x00000010 /* buffer for pages was vmalloc'ed */
#define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */
#define VM_NO_GUARD 0x00000040 /* don't add guard page */
#define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */
diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h
index dde6bf092c8a..3d580fd4079a 100644
--- a/include/trace/events/gfpflags.h
+++ b/include/trace/events/gfpflags.h
@@ -8,8 +8,8 @@
*
* Thus most bits set go first.
*/
-#define show_gfp_flags(flags) \
- (flags) ? __print_flags(flags, "|", \
+
+#define __def_gfpflag_names \
{(unsigned long)GFP_TRANSHUGE, "GFP_TRANSHUGE"}, \
{(unsigned long)GFP_HIGHUSER_MOVABLE, "GFP_HIGHUSER_MOVABLE"}, \
{(unsigned long)GFP_HIGHUSER, "GFP_HIGHUSER"}, \
@@ -19,9 +19,13 @@
{(unsigned long)GFP_NOFS, "GFP_NOFS"}, \
{(unsigned long)GFP_ATOMIC, "GFP_ATOMIC"}, \
{(unsigned long)GFP_NOIO, "GFP_NOIO"}, \
+ {(unsigned long)GFP_NOWAIT, "GFP_NOWAIT"}, \
+ {(unsigned long)__GFP_DMA, "GFP_DMA"}, \
+ {(unsigned long)__GFP_DMA32, "GFP_DMA32"}, \
{(unsigned long)__GFP_HIGH, "GFP_HIGH"}, \
{(unsigned long)__GFP_ATOMIC, "GFP_ATOMIC"}, \
{(unsigned long)__GFP_IO, "GFP_IO"}, \
+ {(unsigned long)__GFP_FS, "GFP_FS"}, \
{(unsigned long)__GFP_COLD, "GFP_COLD"}, \
{(unsigned long)__GFP_NOWARN, "GFP_NOWARN"}, \
{(unsigned long)__GFP_REPEAT, "GFP_REPEAT"}, \
@@ -36,8 +40,12 @@
{(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \
{(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \
{(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \
+ {(unsigned long)__GFP_WRITE, "GFP_WRITE"}, \
{(unsigned long)__GFP_DIRECT_RECLAIM, "GFP_DIRECT_RECLAIM"}, \
{(unsigned long)__GFP_KSWAPD_RECLAIM, "GFP_KSWAPD_RECLAIM"}, \
{(unsigned long)__GFP_OTHER_NODE, "GFP_OTHER_NODE"} \
- ) : "GFP_NOWAIT"
+#define show_gfp_flags(flags) \
+ (flags) ? __print_flags(flags, "|", \
+ __def_gfpflag_names \
+ ) : "none"
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
new file mode 100644
index 000000000000..bfcf4a16aa94
--- /dev/null
+++ b/include/trace/events/huge_memory.h
@@ -0,0 +1,166 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM huge_memory
+
+#if !defined(__HUGE_MEMORY_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HUGE_MEMORY_H
+
+#include <linux/tracepoint.h>
+
+#include <trace/events/gfpflags.h>
+
+#define SCAN_STATUS \
+ EM( SCAN_FAIL, "failed") \
+ EM( SCAN_SUCCEED, "succeeded") \
+ EM( SCAN_PMD_NULL, "pmd_null") \
+ EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \
+ EM( SCAN_PTE_NON_PRESENT, "pte_non_present") \
+ EM( SCAN_PAGE_RO, "no_writable_page") \
+ EM( SCAN_NO_REFERENCED_PAGE, "no_referenced_page") \
+ EM( SCAN_PAGE_NULL, "page_null") \
+ EM( SCAN_SCAN_ABORT, "scan_aborted") \
+ EM( SCAN_PAGE_COUNT, "not_suitable_page_count") \
+ EM( SCAN_PAGE_LRU, "page_not_in_lru") \
+ EM( SCAN_PAGE_LOCK, "page_locked") \
+ EM( SCAN_PAGE_ANON, "page_not_anon") \
+ EM( SCAN_PAGE_COMPOUND, "page_compound") \
+ EM( SCAN_ANY_PROCESS, "no_process_for_page") \
+ EM( SCAN_VMA_NULL, "vma_null") \
+ EM( SCAN_VMA_CHECK, "vma_check_failed") \
+ EM( SCAN_ADDRESS_RANGE, "not_suitable_address_range") \
+ EM( SCAN_SWAP_CACHE_PAGE, "page_swap_cache") \
+ EM( SCAN_DEL_PAGE_LRU, "could_not_delete_page_from_lru")\
+ EM( SCAN_ALLOC_HUGE_PAGE_FAIL, "alloc_huge_page_failed") \
+ EM( SCAN_CGROUP_CHARGE_FAIL, "ccgroup_charge_failed") \
+ EMe( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte")
+
+#undef EM
+#undef EMe
+#define EM(a, b) TRACE_DEFINE_ENUM(a);
+#define EMe(a, b) TRACE_DEFINE_ENUM(a);
+
+SCAN_STATUS
+
+#undef EM
+#undef EMe
+#define EM(a, b) {a, b},
+#define EMe(a, b) {a, b}
+
+TRACE_EVENT(mm_khugepaged_scan_pmd,
+
+ TP_PROTO(struct mm_struct *mm, struct page *page, bool writable,
+ bool referenced, int none_or_zero, int status, int unmapped),
+
+ TP_ARGS(mm, page, writable, referenced, none_or_zero, status, unmapped),
+
+ TP_STRUCT__entry(
+ __field(struct mm_struct *, mm)
+ __field(unsigned long, pfn)
+ __field(bool, writable)
+ __field(bool, referenced)
+ __field(int, none_or_zero)
+ __field(int, status)
+ __field(int, unmapped)
+ ),
+
+ TP_fast_assign(
+ __entry->mm = mm;
+ __entry->pfn = page ? page_to_pfn(page) : -1;
+ __entry->writable = writable;
+ __entry->referenced = referenced;
+ __entry->none_or_zero = none_or_zero;
+ __entry->status = status;
+ __entry->unmapped = unmapped;
+ ),
+
+ TP_printk("mm=%p, scan_pfn=0x%lx, writable=%d, referenced=%d, none_or_zero=%d, status=%s, unmapped=%d",
+ __entry->mm,
+ __entry->pfn,
+ __entry->writable,
+ __entry->referenced,
+ __entry->none_or_zero,
+ __print_symbolic(__entry->status, SCAN_STATUS),
+ __entry->unmapped)
+);
+
+TRACE_EVENT(mm_collapse_huge_page,
+
+ TP_PROTO(struct mm_struct *mm, int isolated, int status),
+
+ TP_ARGS(mm, isolated, status),
+
+ TP_STRUCT__entry(
+ __field(struct mm_struct *, mm)
+ __field(int, isolated)
+ __field(int, status)
+ ),
+
+ TP_fast_assign(
+ __entry->mm = mm;
+ __entry->isolated = isolated;
+ __entry->status = status;
+ ),
+
+ TP_printk("mm=%p, isolated=%d, status=%s",
+ __entry->mm,
+ __entry->isolated,
+ __print_symbolic(__entry->status, SCAN_STATUS))
+);
+
+TRACE_EVENT(mm_collapse_huge_page_isolate,
+
+ TP_PROTO(struct page *page, int none_or_zero,
+ bool referenced, bool writable, int status),
+
+ TP_ARGS(page, none_or_zero, referenced, writable, status),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, pfn)
+ __field(int, none_or_zero)
+ __field(bool, referenced)
+ __field(bool, writable)
+ __field(int, status)
+ ),
+
+ TP_fast_assign(
+ __entry->pfn = page ? page_to_pfn(page) : -1;
+ __entry->none_or_zero = none_or_zero;
+ __entry->referenced = referenced;
+ __entry->writable = writable;
+ __entry->status = status;
+ ),
+
+ TP_printk("scan_pfn=0x%lx, none_or_zero=%d, referenced=%d, writable=%d, status=%s",
+ __entry->pfn,
+ __entry->none_or_zero,
+ __entry->referenced,
+ __entry->writable,
+ __print_symbolic(__entry->status, SCAN_STATUS))
+);
+
+TRACE_EVENT(mm_collapse_huge_page_swapin,
+
+ TP_PROTO(struct mm_struct *mm, int swapped_in, int ret),
+
+ TP_ARGS(mm, swapped_in, ret),
+
+ TP_STRUCT__entry(
+ __field(struct mm_struct *, mm)
+ __field(int, swapped_in)
+ __field(int, ret)
+ ),
+
+ TP_fast_assign(
+ __entry->mm = mm;
+ __entry->swapped_in = swapped_in;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("mm=%p, swapped_in=%d, ret=%d",
+ __entry->mm,
+ __entry->swapped_in,
+ __entry->ret)
+);
+
+#endif /* __HUGE_MEMORY_H */
+#include <trace/define_trace.h>
+
diff --git a/include/trace/events/page_isolation.h b/include/trace/events/page_isolation.h
new file mode 100644
index 000000000000..6fb644029c80
--- /dev/null
+++ b/include/trace/events/page_isolation.h
@@ -0,0 +1,38 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM page_isolation
+
+#if !defined(_TRACE_PAGE_ISOLATION_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PAGE_ISOLATION_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(test_pages_isolated,
+
+ TP_PROTO(
+ unsigned long start_pfn,
+ unsigned long end_pfn,
+ unsigned long fin_pfn),
+
+ TP_ARGS(start_pfn, end_pfn, fin_pfn),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, start_pfn)
+ __field(unsigned long, end_pfn)
+ __field(unsigned long, fin_pfn)
+ ),
+
+ TP_fast_assign(
+ __entry->start_pfn = start_pfn;
+ __entry->end_pfn = end_pfn;
+ __entry->fin_pfn = fin_pfn;
+ ),
+
+ TP_printk("start_pfn=0x%lx end_pfn=0x%lx fin_pfn=0x%lx ret=%s",
+ __entry->start_pfn, __entry->end_pfn, __entry->fin_pfn,
+ __entry->end_pfn == __entry->fin_pfn ? "success" : "fail")
+);
+
+#endif /* _TRACE_PAGE_ISOLATION_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index f66476b96264..31763dd8db1c 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -330,10 +330,9 @@ DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_memcg_isolate,
TRACE_EVENT(mm_vmscan_writepage,
- TP_PROTO(struct page *page,
- int reclaim_flags),
+ TP_PROTO(struct page *page),
- TP_ARGS(page, reclaim_flags),
+ TP_ARGS(page),
TP_STRUCT__entry(
__field(unsigned long, pfn)
@@ -342,7 +341,7 @@ TRACE_EVENT(mm_vmscan_writepage,
TP_fast_assign(
__entry->pfn = page_to_pfn(page);
- __entry->reclaim_flags = reclaim_flags;
+ __entry->reclaim_flags = trace_reclaim_flags(page);
),
TP_printk("page=%p pfn=%lu flags=%s",
@@ -353,11 +352,11 @@ TRACE_EVENT(mm_vmscan_writepage,
TRACE_EVENT(mm_vmscan_lru_shrink_inactive,
- TP_PROTO(int nid, int zid,
- unsigned long nr_scanned, unsigned long nr_reclaimed,
- int priority, int reclaim_flags),
+ TP_PROTO(struct zone *zone,
+ unsigned long nr_scanned, unsigned long nr_reclaimed,
+ int priority, int file),
- TP_ARGS(nid, zid, nr_scanned, nr_reclaimed, priority, reclaim_flags),
+ TP_ARGS(zone, nr_scanned, nr_reclaimed, priority, file),
TP_STRUCT__entry(
__field(int, nid)
@@ -369,12 +368,12 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive,
),
TP_fast_assign(
- __entry->nid = nid;
- __entry->zid = zid;
+ __entry->nid = zone_to_nid(zone);
+ __entry->zid = zone_idx(zone);
__entry->nr_scanned = nr_scanned;
__entry->nr_reclaimed = nr_reclaimed;
__entry->priority = priority;
- __entry->reclaim_flags = reclaim_flags;
+ __entry->reclaim_flags = trace_shrink_flags(file);
),
TP_printk("nid=%d zid=%d nr_scanned=%ld nr_reclaimed=%ld priority=%d flags=%s",
diff --git a/init/Kconfig b/init/Kconfig
index f1af42de48f4..29be2ed87338 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -2103,13 +2103,6 @@ config INIT_ALL_POSSIBLE
it was better to provide this option than to break all the archs
and have several arch maintainers pursuing me down dark alleys.
-config STOP_MACHINE
- bool
- default y
- depends on (SMP && MODULE_UNLOAD) || HOTPLUG_CPU
- help
- Need stop_machine() primitive.
-
source "block/Kconfig"
config PREEMPT_NOTIFIERS
diff --git a/init/do_mounts.h b/init/do_mounts.h
index f5b978a9bb92..067af1d9e8b6 100644
--- a/init/do_mounts.h
+++ b/init/do_mounts.h
@@ -57,11 +57,11 @@ static inline int rd_load_image(char *from) { return 0; }
#ifdef CONFIG_BLK_DEV_INITRD
-int __init initrd_load(void);
+bool __init initrd_load(void);
#else
-static inline int initrd_load(void) { return 0; }
+static inline bool initrd_load(void) { return false; }
#endif
diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c
index 3e0878e8a80d..a1000ca29fc9 100644
--- a/init/do_mounts_initrd.c
+++ b/init/do_mounts_initrd.c
@@ -116,7 +116,7 @@ static void __init handle_initrd(void)
}
}
-int __init initrd_load(void)
+bool __init initrd_load(void)
{
if (mount_initrd) {
create_dev("/dev/ram", Root_RAM0);
@@ -129,9 +129,9 @@ int __init initrd_load(void)
if (rd_load_image("/initrd.image") && ROOT_DEV != Root_RAM0) {
sys_unlink("/initrd.image");
handle_initrd();
- return 1;
+ return true;
}
}
sys_unlink("/initrd.image");
- return 0;
+ return false;
}
diff --git a/init/main.c b/init/main.c
index 9e64d7097f1a..23449b7dfe7a 100644
--- a/init/main.c
+++ b/init/main.c
@@ -164,10 +164,10 @@ static const char *panic_later, *panic_param;
extern const struct obs_kernel_param __setup_start[], __setup_end[];
-static int __init obsolete_checksetup(char *line)
+static bool __init obsolete_checksetup(char *line)
{
const struct obs_kernel_param *p;
- int had_early_param = 0;
+ bool had_early_param = false;
p = __setup_start;
do {
@@ -179,13 +179,13 @@ static int __init obsolete_checksetup(char *line)
* Keep iterating, as we can have early
* params and __setups of same names 8( */
if (line[n] == '\0' || line[n] == '=')
- had_early_param = 1;
+ had_early_param = true;
} else if (!p->setup_func) {
pr_warn("Parameter %s is obsolete, ignored\n",
p->str);
- return 1;
+ return true;
} else if (p->setup_func(line + n))
- return 1;
+ return true;
}
p++;
} while (p < __setup_end);
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 161a1807e6ef..f4617cf07069 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -1438,7 +1438,7 @@ static int __init init_mqueue_fs(void)
mqueue_inode_cachep = kmem_cache_create("mqueue_inode_cache",
sizeof(struct mqueue_inode_info), 0,
- SLAB_HWCACHE_ALIGN, init_once);
+ SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, init_once);
if (mqueue_inode_cachep == NULL)
return -ENOMEM;
diff --git a/ipc/msg.c b/ipc/msg.c
index 1471db9a7e61..59559a215401 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -37,6 +37,7 @@
#include <linux/rwsem.h>
#include <linux/nsproxy.h>
#include <linux/ipc_namespace.h>
+#include <linux/freezer.h>
#include <asm/current.h>
#include <linux/uaccess.h>
@@ -675,7 +676,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
ipc_unlock_object(&msq->q_perm);
rcu_read_unlock();
- schedule();
+ freezable_schedule();
rcu_read_lock();
ipc_lock_object(&msq->q_perm);
@@ -917,7 +918,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl
ipc_unlock_object(&msq->q_perm);
rcu_read_unlock();
- schedule();
+ freezable_schedule();
/* Lockless receive, part 1:
* Disable preemption. We don't hold a reference to the queue
diff --git a/ipc/sem.c b/ipc/sem.c
index b471e5a3863d..cddd5b5fde51 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -1493,7 +1493,7 @@ out_rcu_wakeup:
wake_up_sem_queue_do(&tasks);
out_free:
if (sem_io != fast_sem_io)
- ipc_free(sem_io, sizeof(ushort)*nsems);
+ ipc_free(sem_io);
return err;
}
diff --git a/ipc/shm.c b/ipc/shm.c
index 41787276e141..ed3027d0f277 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -459,7 +459,7 @@ static const struct file_operations shm_file_operations_huge = {
.fallocate = shm_fallocate,
};
-int is_file_shm_hugepages(struct file *file)
+bool is_file_shm_hugepages(struct file *file)
{
return file->f_op == &shm_file_operations_huge;
}
diff --git a/ipc/util.c b/ipc/util.c
index 0f401d94b7c6..798cad18dd87 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -414,17 +414,12 @@ void *ipc_alloc(int size)
/**
* ipc_free - free ipc space
* @ptr: pointer returned by ipc_alloc
- * @size: size of block
*
- * Free a block created with ipc_alloc(). The caller must know the size
- * used in the allocation call.
+ * Free a block created with ipc_alloc().
*/
-void ipc_free(void *ptr, int size)
+void ipc_free(void *ptr)
{
- if (size > PAGE_SIZE)
- vfree(ptr);
- else
- kfree(ptr);
+ kvfree(ptr);
}
/**
diff --git a/ipc/util.h b/ipc/util.h
index 3a8a5a0eca62..51f7ca58ac67 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -118,7 +118,7 @@ int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flg);
* both function can sleep
*/
void *ipc_alloc(int size);
-void ipc_free(void *ptr, int size);
+void ipc_free(void *ptr);
/*
* For allocation that need to be freed by RCU.
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 85ff5e26e23b..5b9d39633ce9 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -759,71 +759,33 @@ const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
EXPORT_SYMBOL(cpu_all_bits);
#ifdef CONFIG_INIT_ALL_POSSIBLE
-static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly
- = CPU_BITS_ALL;
+struct cpumask __cpu_possible_mask __read_mostly
+ = {CPU_BITS_ALL};
#else
-static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly;
+struct cpumask __cpu_possible_mask __read_mostly;
#endif
-const struct cpumask *const cpu_possible_mask = to_cpumask(cpu_possible_bits);
-EXPORT_SYMBOL(cpu_possible_mask);
+EXPORT_SYMBOL(__cpu_possible_mask);
-static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly;
-const struct cpumask *const cpu_online_mask = to_cpumask(cpu_online_bits);
-EXPORT_SYMBOL(cpu_online_mask);
+struct cpumask __cpu_online_mask __read_mostly;
+EXPORT_SYMBOL(__cpu_online_mask);
-static DECLARE_BITMAP(cpu_present_bits, CONFIG_NR_CPUS) __read_mostly;
-const struct cpumask *const cpu_present_mask = to_cpumask(cpu_present_bits);
-EXPORT_SYMBOL(cpu_present_mask);
+struct cpumask __cpu_present_mask __read_mostly;
+EXPORT_SYMBOL(__cpu_present_mask);
-static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly;
-const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits);
-EXPORT_SYMBOL(cpu_active_mask);
-
-void set_cpu_possible(unsigned int cpu, bool possible)
-{
- if (possible)
- cpumask_set_cpu(cpu, to_cpumask(cpu_possible_bits));
- else
- cpumask_clear_cpu(cpu, to_cpumask(cpu_possible_bits));
-}
-
-void set_cpu_present(unsigned int cpu, bool present)
-{
- if (present)
- cpumask_set_cpu(cpu, to_cpumask(cpu_present_bits));
- else
- cpumask_clear_cpu(cpu, to_cpumask(cpu_present_bits));
-}
-
-void set_cpu_online(unsigned int cpu, bool online)
-{
- if (online) {
- cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
- cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
- } else {
- cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
- }
-}
-
-void set_cpu_active(unsigned int cpu, bool active)
-{
- if (active)
- cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
- else
- cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits));
-}
+struct cpumask __cpu_active_mask __read_mostly;
+EXPORT_SYMBOL(__cpu_active_mask);
void init_cpu_present(const struct cpumask *src)
{
- cpumask_copy(to_cpumask(cpu_present_bits), src);
+ cpumask_copy(&__cpu_present_mask, src);
}
void init_cpu_possible(const struct cpumask *src)
{
- cpumask_copy(to_cpumask(cpu_possible_bits), src);
+ cpumask_copy(&__cpu_possible_mask, src);
}
void init_cpu_online(const struct cpumask *src)
{
- cpumask_copy(to_cpumask(cpu_online_bits), src);
+ cpumask_copy(&__cpu_online_mask, src);
}
diff --git a/kernel/cred.c b/kernel/cred.c
index 71179a09c1d6..0c0cd8a62285 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -569,8 +569,8 @@ EXPORT_SYMBOL(revert_creds);
void __init cred_init(void)
{
/* allocate a slab in which we can store credentials */
- cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred),
- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
}
/**
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index ef90b04d783f..435c14a45118 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -34,7 +34,7 @@ __setup("nodelayacct", delayacct_setup_disable);
void delayacct_init(void)
{
- delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC);
+ delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT);
delayacct_tsk_init(&init_task);
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 7dad84913abf..0167679182c0 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -161,7 +161,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
const unsigned long mmun_end = addr + PAGE_SIZE;
struct mem_cgroup *memcg;
- err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg);
+ err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg,
+ false);
if (err)
return err;
@@ -175,12 +176,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
goto unlock;
get_page(kpage);
- page_add_new_anon_rmap(kpage, vma, addr);
- mem_cgroup_commit_charge(kpage, memcg, false);
+ page_add_new_anon_rmap(kpage, vma, addr, false);
+ mem_cgroup_commit_charge(kpage, memcg, false, false);
lru_cache_add_active_or_unevictable(kpage, vma);
if (!PageAnon(page)) {
- dec_mm_counter(mm, MM_FILEPAGES);
+ dec_mm_counter(mm, mm_counter_file(page));
inc_mm_counter(mm, MM_ANONPAGES);
}
@@ -188,7 +189,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
ptep_clear_flush_notify(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
- page_remove_rmap(page);
+ page_remove_rmap(page, false);
if (!page_mapped(page))
try_to_free_swap(page);
pte_unmap_unlock(ptep, ptl);
@@ -199,7 +200,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
err = 0;
unlock:
- mem_cgroup_cancel_charge(kpage, memcg);
+ mem_cgroup_cancel_charge(kpage, memcg, false);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
unlock_page(page);
return err;
diff --git a/kernel/exit.c b/kernel/exit.c
index 07110c6020a0..b0eea830303c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1120,8 +1120,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
static int *task_stopped_code(struct task_struct *p, bool ptrace)
{
if (ptrace) {
- if (task_is_stopped_or_traced(p) &&
- !(p->jobctl & JOBCTL_LISTENING))
+ if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING))
return &p->exit_code;
} else {
if (p->signal->flags & SIGNAL_STOP_STOPPED)
diff --git a/kernel/fork.c b/kernel/fork.c
index fce002ee3ddf..4800ab973083 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -300,9 +300,9 @@ void __init fork_init(void)
#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
#endif
/* create a slab on which task_structs can be allocated */
- task_struct_cachep =
- kmem_cache_create("task_struct", arch_task_struct_size,
- ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
+ task_struct_cachep = kmem_cache_create("task_struct",
+ arch_task_struct_size, ARCH_MIN_TASKALIGN,
+ SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL);
#endif
/* do the arch specific task caches init */
@@ -1848,16 +1848,19 @@ void __init proc_caches_init(void)
sighand_cachep = kmem_cache_create("sighand_cache",
sizeof(struct sighand_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|
- SLAB_NOTRACK, sighand_ctor);
+ SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor);
signal_cachep = kmem_cache_create("signal_cache",
sizeof(struct signal_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+ NULL);
files_cachep = kmem_cache_create("files_cache",
sizeof(struct files_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+ NULL);
fs_cachep = kmem_cache_create("fs_cache",
sizeof(struct fs_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+ NULL);
/*
* FIXME! The "sizeof(struct mm_struct)" currently includes the
* whole struct cpumask for the OFFSTACK case. We could change
@@ -1867,8 +1870,9 @@ void __init proc_caches_init(void)
*/
mm_cachep = kmem_cache_create("mm_struct",
sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
- vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+ NULL);
+ vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
mmap_init();
nsproxy_cache_init();
}
diff --git a/kernel/futex.c b/kernel/futex.c
index 684d7549825a..b29add22c454 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -469,7 +469,8 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
- struct page *page, *page_head;
+ struct page *page;
+ struct address_space *mapping;
int err, ro = 0;
/*
@@ -519,46 +520,9 @@ again:
else
err = 0;
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- page_head = page;
- if (unlikely(PageTail(page))) {
- put_page(page);
- /* serialize against __split_huge_page_splitting() */
- local_irq_disable();
- if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
- page_head = compound_head(page);
- /*
- * page_head is valid pointer but we must pin
- * it before taking the PG_lock and/or
- * PG_compound_lock. The moment we re-enable
- * irqs __split_huge_page_splitting() can
- * return and the head page can be freed from
- * under us. We can't take the PG_lock and/or
- * PG_compound_lock on a page that could be
- * freed from under us.
- */
- if (page != page_head) {
- get_page(page_head);
- put_page(page);
- }
- local_irq_enable();
- } else {
- local_irq_enable();
- goto again;
- }
- }
-#else
- page_head = compound_head(page);
- if (page != page_head) {
- get_page(page_head);
- put_page(page);
- }
-#endif
-
- lock_page(page_head);
-
+ lock_page(page);
/*
- * If page_head->mapping is NULL, then it cannot be a PageAnon
+ * If page->mapping is NULL, then it cannot be a PageAnon
* page; but it might be the ZERO_PAGE or in the gate area or
* in a special mapping (all cases which we are happy to fail);
* or it may have been a good file page when get_user_pages_fast
@@ -570,12 +534,13 @@ again:
*
* The case we do have to guard against is when memory pressure made
* shmem_writepage move it from filecache to swapcache beneath us:
- * an unlikely race, but we do need to retry for page_head->mapping.
+ * an unlikely race, but we do need to retry for page->mapping.
*/
- if (!page_head->mapping) {
- int shmem_swizzled = PageSwapCache(page_head);
- unlock_page(page_head);
- put_page(page_head);
+ mapping = compound_head(page)->mapping;
+ if (!mapping) {
+ int shmem_swizzled = PageSwapCache(page);
+ unlock_page(page);
+ put_page(page);
if (shmem_swizzled)
goto again;
return -EFAULT;
@@ -588,7 +553,7 @@ again:
* it's a read-only handle, it's expected that futexes attach to
* the object not the particular process.
*/
- if (PageAnon(page_head)) {
+ if (PageAnon(page)) {
/*
* A RO anonymous page will never change and thus doesn't make
* sense for futex operations.
@@ -603,15 +568,15 @@ again:
key->private.address = address;
} else {
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
- key->shared.inode = page_head->mapping->host;
+ key->shared.inode = mapping->host;
key->shared.pgoff = basepage_index(page);
}
get_futex_key_refs(key); /* implies MB (B) */
out:
- unlock_page(page_head);
- put_page(page_head);
+ unlock_page(page);
+ put_page(page);
return err;
}
diff --git a/kernel/pid.c b/kernel/pid.c
index 78b3d9f80d44..f4ad91b746f1 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -604,5 +604,5 @@ void __init pidmap_init(void)
atomic_dec(&init_pid_ns.pidmap[0].nr_free);
init_pid_ns.pid_cachep = KMEM_CACHE(pid,
- SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
}
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 2ce8826f1053..0c9f02506169 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -48,6 +48,7 @@
#include <linux/uio.h>
#include <asm/uaccess.h>
+#include <asm-generic/sections.h>
#define CREATE_TRACE_POINTS
#include <trace/events/printk.h>
@@ -2658,13 +2659,36 @@ int unregister_console(struct console *console)
}
EXPORT_SYMBOL(unregister_console);
+/*
+ * Some boot consoles access data that is in the init section and which will
+ * be discarded after the initcalls have been run. To make sure that no code
+ * will access this data, unregister the boot consoles in a late initcall.
+ *
+ * If for some reason, such as deferred probe or the driver being a loadable
+ * module, the real console hasn't registered yet at this point, there will
+ * be a brief interval in which no messages are logged to the console, which
+ * makes it difficult to diagnose problems that occur during this time.
+ *
+ * To mitigate this problem somewhat, only unregister consoles whose memory
+ * intersects with the init section. Note that code exists elsewhere to get
+ * rid of the boot console as soon as the proper console shows up, so there
+ * won't be side-effects from postponing the removal.
+ */
static int __init printk_late_init(void)
{
struct console *con;
for_each_console(con) {
if (!keep_bootcon && con->flags & CON_BOOT) {
- unregister_console(con);
+ /*
+ * Make sure to unregister boot consoles whose data
+ * resides in the init section before the init section
+ * is discarded. Boot consoles whose data will stick
+ * around will automatically be unregistered when the
+ * proper console replaces them.
+ */
+ if (init_section_intersects(con, sizeof(*con)))
+ unregister_console(con);
}
}
hotcpu_notifier(console_cpu_notify, 0);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index b760bae64cf1..aa94aee9d4c9 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -364,8 +364,14 @@ unlock_creds:
mutex_unlock(&task->signal->cred_guard_mutex);
out:
if (!retval) {
- wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT,
- TASK_UNINTERRUPTIBLE);
+ /*
+ * We do not bother to change retval or clear JOBCTL_TRAPPING
+ * if wait_on_bit() was interrupted by SIGKILL. The tracer will
+ * not return to user-mode, it will exit and clear this bit in
+ * __ptrace_unlink() if it wasn't already cleared by the tracee;
+ * and until then nobody can ptrace this task.
+ */
+ wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, TASK_KILLABLE);
proc_ptrace_connector(task, PTRACE_ATTACH);
}
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 61101193967e..edb6de4f5908 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -529,7 +529,7 @@ static int __init cpu_stop_init(void)
}
early_initcall(cpu_stop_init);
-#ifdef CONFIG_STOP_MACHINE
+#if defined(CONFIG_SMP) || defined(CONFIG_HOTPLUG_CPU)
static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
{
@@ -629,4 +629,4 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
return ret ?: done.ret;
}
-#endif /* CONFIG_STOP_MACHINE */
+#endif /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index dc6858d6639e..3fd5fd76d62c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1568,6 +1568,28 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
},
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
+ {
+ .procname = "mmap_rnd_bits",
+ .data = &mmap_rnd_bits,
+ .maxlen = sizeof(mmap_rnd_bits),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &mmap_rnd_bits_min,
+ .extra2 = &mmap_rnd_bits_max,
+ },
+#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
+ {
+ .procname = "mmap_rnd_compat_bits",
+ .data = &mmap_rnd_compat_bits,
+ .maxlen = sizeof(mmap_rnd_compat_bits),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &mmap_rnd_compat_bits_min,
+ .extra2 = &mmap_rnd_compat_bits_max,
+ },
+#endif
{ }
};
diff --git a/lib/Kconfig b/lib/Kconfig
index f0df318104e7..f7e64c7748cb 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -185,6 +185,13 @@ config CRC8
when they need to do cyclic redundancy check according CRC8
algorithm. Module will be called crc8.
+config CRC64_ECMA
+ tristate "CRC64 ECMA function"
+ help
+ This option provides CRC64 ECMA function. Drivers may select this
+ when they need to do cyclic redundancy check according to the CRC64
+ ECMA algorithm.
+
config AUDIT_GENERIC
bool
depends on AUDIT && !AUDIT_ARCH
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 765f3e016ec7..334a3c25f8f5 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -580,6 +580,14 @@ config DEBUG_VM_RB
If unsure, say N.
+config DEBUG_VM_PGFLAGS
+ bool "Debug page-flags operations"
+ depends on DEBUG_VM
+ help
+ Enables extra validation on page flags operations.
+
+ If unsure, say N.
+
config DEBUG_VIRTUAL
bool "Debug VM translations"
depends on DEBUG_KERNEL && X86
@@ -1578,7 +1586,6 @@ config FAULT_INJECTION_STACKTRACE_FILTER
config LATENCYTOP
bool "Latency measuring infrastructure"
- depends on HAVE_LATENCYTOP_SUPPORT
depends on DEBUG_KERNEL
depends on STACKTRACE_SUPPORT
depends on PROC_FS
diff --git a/lib/Makefile b/lib/Makefile
index 8394cdb62de8..67945a4ab8dc 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -32,7 +32,7 @@ obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
obj-y += string_helpers.o
obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
obj-y += hexdump.o
-obj-$(CONFIG_TEST_HEXDUMP) += test-hexdump.o
+obj-$(CONFIG_TEST_HEXDUMP) += test_hexdump.o
obj-y += kstrtox.o
obj-$(CONFIG_TEST_BPF) += test_bpf.o
obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o
@@ -84,6 +84,7 @@ obj-$(CONFIG_CRC32) += crc32.o
obj-$(CONFIG_CRC7) += crc7.o
obj-$(CONFIG_LIBCRC32C) += libcrc32c.o
obj-$(CONFIG_CRC8) += crc8.o
+obj-$(CONFIG_CRC64_ECMA) += crc64_ecma.o
obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
obj-$(CONFIG_842_COMPRESS) += 842/
diff --git a/lib/crc64_ecma.c b/lib/crc64_ecma.c
new file mode 100644
index 000000000000..41629ea5a60c
--- /dev/null
+++ b/lib/crc64_ecma.c
@@ -0,0 +1,341 @@
+/*
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <linux/crc64_ecma.h>
+
+
+#define CRC64_BYTE_MASK 0xFF
+#define CRC64_TABLE_SIZE 256
+
+
+struct crc64_table {
+ u64 seed;
+ u64 table[CRC64_TABLE_SIZE];
+};
+
+
+static struct crc64_table CRC64_ECMA_182 = {
+ CRC64_DEFAULT_INITVAL,
+ {
+ 0x0000000000000000ULL,
+ 0xb32e4cbe03a75f6fULL,
+ 0xf4843657a840a05bULL,
+ 0x47aa7ae9abe7ff34ULL,
+ 0x7bd0c384ff8f5e33ULL,
+ 0xc8fe8f3afc28015cULL,
+ 0x8f54f5d357cffe68ULL,
+ 0x3c7ab96d5468a107ULL,
+ 0xf7a18709ff1ebc66ULL,
+ 0x448fcbb7fcb9e309ULL,
+ 0x0325b15e575e1c3dULL,
+ 0xb00bfde054f94352ULL,
+ 0x8c71448d0091e255ULL,
+ 0x3f5f08330336bd3aULL,
+ 0x78f572daa8d1420eULL,
+ 0xcbdb3e64ab761d61ULL,
+ 0x7d9ba13851336649ULL,
+ 0xceb5ed8652943926ULL,
+ 0x891f976ff973c612ULL,
+ 0x3a31dbd1fad4997dULL,
+ 0x064b62bcaebc387aULL,
+ 0xb5652e02ad1b6715ULL,
+ 0xf2cf54eb06fc9821ULL,
+ 0x41e11855055bc74eULL,
+ 0x8a3a2631ae2dda2fULL,
+ 0x39146a8fad8a8540ULL,
+ 0x7ebe1066066d7a74ULL,
+ 0xcd905cd805ca251bULL,
+ 0xf1eae5b551a2841cULL,
+ 0x42c4a90b5205db73ULL,
+ 0x056ed3e2f9e22447ULL,
+ 0xb6409f5cfa457b28ULL,
+ 0xfb374270a266cc92ULL,
+ 0x48190ecea1c193fdULL,
+ 0x0fb374270a266cc9ULL,
+ 0xbc9d3899098133a6ULL,
+ 0x80e781f45de992a1ULL,
+ 0x33c9cd4a5e4ecdceULL,
+ 0x7463b7a3f5a932faULL,
+ 0xc74dfb1df60e6d95ULL,
+ 0x0c96c5795d7870f4ULL,
+ 0xbfb889c75edf2f9bULL,
+ 0xf812f32ef538d0afULL,
+ 0x4b3cbf90f69f8fc0ULL,
+ 0x774606fda2f72ec7ULL,
+ 0xc4684a43a15071a8ULL,
+ 0x83c230aa0ab78e9cULL,
+ 0x30ec7c140910d1f3ULL,
+ 0x86ace348f355aadbULL,
+ 0x3582aff6f0f2f5b4ULL,
+ 0x7228d51f5b150a80ULL,
+ 0xc10699a158b255efULL,
+ 0xfd7c20cc0cdaf4e8ULL,
+ 0x4e526c720f7dab87ULL,
+ 0x09f8169ba49a54b3ULL,
+ 0xbad65a25a73d0bdcULL,
+ 0x710d64410c4b16bdULL,
+ 0xc22328ff0fec49d2ULL,
+ 0x85895216a40bb6e6ULL,
+ 0x36a71ea8a7ace989ULL,
+ 0x0adda7c5f3c4488eULL,
+ 0xb9f3eb7bf06317e1ULL,
+ 0xfe5991925b84e8d5ULL,
+ 0x4d77dd2c5823b7baULL,
+ 0x64b62bcaebc387a1ULL,
+ 0xd7986774e864d8ceULL,
+ 0x90321d9d438327faULL,
+ 0x231c512340247895ULL,
+ 0x1f66e84e144cd992ULL,
+ 0xac48a4f017eb86fdULL,
+ 0xebe2de19bc0c79c9ULL,
+ 0x58cc92a7bfab26a6ULL,
+ 0x9317acc314dd3bc7ULL,
+ 0x2039e07d177a64a8ULL,
+ 0x67939a94bc9d9b9cULL,
+ 0xd4bdd62abf3ac4f3ULL,
+ 0xe8c76f47eb5265f4ULL,
+ 0x5be923f9e8f53a9bULL,
+ 0x1c4359104312c5afULL,
+ 0xaf6d15ae40b59ac0ULL,
+ 0x192d8af2baf0e1e8ULL,
+ 0xaa03c64cb957be87ULL,
+ 0xeda9bca512b041b3ULL,
+ 0x5e87f01b11171edcULL,
+ 0x62fd4976457fbfdbULL,
+ 0xd1d305c846d8e0b4ULL,
+ 0x96797f21ed3f1f80ULL,
+ 0x2557339fee9840efULL,
+ 0xee8c0dfb45ee5d8eULL,
+ 0x5da24145464902e1ULL,
+ 0x1a083bacedaefdd5ULL,
+ 0xa9267712ee09a2baULL,
+ 0x955cce7fba6103bdULL,
+ 0x267282c1b9c65cd2ULL,
+ 0x61d8f8281221a3e6ULL,
+ 0xd2f6b4961186fc89ULL,
+ 0x9f8169ba49a54b33ULL,
+ 0x2caf25044a02145cULL,
+ 0x6b055fede1e5eb68ULL,
+ 0xd82b1353e242b407ULL,
+ 0xe451aa3eb62a1500ULL,
+ 0x577fe680b58d4a6fULL,
+ 0x10d59c691e6ab55bULL,
+ 0xa3fbd0d71dcdea34ULL,
+ 0x6820eeb3b6bbf755ULL,
+ 0xdb0ea20db51ca83aULL,
+ 0x9ca4d8e41efb570eULL,
+ 0x2f8a945a1d5c0861ULL,
+ 0x13f02d374934a966ULL,
+ 0xa0de61894a93f609ULL,
+ 0xe7741b60e174093dULL,
+ 0x545a57dee2d35652ULL,
+ 0xe21ac88218962d7aULL,
+ 0x5134843c1b317215ULL,
+ 0x169efed5b0d68d21ULL,
+ 0xa5b0b26bb371d24eULL,
+ 0x99ca0b06e7197349ULL,
+ 0x2ae447b8e4be2c26ULL,
+ 0x6d4e3d514f59d312ULL,
+ 0xde6071ef4cfe8c7dULL,
+ 0x15bb4f8be788911cULL,
+ 0xa6950335e42fce73ULL,
+ 0xe13f79dc4fc83147ULL,
+ 0x521135624c6f6e28ULL,
+ 0x6e6b8c0f1807cf2fULL,
+ 0xdd45c0b11ba09040ULL,
+ 0x9aefba58b0476f74ULL,
+ 0x29c1f6e6b3e0301bULL,
+ 0xc96c5795d7870f42ULL,
+ 0x7a421b2bd420502dULL,
+ 0x3de861c27fc7af19ULL,
+ 0x8ec62d7c7c60f076ULL,
+ 0xb2bc941128085171ULL,
+ 0x0192d8af2baf0e1eULL,
+ 0x4638a2468048f12aULL,
+ 0xf516eef883efae45ULL,
+ 0x3ecdd09c2899b324ULL,
+ 0x8de39c222b3eec4bULL,
+ 0xca49e6cb80d9137fULL,
+ 0x7967aa75837e4c10ULL,
+ 0x451d1318d716ed17ULL,
+ 0xf6335fa6d4b1b278ULL,
+ 0xb199254f7f564d4cULL,
+ 0x02b769f17cf11223ULL,
+ 0xb4f7f6ad86b4690bULL,
+ 0x07d9ba1385133664ULL,
+ 0x4073c0fa2ef4c950ULL,
+ 0xf35d8c442d53963fULL,
+ 0xcf273529793b3738ULL,
+ 0x7c0979977a9c6857ULL,
+ 0x3ba3037ed17b9763ULL,
+ 0x888d4fc0d2dcc80cULL,
+ 0x435671a479aad56dULL,
+ 0xf0783d1a7a0d8a02ULL,
+ 0xb7d247f3d1ea7536ULL,
+ 0x04fc0b4dd24d2a59ULL,
+ 0x3886b22086258b5eULL,
+ 0x8ba8fe9e8582d431ULL,
+ 0xcc0284772e652b05ULL,
+ 0x7f2cc8c92dc2746aULL,
+ 0x325b15e575e1c3d0ULL,
+ 0x8175595b76469cbfULL,
+ 0xc6df23b2dda1638bULL,
+ 0x75f16f0cde063ce4ULL,
+ 0x498bd6618a6e9de3ULL,
+ 0xfaa59adf89c9c28cULL,
+ 0xbd0fe036222e3db8ULL,
+ 0x0e21ac88218962d7ULL,
+ 0xc5fa92ec8aff7fb6ULL,
+ 0x76d4de52895820d9ULL,
+ 0x317ea4bb22bfdfedULL,
+ 0x8250e80521188082ULL,
+ 0xbe2a516875702185ULL,
+ 0x0d041dd676d77eeaULL,
+ 0x4aae673fdd3081deULL,
+ 0xf9802b81de97deb1ULL,
+ 0x4fc0b4dd24d2a599ULL,
+ 0xfceef8632775faf6ULL,
+ 0xbb44828a8c9205c2ULL,
+ 0x086ace348f355aadULL,
+ 0x34107759db5dfbaaULL,
+ 0x873e3be7d8faa4c5ULL,
+ 0xc094410e731d5bf1ULL,
+ 0x73ba0db070ba049eULL,
+ 0xb86133d4dbcc19ffULL,
+ 0x0b4f7f6ad86b4690ULL,
+ 0x4ce50583738cb9a4ULL,
+ 0xffcb493d702be6cbULL,
+ 0xc3b1f050244347ccULL,
+ 0x709fbcee27e418a3ULL,
+ 0x3735c6078c03e797ULL,
+ 0x841b8ab98fa4b8f8ULL,
+ 0xadda7c5f3c4488e3ULL,
+ 0x1ef430e13fe3d78cULL,
+ 0x595e4a08940428b8ULL,
+ 0xea7006b697a377d7ULL,
+ 0xd60abfdbc3cbd6d0ULL,
+ 0x6524f365c06c89bfULL,
+ 0x228e898c6b8b768bULL,
+ 0x91a0c532682c29e4ULL,
+ 0x5a7bfb56c35a3485ULL,
+ 0xe955b7e8c0fd6beaULL,
+ 0xaeffcd016b1a94deULL,
+ 0x1dd181bf68bdcbb1ULL,
+ 0x21ab38d23cd56ab6ULL,
+ 0x9285746c3f7235d9ULL,
+ 0xd52f0e859495caedULL,
+ 0x6601423b97329582ULL,
+ 0xd041dd676d77eeaaULL,
+ 0x636f91d96ed0b1c5ULL,
+ 0x24c5eb30c5374ef1ULL,
+ 0x97eba78ec690119eULL,
+ 0xab911ee392f8b099ULL,
+ 0x18bf525d915feff6ULL,
+ 0x5f1528b43ab810c2ULL,
+ 0xec3b640a391f4fadULL,
+ 0x27e05a6e926952ccULL,
+ 0x94ce16d091ce0da3ULL,
+ 0xd3646c393a29f297ULL,
+ 0x604a2087398eadf8ULL,
+ 0x5c3099ea6de60cffULL,
+ 0xef1ed5546e415390ULL,
+ 0xa8b4afbdc5a6aca4ULL,
+ 0x1b9ae303c601f3cbULL,
+ 0x56ed3e2f9e224471ULL,
+ 0xe5c372919d851b1eULL,
+ 0xa26908783662e42aULL,
+ 0x114744c635c5bb45ULL,
+ 0x2d3dfdab61ad1a42ULL,
+ 0x9e13b115620a452dULL,
+ 0xd9b9cbfcc9edba19ULL,
+ 0x6a978742ca4ae576ULL,
+ 0xa14cb926613cf817ULL,
+ 0x1262f598629ba778ULL,
+ 0x55c88f71c97c584cULL,
+ 0xe6e6c3cfcadb0723ULL,
+ 0xda9c7aa29eb3a624ULL,
+ 0x69b2361c9d14f94bULL,
+ 0x2e184cf536f3067fULL,
+ 0x9d36004b35545910ULL,
+ 0x2b769f17cf112238ULL,
+ 0x9858d3a9ccb67d57ULL,
+ 0xdff2a94067518263ULL,
+ 0x6cdce5fe64f6dd0cULL,
+ 0x50a65c93309e7c0bULL,
+ 0xe388102d33392364ULL,
+ 0xa4226ac498dedc50ULL,
+ 0x170c267a9b79833fULL,
+ 0xdcd7181e300f9e5eULL,
+ 0x6ff954a033a8c131ULL,
+ 0x28532e49984f3e05ULL,
+ 0x9b7d62f79be8616aULL,
+ 0xa707db9acf80c06dULL,
+ 0x14299724cc279f02ULL,
+ 0x5383edcd67c06036ULL,
+ 0xe0ada17364673f59ULL
+ }
+};
+
+
+/*
+ * crc64_ecma_seed - Initializes the CRC64 ECMA seed.
+ */
+u64 crc64_ecma_seed(void)
+{
+ return CRC64_ECMA_182.seed;
+}
+EXPORT_SYMBOL(crc64_ecma_seed);
+
+/*
+ * crc64_ecma - Computes the 64 bit ECMA CRC.
+ *
+ * pdata: pointer to the data to compute checksum for.
+ * nbytes: number of bytes in data buffer.
+ * seed: CRC seed.
+ */
+u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed)
+{
+ unsigned int i;
+ u64 crc = seed;
+
+ for (i = 0; i < nbytes; i++)
+ crc = CRC64_ECMA_182.table[(crc ^ pdata[i]) & CRC64_BYTE_MASK] ^
+ (crc >> 8);
+
+ return crc;
+}
+EXPORT_SYMBOL(crc64_ecma);
+
+MODULE_DESCRIPTION("CRC64 ECMA function");
+MODULE_AUTHOR("Freescale Semiconductor Inc.");
+MODULE_LICENSE("GPL");
diff --git a/lib/iomap_copy.c b/lib/iomap_copy.c
index 4527e751b5e0..b8f1d6cbb200 100644
--- a/lib/iomap_copy.c
+++ b/lib/iomap_copy.c
@@ -42,6 +42,27 @@ void __attribute__((weak)) __iowrite32_copy(void __iomem *to,
EXPORT_SYMBOL_GPL(__iowrite32_copy);
/**
+ * __ioread32_copy - copy data from MMIO space, in 32-bit units
+ * @to: destination (must be 32-bit aligned)
+ * @from: source, in MMIO space (must be 32-bit aligned)
+ * @count: number of 32-bit quantities to copy
+ *
+ * Copy data from MMIO space to kernel space, in units of 32 bits at a
+ * time. Order of access is not guaranteed, nor is a memory barrier
+ * performed afterwards.
+ */
+void __ioread32_copy(void *to, const void __iomem *from, size_t count)
+{
+ u32 *dst = to;
+ const u32 __iomem *src = from;
+ const u32 __iomem *end = src + count;
+
+ while (src < end)
+ *dst++ = __raw_readl(src++);
+}
+EXPORT_SYMBOL_GPL(__ioread32_copy);
+
+/**
* __iowrite64_copy - copy data to MMIO space, in 64-bit or 32-bit units
* @to: destination, in MMIO space (must be 64-bit aligned)
* @from: source (must be 64-bit aligned)
diff --git a/lib/string_helpers.c b/lib/string_helpers.c
index 5939f63d90cd..5c88204b6f1f 100644
--- a/lib/string_helpers.c
+++ b/lib/string_helpers.c
@@ -43,50 +43,73 @@ void string_get_size(u64 size, u64 blk_size, const enum string_size_units units,
[STRING_UNITS_10] = 1000,
[STRING_UNITS_2] = 1024,
};
- int i, j;
- u32 remainder = 0, sf_cap, exp;
+ static const unsigned int rounding[] = { 500, 50, 5 };
+ int i = 0, j;
+ u32 remainder = 0, sf_cap;
char tmp[8];
const char *unit;
tmp[0] = '\0';
- i = 0;
- if (!size)
+
+ if (blk_size == 0)
+ size = 0;
+ if (size == 0)
goto out;
- while (blk_size >= divisor[units]) {
- remainder = do_div(blk_size, divisor[units]);
+ /* This is Napier's algorithm. Reduce the original block size to
+ *
+ * coefficient * divisor[units]^i
+ *
+ * we do the reduction so both coefficients are just under 32 bits so
+ * that multiplying them together won't overflow 64 bits and we keep
+ * as much precision as possible in the numbers.
+ *
+ * Note: it's safe to throw away the remainders here because all the
+ * precision is in the coefficients.
+ */
+ while (blk_size >> 32) {
+ do_div(blk_size, divisor[units]);
i++;
}
- exp = divisor[units] / (u32)blk_size;
- /*
- * size must be strictly greater than exp here to ensure that remainder
- * is greater than divisor[units] coming out of the if below.
- */
- if (size > exp) {
- remainder = do_div(size, divisor[units]);
- remainder *= blk_size;
+ while (size >> 32) {
+ do_div(size, divisor[units]);
i++;
- } else {
- remainder *= size;
}
+ /* now perform the actual multiplication keeping i as the sum of the
+ * two logarithms */
size *= blk_size;
- size += remainder / divisor[units];
- remainder %= divisor[units];
+ /* and logarithmically reduce it until it's just under the divisor */
while (size >= divisor[units]) {
remainder = do_div(size, divisor[units]);
i++;
}
+ /* work out in j how many digits of precision we need from the
+ * remainder */
sf_cap = size;
for (j = 0; sf_cap*10 < 1000; j++)
sf_cap *= 10;
- if (j) {
+ if (units == STRING_UNITS_2) {
+ /* express the remainder as a decimal. It's currently the
+ * numerator of a fraction whose denominator is
+ * divisor[units], which is 1 << 10 for STRING_UNITS_2 */
remainder *= 1000;
- remainder /= divisor[units];
+ remainder >>= 10;
+ }
+
+ /* add a 5 to the digit below what will be printed to ensure
+ * an arithmetical round up and carry it through to size */
+ remainder += rounding[j];
+ if (remainder >= 1000) {
+ remainder -= 1000;
+ size += 1;
+ }
+
+ if (j) {
snprintf(tmp, sizeof(tmp), ".%03u", remainder);
tmp[j+1] = '\0';
}
diff --git a/lib/test-hexdump.c b/lib/test_hexdump.c
index 5241df36eedf..b69341be7a85 100644
--- a/lib/test-hexdump.c
+++ b/lib/test_hexdump.c
@@ -42,19 +42,19 @@ static const char * const test_data_8_le[] __initconst = {
"e9ac0f9cad319ca6", "0cafb1439919d14c",
};
-static void __init test_hexdump(size_t len, int rowsize, int groupsize,
- bool ascii)
+static unsigned total_tests __initdata;
+static unsigned failed_tests __initdata;
+
+static void __init test_hexdump_prepare_test(size_t len, int rowsize,
+ int groupsize, char *test,
+ size_t testlen, bool ascii)
{
- char test[32 * 3 + 2 + 32 + 1];
- char real[32 * 3 + 2 + 32 + 1];
char *p;
const char * const *result;
size_t l = len;
int gs = groupsize, rs = rowsize;
unsigned int i;
- hex_dump_to_buffer(data_b, l, rs, gs, real, sizeof(real), ascii);
-
if (rs != 16 && rs != 32)
rs = 16;
@@ -73,7 +73,7 @@ static void __init test_hexdump(size_t len, int rowsize, int groupsize,
else
result = test_data_1_le;
- memset(test, ' ', sizeof(test));
+ memset(test, ' ', testlen);
/* hex dump */
p = test;
@@ -95,11 +95,29 @@ static void __init test_hexdump(size_t len, int rowsize, int groupsize,
}
*p = '\0';
+}
+
+#define TEST_HEXDUMP_BUF_SIZE (32 * 3 + 2 + 32 + 1)
+
+static void __init test_hexdump(size_t len, int rowsize, int groupsize,
+ bool ascii)
+{
+ char test[TEST_HEXDUMP_BUF_SIZE];
+ char real[TEST_HEXDUMP_BUF_SIZE];
+
+ total_tests++;
+
+ hex_dump_to_buffer(data_b, len, rowsize, groupsize, real, sizeof(real),
+ ascii);
+
+ test_hexdump_prepare_test(len, rowsize, groupsize, test, sizeof(test),
+ ascii);
if (strcmp(test, real)) {
pr_err("Len: %zu row: %d group: %d\n", len, rowsize, groupsize);
pr_err("Result: '%s'\n", real);
pr_err("Expect: '%s'\n", test);
+ failed_tests++;
}
}
@@ -114,52 +132,71 @@ static void __init test_hexdump_set(int rowsize, bool ascii)
test_hexdump(len, rowsize, 1, ascii);
}
-static void __init test_hexdump_overflow(bool ascii)
+static void __init test_hexdump_overflow(size_t buflen, size_t len,
+ int rowsize, int groupsize,
+ bool ascii)
{
- char buf[56];
- const char *t = test_data_1_le[0];
- size_t l = get_random_int() % sizeof(buf);
+ char test[TEST_HEXDUMP_BUF_SIZE];
+ char buf[TEST_HEXDUMP_BUF_SIZE];
+ int rs = rowsize, gs = groupsize;
+ int ae, he, e, r;
bool a;
- int e, r;
+
+ total_tests++;
memset(buf, ' ', sizeof(buf));
- r = hex_dump_to_buffer(data_b, 1, 16, 1, buf, l, ascii);
+ r = hex_dump_to_buffer(data_b, len, rs, gs, buf, buflen, ascii);
+
+ ae = rs * 2 /* hex */ + rs / gs /* spaces */ + 1 /* space */ + len /* ascii */;
+ he = (gs * 2 /* hex */ + 1 /* space */) * len / gs - 1 /* no trailing space */;
if (ascii)
- e = 50;
+ e = ae;
else
- e = 2;
- buf[e + 2] = '\0';
-
- if (!l) {
- a = r == e && buf[0] == ' ';
- } else if (l < 3) {
- a = r == e && buf[0] == '\0';
- } else if (l < 4) {
- a = r == e && !strcmp(buf, t);
- } else if (ascii) {
- if (l < 51)
- a = r == e && buf[l - 1] == '\0' && buf[l - 2] == ' ';
- else
- a = r == e && buf[50] == '\0' && buf[49] == '.';
+ e = he;
+
+ if (!buflen) {
+ memset(test, ' ', sizeof(test));
+ test[sizeof(buf) - 1] = '\0';
+
+ a = r == e && !memchr_inv(buf, ' ', sizeof(buf));
} else {
- a = r == e && buf[e] == '\0';
+ int f = min_t(int, e + 1, buflen);
+
+ test_hexdump_prepare_test(len, rs, gs, test, sizeof(test), ascii);
+ test[f - 1] = '\0';
+
+ a = r == e && !memchr_inv(buf + f, ' ', sizeof(buf) - f) && !strcmp(buf, test);
}
+ buf[sizeof(buf) - 1] = '\0';
+
if (!a) {
- pr_err("Len: %zu rc: %u strlen: %zu\n", l, r, strlen(buf));
- pr_err("Result: '%s'\n", buf);
+ pr_err("Len: %zu buflen: %zu strlen: %zu\n", len, buflen, strlen(buf));
+ pr_err("Result: %d '%s'\n", r, buf);
+ pr_err("Expect: %d '%s'\n", e, test);
+ failed_tests++;
}
}
+static void __init test_hexdump_overflow_set(size_t buflen, bool ascii)
+{
+ unsigned int i = 0;
+
+ do {
+ int gs = 1 << i;
+ size_t len = get_random_int() % 16 + gs;
+
+ test_hexdump_overflow(buflen, rounddown(len, gs), 16, gs, ascii);
+ } while (i++ < 3);
+}
+
static int __init test_hexdump_init(void)
{
unsigned int i;
int rowsize;
- pr_info("Running tests...\n");
-
rowsize = (get_random_int() % 2 + 1) * 16;
for (i = 0; i < 16; i++)
test_hexdump_set(rowsize, false);
@@ -168,13 +205,20 @@ static int __init test_hexdump_init(void)
for (i = 0; i < 16; i++)
test_hexdump_set(rowsize, true);
- for (i = 0; i < 16; i++)
- test_hexdump_overflow(false);
+ for (i = 0; i <= TEST_HEXDUMP_BUF_SIZE; i++)
+ test_hexdump_overflow_set(i, false);
- for (i = 0; i < 16; i++)
- test_hexdump_overflow(true);
+ for (i = 0; i <= TEST_HEXDUMP_BUF_SIZE; i++)
+ test_hexdump_overflow_set(i, true);
- return -EINVAL;
+ if (failed_tests == 0)
+ pr_info("all %u tests passed\n", total_tests);
+ else
+ pr_err("failed %u out of %u tests\n", failed_tests, total_tests);
+
+ return failed_tests ? -EINVAL : 0;
}
module_init(test_hexdump_init);
+
+MODULE_AUTHOR("Andy Shevchenko <andriy.shevchenko@linux.intel.com>");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 8ed2ffd963c5..7340353f8aea 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -957,8 +957,9 @@ EXPORT_SYMBOL(congestion_wait);
* jiffies for either a BDI to exit congestion of the given @sync queue
* or a write to complete.
*
- * In the absence of zone congestion, cond_resched() is called to yield
- * the processor if necessary but otherwise does not sleep.
+ * In the absence of zone congestion, a short sleep or a cond_resched is
+ * performed to yield the processor and to allow other subsystems to make
+ * a forward progress.
*
* The return value is 0 if the sleep is for the full timeout. Otherwise,
* it is the number of jiffies that were still remaining when the function
@@ -978,7 +979,19 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
*/
if (atomic_read(&nr_wb_congested[sync]) == 0 ||
!test_bit(ZONE_CONGESTED, &zone->flags)) {
- cond_resched();
+
+ /*
+ * Memory allocation/reclaim might be called from a WQ
+ * context and the current implementation of the WQ
+ * concurrency control doesn't recognize that a particular
+ * WQ is congested if the worker thread is looping without
+ * ever sleeping. Therefore we have to do a short sleep
+ * here rather than calling cond_resched().
+ */
+ if (current->flags & PF_WQ_WORKER)
+ schedule_timeout(1);
+ else
+ cond_resched();
/* In case we scheduled, work out time remaining */
ret = timeout - (jiffies - start);
diff --git a/mm/compaction.c b/mm/compaction.c
index de3e1e71cd9f..ac6c6943d2ce 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1708,7 +1708,10 @@ static void compact_nodes(void)
/* The written value is actually unused, all memory is compacted */
int sysctl_compact_memory;
-/* This is the entry point for compacting all nodes via /proc/sys/vm */
+/*
+ * This is the entry point for compacting all nodes via
+ * /proc/sys/vm/compact_memory
+ */
int sysctl_compaction_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
diff --git a/mm/debug.c b/mm/debug.c
index 668aa35191ca..c3300ee89bc9 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -9,6 +9,8 @@
#include <linux/mm.h>
#include <linux/trace_events.h>
#include <linux/memcontrol.h>
+#include <trace/events/gfpflags.h>
+#include <linux/page_owner.h>
static const struct trace_print_flags pageflag_names[] = {
{1UL << PG_locked, "locked" },
@@ -40,26 +42,24 @@ static const struct trace_print_flags pageflag_names[] = {
#ifdef CONFIG_MEMORY_FAILURE
{1UL << PG_hwpoison, "hwpoison" },
#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- {1UL << PG_compound_lock, "compound_lock" },
-#endif
#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
{1UL << PG_young, "young" },
{1UL << PG_idle, "idle" },
#endif
};
-static void dump_flags(unsigned long flags,
+static const struct trace_print_flags gfpflag_names[] = {
+ __def_gfpflag_names
+};
+
+static void dump_flag_names(unsigned long flags,
const struct trace_print_flags *names, int count)
{
const char *delim = "";
unsigned long mask;
int i;
- pr_emerg("flags: %#lx(", flags);
-
- /* remove zone id */
- flags &= (1UL << NR_PAGEFLAGS) - 1;
+ pr_cont("(");
for (i = 0; i < count && flags; i++) {
@@ -79,20 +79,36 @@ static void dump_flags(unsigned long flags,
pr_cont(")\n");
}
+void dump_gfpflag_names(unsigned long gfp_flags)
+{
+ dump_flag_names(gfp_flags, gfpflag_names, ARRAY_SIZE(gfpflag_names));
+}
+
void dump_page_badflags(struct page *page, const char *reason,
unsigned long badflags)
{
- pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
+ unsigned long printflags = page->flags;
+
+ pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx",
page, atomic_read(&page->_count), page_mapcount(page),
page->mapping, page->index);
+ if (PageCompound(page))
+ pr_cont(" compound_mapcount: %d", compound_mapcount(page));
+ pr_cont("\n");
BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
- dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names));
+
+ pr_emerg("flags: %#lx", printflags);
+ /* remove zone id */
+ printflags &= (1UL << NR_PAGEFLAGS) - 1;
+ dump_flag_names(printflags, pageflag_names, ARRAY_SIZE(pageflag_names));
+
if (reason)
pr_alert("page dumped because: %s\n", reason);
if (page->flags & badflags) {
- pr_alert("bad because of flags:\n");
- dump_flags(page->flags & badflags,
- pageflag_names, ARRAY_SIZE(pageflag_names));
+ printflags = page->flags & badflags;
+ pr_alert("bad because of flags: %#lx:", printflags);
+ dump_flag_names(printflags, pageflag_names,
+ ARRAY_SIZE(pageflag_names));
}
#ifdef CONFIG_MEMCG
if (page->mem_cgroup)
@@ -103,6 +119,7 @@ void dump_page_badflags(struct page *page, const char *reason,
void dump_page(struct page *page, const char *reason)
{
dump_page_badflags(page, reason, 0);
+ dump_page_owner(page);
}
EXPORT_SYMBOL(dump_page);
@@ -162,7 +179,9 @@ void dump_vma(const struct vm_area_struct *vma)
(unsigned long)pgprot_val(vma->vm_page_prot),
vma->anon_vma, vma->vm_ops, vma->vm_pgoff,
vma->vm_file, vma->vm_private_data);
- dump_flags(vma->vm_flags, vmaflags_names, ARRAY_SIZE(vmaflags_names));
+ pr_emerg("flags: %#lx", vma->vm_flags);
+ dump_flag_names(vma->vm_flags, vmaflags_names,
+ ARRAY_SIZE(vmaflags_names));
}
EXPORT_SYMBOL(dump_vma);
@@ -233,8 +252,9 @@ void dump_mm(const struct mm_struct *mm)
"" /* This is here to not have a comma! */
);
- dump_flags(mm->def_flags, vmaflags_names,
- ARRAY_SIZE(vmaflags_names));
+ pr_emerg("def_flags: %#lx(", mm->def_flags);
+ dump_flag_names(mm->def_flags, vmaflags_names,
+ ARRAY_SIZE(vmaflags_names));
}
#endif /* CONFIG_DEBUG_VM */
diff --git a/mm/filemap.c b/mm/filemap.c
index 1bb007624b53..834cd1425307 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -204,7 +204,7 @@ void __delete_from_page_cache(struct page *page, void *shadow,
__dec_zone_page_state(page, NR_FILE_PAGES);
if (PageSwapBacked(page))
__dec_zone_page_state(page, NR_SHMEM);
- BUG_ON(page_mapped(page));
+ VM_BUG_ON_PAGE(page_mapped(page), page);
/*
* At this point page must be either written or cleaned by truncate.
@@ -618,7 +618,7 @@ static int __add_to_page_cache_locked(struct page *page,
if (!huge) {
error = mem_cgroup_try_charge(page, current->mm,
- gfp_mask, &memcg);
+ gfp_mask, &memcg, false);
if (error)
return error;
}
@@ -626,7 +626,7 @@ static int __add_to_page_cache_locked(struct page *page,
error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
if (error) {
if (!huge)
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
return error;
}
@@ -645,7 +645,7 @@ static int __add_to_page_cache_locked(struct page *page,
__inc_zone_page_state(page, NR_FILE_PAGES);
spin_unlock_irq(&mapping->tree_lock);
if (!huge)
- mem_cgroup_commit_charge(page, memcg, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
trace_mm_filemap_add_to_page_cache(page);
return 0;
err_insert:
@@ -653,7 +653,7 @@ err_insert:
/* Leave page->index set: truncation relies upon it */
spin_unlock_irq(&mapping->tree_lock);
if (!huge)
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
page_cache_release(page);
return error;
}
@@ -682,11 +682,11 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
void *shadow = NULL;
int ret;
- __set_page_locked(page);
+ __SetPageLocked(page);
ret = __add_to_page_cache_locked(page, mapping, offset,
gfp_mask, &shadow);
if (unlikely(ret))
- __clear_page_locked(page);
+ __ClearPageLocked(page);
else {
/*
* The page might have been evicted from cache only
@@ -809,6 +809,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue);
*/
void unlock_page(struct page *page)
{
+ page = compound_head(page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
clear_bit_unlock(PG_locked, &page->flags);
smp_mb__after_atomic();
@@ -873,18 +874,20 @@ EXPORT_SYMBOL_GPL(page_endio);
*/
void __lock_page(struct page *page)
{
- DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+ struct page *page_head = compound_head(page);
+ DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
- __wait_on_bit_lock(page_waitqueue(page), &wait, bit_wait_io,
+ __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io,
TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_page);
int __lock_page_killable(struct page *page)
{
- DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+ struct page *page_head = compound_head(page);
+ DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
- return __wait_on_bit_lock(page_waitqueue(page), &wait,
+ return __wait_on_bit_lock(page_waitqueue(page_head), &wait,
bit_wait_io, TASK_KILLABLE);
}
EXPORT_SYMBOL_GPL(__lock_page_killable);
diff --git a/mm/gup.c b/mm/gup.c
index deafa2c91b36..e95b0cb6ed81 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -116,8 +116,21 @@ retry:
}
}
+ if (flags & FOLL_SPLIT && PageTransCompound(page)) {
+ int ret;
+ get_page(page);
+ pte_unmap_unlock(ptep, ptl);
+ lock_page(page);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ if (ret)
+ return ERR_PTR(ret);
+ goto retry;
+ }
+
if (flags & FOLL_GET)
- get_page_foll(page);
+ get_page(page);
if (flags & FOLL_TOUCH) {
if ((flags & FOLL_WRITE) &&
!pte_dirty(pte) && !PageDirty(page))
@@ -130,6 +143,10 @@ retry:
mark_page_accessed(page);
}
if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+ /* Do not mlock pte-mapped THP */
+ if (PageTransCompound(page))
+ goto out;
+
/*
* The preliminary mapping check is mainly to avoid the
* pointless overhead of lock_page on the ZERO_PAGE
@@ -220,27 +237,38 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
}
if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
return no_page_table(vma, flags);
- if (pmd_trans_huge(*pmd)) {
- if (flags & FOLL_SPLIT) {
- split_huge_page_pmd(vma, address, pmd);
- return follow_page_pte(vma, address, pmd, flags);
- }
- ptl = pmd_lock(mm, pmd);
- if (likely(pmd_trans_huge(*pmd))) {
- if (unlikely(pmd_trans_splitting(*pmd))) {
- spin_unlock(ptl);
- wait_split_huge_page(vma->anon_vma, pmd);
- } else {
- page = follow_trans_huge_pmd(vma, address,
- pmd, flags);
- spin_unlock(ptl);
- *page_mask = HPAGE_PMD_NR - 1;
- return page;
- }
- } else
+ if (likely(!pmd_trans_huge(*pmd)))
+ return follow_page_pte(vma, address, pmd, flags);
+
+ ptl = pmd_lock(mm, pmd);
+ if (unlikely(!pmd_trans_huge(*pmd))) {
+ spin_unlock(ptl);
+ return follow_page_pte(vma, address, pmd, flags);
+ }
+ if (flags & FOLL_SPLIT) {
+ int ret;
+ page = pmd_page(*pmd);
+ if (is_huge_zero_page(page)) {
+ spin_unlock(ptl);
+ ret = 0;
+ split_huge_pmd(vma, pmd, address);
+ } else {
+ get_page(page);
spin_unlock(ptl);
+ lock_page(page);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ }
+
+ return ret ? ERR_PTR(ret) :
+ follow_page_pte(vma, address, pmd, flags);
}
- return follow_page_pte(vma, address, pmd, flags);
+
+ page = follow_trans_huge_pmd(vma, address, pmd, flags);
+ spin_unlock(ptl);
+ *page_mask = HPAGE_PMD_NR - 1;
+ return page;
}
static int get_gate_page(struct mm_struct *mm, unsigned long address,
@@ -896,7 +924,6 @@ long populate_vma_page_range(struct vm_area_struct *vma,
gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
if (vma->vm_flags & VM_LOCKONFAULT)
gup_flags &= ~FOLL_POPULATE;
-
/*
* We want to touch writable mappings with a write fault in order
* to break COW, except for shared mappings because these don't COW
@@ -1036,9 +1063,6 @@ struct page *get_dump_page(unsigned long addr)
* *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
* pages containing page tables.
*
- * *) THP splits will broadcast an IPI, this can be achieved by overriding
- * pmdp_splitting_flush.
- *
* *) ptes can be read atomically by the architecture.
*
* *) access_ok is sufficient to validate userspace address ranges.
@@ -1066,7 +1090,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
* for an example see gup_get_pte in arch/x86/mm/gup.c
*/
pte_t pte = READ_ONCE(*ptep);
- struct page *page;
+ struct page *head, *page;
/*
* Similar to the PMD case below, NUMA hinting must take slow
@@ -1078,15 +1102,17 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
+ head = compound_head(page);
- if (!page_cache_get_speculative(page))
+ if (!page_cache_get_speculative(head))
goto pte_unmap;
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
- put_page(page);
+ put_page(head);
goto pte_unmap;
}
+ VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
(*nr)++;
@@ -1119,7 +1145,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
- struct page *head, *page, *tail;
+ struct page *head, *page;
int refs;
if (write && !pmd_write(orig))
@@ -1128,7 +1154,6 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
refs = 0;
head = pmd_page(orig);
page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- tail = page;
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
@@ -1149,24 +1174,13 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
return 0;
}
- /*
- * Any tail pages need their mapcount reference taken before we
- * return. (This allows the THP code to bump their ref count when
- * they are split into base pages).
- */
- while (refs--) {
- if (PageTail(tail))
- get_huge_page_tail(tail);
- tail++;
- }
-
return 1;
}
static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
- struct page *head, *page, *tail;
+ struct page *head, *page;
int refs;
if (write && !pud_write(orig))
@@ -1175,7 +1189,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
refs = 0;
head = pud_page(orig);
page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- tail = page;
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
@@ -1196,12 +1209,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
return 0;
}
- while (refs--) {
- if (PageTail(tail))
- get_huge_page_tail(tail);
- tail++;
- }
-
return 1;
}
@@ -1210,7 +1217,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
struct page **pages, int *nr)
{
int refs;
- struct page *head, *page, *tail;
+ struct page *head, *page;
if (write && !pgd_write(orig))
return 0;
@@ -1218,7 +1225,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
refs = 0;
head = pgd_page(orig);
page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
- tail = page;
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
@@ -1239,12 +1245,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
return 0;
}
- while (refs--) {
- if (PageTail(tail))
- get_huge_page_tail(tail);
- tail++;
- }
-
return 1;
}
@@ -1259,7 +1259,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
pmd_t pmd = READ_ONCE(*pmdp);
next = pmd_addr_end(addr, end);
- if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+ if (pmd_none(pmd))
return 0;
if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 62fe06bb7d04..b41793b12a2d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -26,11 +26,41 @@
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
+#include <linux/swapops.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
+enum scan_result {
+ SCAN_FAIL,
+ SCAN_SUCCEED,
+ SCAN_PMD_NULL,
+ SCAN_EXCEED_NONE_PTE,
+ SCAN_PTE_NON_PRESENT,
+ SCAN_PAGE_RO,
+ SCAN_NO_REFERENCED_PAGE,
+ SCAN_PAGE_NULL,
+ SCAN_SCAN_ABORT,
+ SCAN_PAGE_COUNT,
+ SCAN_PAGE_LRU,
+ SCAN_PAGE_LOCK,
+ SCAN_PAGE_ANON,
+ SCAN_PAGE_COMPOUND,
+ SCAN_ANY_PROCESS,
+ SCAN_VMA_NULL,
+ SCAN_VMA_CHECK,
+ SCAN_ADDRESS_RANGE,
+ SCAN_SWAP_CACHE_PAGE,
+ SCAN_DEL_PAGE_LRU,
+ SCAN_ALLOC_HUGE_PAGE_FAIL,
+ SCAN_CGROUP_CHARGE_FAIL,
+ SCAN_EXCEED_SWAP_PTE
+};
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/huge_memory.h>
+
/*
* By default transparent hugepage support is disabled in order that avoid
* to risk increase the memory footprint of applications without a guaranteed
@@ -67,6 +97,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
* fault.
*/
static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
+static unsigned int khugepaged_max_ptes_swap __read_mostly = HPAGE_PMD_NR/8;
static int khugepaged(void *none);
static int khugepaged_slab_init(void);
@@ -106,6 +137,10 @@ static struct khugepaged_scan khugepaged_scan = {
.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
};
+static DEFINE_SPINLOCK(split_queue_lock);
+static LIST_HEAD(split_queue);
+static unsigned long split_queue_len;
+static struct shrinker deferred_split_shrinker;
static void set_recommended_min_free_kbytes(void)
{
@@ -553,6 +588,33 @@ static struct kobj_attribute khugepaged_max_ptes_none_attr =
__ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
khugepaged_max_ptes_none_store);
+static ssize_t khugepaged_max_ptes_swap_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_max_ptes_swap);
+}
+
+static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ unsigned long max_ptes_swap;
+
+ err = kstrtoul(buf, 10, &max_ptes_swap);
+ if (err || max_ptes_swap > HPAGE_PMD_NR-1)
+ return -EINVAL;
+
+ khugepaged_max_ptes_swap = max_ptes_swap;
+
+ return count;
+}
+
+static struct kobj_attribute khugepaged_max_ptes_swap_attr =
+ __ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show,
+ khugepaged_max_ptes_swap_store);
+
static struct attribute *khugepaged_attr[] = {
&khugepaged_defrag_attr.attr,
&khugepaged_max_ptes_none_attr.attr,
@@ -561,6 +623,7 @@ static struct attribute *khugepaged_attr[] = {
&full_scans_attr.attr,
&scan_sleep_millisecs_attr.attr,
&alloc_sleep_millisecs_attr.attr,
+ &khugepaged_max_ptes_swap_attr.attr,
NULL,
};
@@ -638,6 +701,9 @@ static int __init hugepage_init(void)
err = register_shrinker(&huge_zero_page_shrinker);
if (err)
goto err_hzp_shrinker;
+ err = register_shrinker(&deferred_split_shrinker);
+ if (err)
+ goto err_split_shrinker;
/*
* By default disable transparent hugepages on smaller systems,
@@ -655,6 +721,8 @@ static int __init hugepage_init(void)
return 0;
err_khugepaged:
+ unregister_shrinker(&deferred_split_shrinker);
+err_split_shrinker:
unregister_shrinker(&huge_zero_page_shrinker);
err_hzp_shrinker:
khugepaged_slab_exit();
@@ -711,6 +779,27 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
return entry;
}
+static inline struct list_head *page_deferred_list(struct page *page)
+{
+ /*
+ * ->lru in the tail pages is occupied by compound_head.
+ * Let's use ->mapping + ->index in the second tail page as list_head.
+ */
+ return (struct list_head *)&page[2].mapping;
+}
+
+void prep_transhuge_page(struct page *page)
+{
+ /*
+ * we use page->mapping and page->indexlru in second tail page
+ * as list_head: assuming THP order >= 2
+ */
+ BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
+
+ INIT_LIST_HEAD(page_deferred_list(page));
+ set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
+}
+
static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
@@ -724,7 +813,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
VM_BUG_ON_PAGE(!PageCompound(page), page);
- if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) {
+ if (mem_cgroup_try_charge(page, mm, gfp, &memcg, true)) {
put_page(page);
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
@@ -732,7 +821,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
pgtable = pte_alloc_one(mm, haddr);
if (unlikely(!pgtable)) {
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
return VM_FAULT_OOM;
}
@@ -748,7 +837,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
ptl = pmd_lock(mm, pmd);
if (unlikely(!pmd_none(*pmd))) {
spin_unlock(ptl);
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
pte_free(mm, pgtable);
} else {
@@ -759,7 +848,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
int ret;
spin_unlock(ptl);
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
pte_free(mm, pgtable);
ret = handle_userfault(vma, address, flags,
@@ -770,8 +859,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
entry = mk_huge_pmd(page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- page_add_new_anon_rmap(page, vma, haddr);
- mem_cgroup_commit_charge(page, memcg, false);
+ page_add_new_anon_rmap(page, vma, haddr, true);
+ mem_cgroup_commit_charge(page, memcg, false, true);
lru_cache_add_active_or_unevictable(page, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
@@ -865,6 +954,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
+ prep_transhuge_page(page);
return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp,
flags);
}
@@ -956,19 +1046,10 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
goto out_unlock;
}
- if (unlikely(pmd_trans_splitting(pmd))) {
- /* split huge page running from under us */
- spin_unlock(src_ptl);
- spin_unlock(dst_ptl);
- pte_free(dst_mm, pgtable);
-
- wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
- goto out;
- }
src_page = pmd_page(pmd);
VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
get_page(src_page);
- page_dup_rmap(src_page);
+ page_dup_rmap(src_page, true);
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
pmdp_set_wrprotect(src_mm, addr, src_pmd);
@@ -1008,37 +1089,6 @@ unlock:
spin_unlock(ptl);
}
-/*
- * Save CONFIG_DEBUG_PAGEALLOC from faulting falsely on tail pages
- * during copy_user_huge_page()'s copy_page_rep(): in the case when
- * the source page gets split and a tail freed before copy completes.
- * Called under pmd_lock of checked pmd, so safe from splitting itself.
- */
-static void get_user_huge_page(struct page *page)
-{
- if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
- struct page *endpage = page + HPAGE_PMD_NR;
-
- atomic_add(HPAGE_PMD_NR, &page->_count);
- while (++page < endpage)
- get_huge_page_tail(page);
- } else {
- get_page(page);
- }
-}
-
-static void put_user_huge_page(struct page *page)
-{
- if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
- struct page *endpage = page + HPAGE_PMD_NR;
-
- while (page < endpage)
- put_page(page++);
- } else {
- put_page(page);
- }
-}
-
static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address,
@@ -1068,13 +1118,14 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
vma, address, page_to_nid(page));
if (unlikely(!pages[i] ||
mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL,
- &memcg))) {
+ &memcg, false))) {
if (pages[i])
put_page(pages[i]);
while (--i >= 0) {
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
- mem_cgroup_cancel_charge(pages[i], memcg);
+ mem_cgroup_cancel_charge(pages[i], memcg,
+ false);
put_page(pages[i]);
}
kfree(pages);
@@ -1112,8 +1163,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
- page_add_new_anon_rmap(pages[i], vma, haddr);
- mem_cgroup_commit_charge(pages[i], memcg, false);
+ page_add_new_anon_rmap(pages[i], vma, haddr, false);
+ mem_cgroup_commit_charge(pages[i], memcg, false, false);
lru_cache_add_active_or_unevictable(pages[i], vma);
pte = pte_offset_map(&_pmd, haddr);
VM_BUG_ON(!pte_none(*pte));
@@ -1124,7 +1175,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
- page_remove_rmap(page);
+ page_remove_rmap(page, true);
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
@@ -1141,7 +1192,7 @@ out_free_pages:
for (i = 0; i < HPAGE_PMD_NR; i++) {
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
- mem_cgroup_cancel_charge(pages[i], memcg);
+ mem_cgroup_cancel_charge(pages[i], memcg, false);
put_page(pages[i]);
}
kfree(pages);
@@ -1171,7 +1222,17 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
page = pmd_page(orig_pmd);
VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
- if (page_mapcount(page) == 1) {
+ /*
+ * We can only reuse the page if nobody else maps the huge page or it's
+ * part. We can do it by checking page_mapcount() on each sub-page, but
+ * it's expensive.
+ * The cheaper way is to check page_count() to be equal 1: every
+ * mapcount takes page reference reference, so this way we can
+ * guarantee, that the PMD is the only mapping.
+ * This can give false negative if somebody pinned the page, but that's
+ * fine.
+ */
+ if (page_mapcount(page) == 1 && page_count(page) == 1) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -1180,7 +1241,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
ret |= VM_FAULT_WRITE;
goto out_unlock;
}
- get_user_huge_page(page);
+ get_page(page);
spin_unlock(ptl);
alloc:
if (transparent_hugepage_enabled(vma) &&
@@ -1190,30 +1251,33 @@ alloc:
} else
new_page = NULL;
- if (unlikely(!new_page)) {
+ if (likely(new_page)) {
+ prep_transhuge_page(new_page);
+ } else {
if (!page) {
- split_huge_page_pmd(vma, address, pmd);
+ split_huge_pmd(vma, pmd, address);
ret |= VM_FAULT_FALLBACK;
} else {
ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
pmd, orig_pmd, page, haddr);
if (ret & VM_FAULT_OOM) {
- split_huge_page(page);
+ split_huge_pmd(vma, pmd, address);
ret |= VM_FAULT_FALLBACK;
}
- put_user_huge_page(page);
+ put_page(page);
}
count_vm_event(THP_FAULT_FALLBACK);
goto out;
}
- if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg))) {
+ if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg,
+ true))) {
put_page(new_page);
if (page) {
- split_huge_page(page);
- put_user_huge_page(page);
+ split_huge_pmd(vma, pmd, address);
+ put_page(page);
} else
- split_huge_page_pmd(vma, address, pmd);
+ split_huge_pmd(vma, pmd, address);
ret |= VM_FAULT_FALLBACK;
count_vm_event(THP_FAULT_FALLBACK);
goto out;
@@ -1233,10 +1297,10 @@ alloc:
spin_lock(ptl);
if (page)
- put_user_huge_page(page);
+ put_page(page);
if (unlikely(!pmd_same(*pmd, orig_pmd))) {
spin_unlock(ptl);
- mem_cgroup_cancel_charge(new_page, memcg);
+ mem_cgroup_cancel_charge(new_page, memcg, true);
put_page(new_page);
goto out_mn;
} else {
@@ -1244,8 +1308,8 @@ alloc:
entry = mk_huge_pmd(new_page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
pmdp_huge_clear_flush_notify(vma, haddr, pmd);
- page_add_new_anon_rmap(new_page, vma, haddr);
- mem_cgroup_commit_charge(new_page, memcg, false);
+ page_add_new_anon_rmap(new_page, vma, haddr, true);
+ mem_cgroup_commit_charge(new_page, memcg, false, true);
lru_cache_add_active_or_unevictable(new_page, vma);
set_pmd_at(mm, haddr, pmd, entry);
update_mmu_cache_pmd(vma, address, pmd);
@@ -1254,7 +1318,7 @@ alloc:
put_huge_zero_page();
} else {
VM_BUG_ON_PAGE(!PageHead(page), page);
- page_remove_rmap(page);
+ page_remove_rmap(page, true);
put_page(page);
}
ret |= VM_FAULT_WRITE;
@@ -1308,7 +1372,20 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
update_mmu_cache_pmd(vma, addr, pmd);
}
if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
- if (page->mapping && trylock_page(page)) {
+ /*
+ * We don't mlock() pte-mapped THPs. This way we can avoid
+ * leaking mlocked pages into non-VM_LOCKED VMAs.
+ *
+ * In most cases the pmd is the only mapping of the page as we
+ * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
+ * writable private mappings in populate_vma_page_range().
+ *
+ * The only scenario when we have the page shared here is if we
+ * mlocking read-only mapping shared over fork(). We skip
+ * mlocking such pages.
+ */
+ if (compound_mapcount(page) == 1 && !PageDoubleMap(page) &&
+ page->mapping && trylock_page(page)) {
lru_add_drain();
if (page->mapping)
mlock_vma_page(page);
@@ -1318,7 +1395,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
VM_BUG_ON_PAGE(!PageCompound(page), page);
if (flags & FOLL_GET)
- get_page_foll(page);
+ get_page(page);
out:
return page;
@@ -1459,7 +1536,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t orig_pmd;
spinlock_t *ptl;
- if (__pmd_trans_huge_lock(pmd, vma, &ptl) != 1)
+ if (!__pmd_trans_huge_lock(pmd, vma, &ptl))
return 0;
/*
* For architectures like ppc64 we look at deposited pgtable
@@ -1481,7 +1558,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
put_huge_zero_page();
} else {
struct page *page = pmd_page(orig_pmd);
- page_remove_rmap(page);
+ page_remove_rmap(page, true);
VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
VM_BUG_ON_PAGE(!PageHead(page), page);
@@ -1493,13 +1570,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
return 1;
}
-int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
+bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
unsigned long old_addr,
unsigned long new_addr, unsigned long old_end,
pmd_t *old_pmd, pmd_t *new_pmd)
{
spinlock_t *old_ptl, *new_ptl;
- int ret = 0;
pmd_t pmd;
struct mm_struct *mm = vma->vm_mm;
@@ -1508,7 +1584,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
(new_addr & ~HPAGE_PMD_MASK) ||
old_end - old_addr < HPAGE_PMD_SIZE ||
(new_vma->vm_flags & VM_NOHUGEPAGE))
- goto out;
+ return false;
/*
* The destination pmd shouldn't be established, free_pgtables()
@@ -1516,15 +1592,14 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
*/
if (WARN_ON(!pmd_none(*new_pmd))) {
VM_BUG_ON(pmd_trans_huge(*new_pmd));
- goto out;
+ return false;
}
/*
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_sem prevents deadlock.
*/
- ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl);
- if (ret == 1) {
+ if (__pmd_trans_huge_lock(old_pmd, vma, &old_ptl)) {
new_ptl = pmd_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -1540,9 +1615,9 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
spin_unlock(old_ptl);
+ return true;
}
-out:
- return ret;
+ return false;
}
/*
@@ -1558,7 +1633,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
spinlock_t *ptl;
int ret = 0;
- if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (__pmd_trans_huge_lock(pmd, vma, &ptl)) {
pmd_t entry;
bool preserve_write = prot_numa && pmd_write(*pmd);
ret = 1;
@@ -1589,405 +1664,19 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
}
/*
- * Returns 1 if a given pmd maps a stable (not under splitting) thp.
- * Returns -1 if it maps a thp under splitting. Returns 0 otherwise.
+ * Returns true if a given pmd maps a thp, false otherwise.
*
- * Note that if it returns 1, this routine returns without unlocking page
- * table locks. So callers must unlock them.
+ * Note that if it returns true, this routine returns without unlocking page
+ * table lock. So callers must unlock it.
*/
-int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
spinlock_t **ptl)
{
*ptl = pmd_lock(vma->vm_mm, pmd);
- if (likely(pmd_trans_huge(*pmd))) {
- if (unlikely(pmd_trans_splitting(*pmd))) {
- spin_unlock(*ptl);
- wait_split_huge_page(vma->anon_vma, pmd);
- return -1;
- } else {
- /* Thp mapped by 'pmd' is stable, so we can
- * handle it as it is. */
- return 1;
- }
- }
- spin_unlock(*ptl);
- return 0;
-}
-
-/*
- * This function returns whether a given @page is mapped onto the @address
- * in the virtual space of @mm.
- *
- * When it's true, this function returns *pmd with holding the page table lock
- * and passing it back to the caller via @ptl.
- * If it's false, returns NULL without holding the page table lock.
- */
-pmd_t *page_check_address_pmd(struct page *page,
- struct mm_struct *mm,
- unsigned long address,
- enum page_check_address_pmd_flag flag,
- spinlock_t **ptl)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
-
- if (address & ~HPAGE_PMD_MASK)
- return NULL;
-
- pgd = pgd_offset(mm, address);
- if (!pgd_present(*pgd))
- return NULL;
- pud = pud_offset(pgd, address);
- if (!pud_present(*pud))
- return NULL;
- pmd = pmd_offset(pud, address);
-
- *ptl = pmd_lock(mm, pmd);
- if (!pmd_present(*pmd))
- goto unlock;
- if (pmd_page(*pmd) != page)
- goto unlock;
- /*
- * split_vma() may create temporary aliased mappings. There is
- * no risk as long as all huge pmd are found and have their
- * splitting bit set before __split_huge_page_refcount
- * runs. Finding the same huge pmd more than once during the
- * same rmap walk is not a problem.
- */
- if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
- pmd_trans_splitting(*pmd))
- goto unlock;
- if (pmd_trans_huge(*pmd)) {
- VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
- !pmd_trans_splitting(*pmd));
- return pmd;
- }
-unlock:
+ if (likely(pmd_trans_huge(*pmd)))
+ return true;
spin_unlock(*ptl);
- return NULL;
-}
-
-static int __split_huge_page_splitting(struct page *page,
- struct vm_area_struct *vma,
- unsigned long address)
-{
- struct mm_struct *mm = vma->vm_mm;
- spinlock_t *ptl;
- pmd_t *pmd;
- int ret = 0;
- /* For mmu_notifiers */
- const unsigned long mmun_start = address;
- const unsigned long mmun_end = address + HPAGE_PMD_SIZE;
-
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl);
- if (pmd) {
- /*
- * We can't temporarily set the pmd to null in order
- * to split it, the pmd must remain marked huge at all
- * times or the VM won't take the pmd_trans_huge paths
- * and it won't wait on the anon_vma->root->rwsem to
- * serialize against split_huge_page*.
- */
- pmdp_splitting_flush(vma, address, pmd);
-
- ret = 1;
- spin_unlock(ptl);
- }
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
-
- return ret;
-}
-
-static void __split_huge_page_refcount(struct page *page,
- struct list_head *list)
-{
- int i;
- struct zone *zone = page_zone(page);
- struct lruvec *lruvec;
- int tail_count = 0;
-
- /* prevent PageLRU to go away from under us, and freeze lru stats */
- spin_lock_irq(&zone->lru_lock);
- lruvec = mem_cgroup_page_lruvec(page, zone);
-
- compound_lock(page);
- /* complete memcg works before add pages to LRU */
- mem_cgroup_split_huge_fixup(page);
-
- for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
- struct page *page_tail = page + i;
-
- /* tail_page->_mapcount cannot change */
- BUG_ON(page_mapcount(page_tail) < 0);
- tail_count += page_mapcount(page_tail);
- /* check for overflow */
- BUG_ON(tail_count < 0);
- BUG_ON(atomic_read(&page_tail->_count) != 0);
- /*
- * tail_page->_count is zero and not changing from
- * under us. But get_page_unless_zero() may be running
- * from under us on the tail_page. If we used
- * atomic_set() below instead of atomic_add(), we
- * would then run atomic_set() concurrently with
- * get_page_unless_zero(), and atomic_set() is
- * implemented in C not using locked ops. spin_unlock
- * on x86 sometime uses locked ops because of PPro
- * errata 66, 92, so unless somebody can guarantee
- * atomic_set() here would be safe on all archs (and
- * not only on x86), it's safer to use atomic_add().
- */
- atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
- &page_tail->_count);
-
- /* after clearing PageTail the gup refcount can be released */
- smp_mb__after_atomic();
-
- page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
- page_tail->flags |= (page->flags &
- ((1L << PG_referenced) |
- (1L << PG_swapbacked) |
- (1L << PG_mlocked) |
- (1L << PG_uptodate) |
- (1L << PG_active) |
- (1L << PG_unevictable)));
- page_tail->flags |= (1L << PG_dirty);
-
- clear_compound_head(page_tail);
-
- if (page_is_young(page))
- set_page_young(page_tail);
- if (page_is_idle(page))
- set_page_idle(page_tail);
-
- /*
- * __split_huge_page_splitting() already set the
- * splitting bit in all pmd that could map this
- * hugepage, that will ensure no CPU can alter the
- * mapcount on the head page. The mapcount is only
- * accounted in the head page and it has to be
- * transferred to all tail pages in the below code. So
- * for this code to be safe, the split the mapcount
- * can't change. But that doesn't mean userland can't
- * keep changing and reading the page contents while
- * we transfer the mapcount, so the pmd splitting
- * status is achieved setting a reserved bit in the
- * pmd, not by clearing the present bit.
- */
- page_tail->_mapcount = page->_mapcount;
-
- BUG_ON(page_tail->mapping);
- page_tail->mapping = page->mapping;
-
- page_tail->index = page->index + i;
- page_cpupid_xchg_last(page_tail, page_cpupid_last(page));
-
- BUG_ON(!PageAnon(page_tail));
- BUG_ON(!PageUptodate(page_tail));
- BUG_ON(!PageDirty(page_tail));
- BUG_ON(!PageSwapBacked(page_tail));
-
- lru_add_page_tail(page, page_tail, lruvec, list);
- }
- atomic_sub(tail_count, &page->_count);
- BUG_ON(atomic_read(&page->_count) <= 0);
-
- __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
-
- ClearPageCompound(page);
- compound_unlock(page);
- spin_unlock_irq(&zone->lru_lock);
-
- for (i = 1; i < HPAGE_PMD_NR; i++) {
- struct page *page_tail = page + i;
- BUG_ON(page_count(page_tail) <= 0);
- /*
- * Tail pages may be freed if there wasn't any mapping
- * like if add_to_swap() is running on a lru page that
- * had its mapping zapped. And freeing these pages
- * requires taking the lru_lock so we do the put_page
- * of the tail pages after the split is complete.
- */
- put_page(page_tail);
- }
-
- /*
- * Only the head page (now become a regular page) is required
- * to be pinned by the caller.
- */
- BUG_ON(page_count(page) <= 0);
-}
-
-static int __split_huge_page_map(struct page *page,
- struct vm_area_struct *vma,
- unsigned long address)
-{
- struct mm_struct *mm = vma->vm_mm;
- spinlock_t *ptl;
- pmd_t *pmd, _pmd;
- int ret = 0, i;
- pgtable_t pgtable;
- unsigned long haddr;
-
- pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl);
- if (pmd) {
- pgtable = pgtable_trans_huge_withdraw(mm, pmd);
- pmd_populate(mm, &_pmd, pgtable);
- if (pmd_write(*pmd))
- BUG_ON(page_mapcount(page) != 1);
-
- haddr = address;
- for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
- pte_t *pte, entry;
- BUG_ON(PageCompound(page+i));
- /*
- * Note that NUMA hinting access restrictions are not
- * transferred to avoid any possibility of altering
- * permissions across VMAs.
- */
- entry = mk_pte(page + i, vma->vm_page_prot);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- if (!pmd_write(*pmd))
- entry = pte_wrprotect(entry);
- if (!pmd_young(*pmd))
- entry = pte_mkold(entry);
- pte = pte_offset_map(&_pmd, haddr);
- BUG_ON(!pte_none(*pte));
- set_pte_at(mm, haddr, pte, entry);
- pte_unmap(pte);
- }
-
- smp_wmb(); /* make pte visible before pmd */
- /*
- * Up to this point the pmd is present and huge and
- * userland has the whole access to the hugepage
- * during the split (which happens in place). If we
- * overwrite the pmd with the not-huge version
- * pointing to the pte here (which of course we could
- * if all CPUs were bug free), userland could trigger
- * a small page size TLB miss on the small sized TLB
- * while the hugepage TLB entry is still established
- * in the huge TLB. Some CPU doesn't like that. See
- * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
- * Erratum 383 on page 93. Intel should be safe but is
- * also warns that it's only safe if the permission
- * and cache attributes of the two entries loaded in
- * the two TLB is identical (which should be the case
- * here). But it is generally safer to never allow
- * small and huge TLB entries for the same virtual
- * address to be loaded simultaneously. So instead of
- * doing "pmd_populate(); flush_pmd_tlb_range();" we first
- * mark the current pmd notpresent (atomically because
- * here the pmd_trans_huge and pmd_trans_splitting
- * must remain set at all times on the pmd until the
- * split is complete for this pmd), then we flush the
- * SMP TLB and finally we write the non-huge version
- * of the pmd entry with pmd_populate.
- */
- pmdp_invalidate(vma, address, pmd);
- pmd_populate(mm, pmd, pgtable);
- ret = 1;
- spin_unlock(ptl);
- }
-
- return ret;
-}
-
-/* must be called with anon_vma->root->rwsem held */
-static void __split_huge_page(struct page *page,
- struct anon_vma *anon_vma,
- struct list_head *list)
-{
- int mapcount, mapcount2;
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- struct anon_vma_chain *avc;
-
- BUG_ON(!PageHead(page));
- BUG_ON(PageTail(page));
-
- mapcount = 0;
- anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
- struct vm_area_struct *vma = avc->vma;
- unsigned long addr = vma_address(page, vma);
- BUG_ON(is_vma_temporary_stack(vma));
- mapcount += __split_huge_page_splitting(page, vma, addr);
- }
- /*
- * It is critical that new vmas are added to the tail of the
- * anon_vma list. This guarantes that if copy_huge_pmd() runs
- * and establishes a child pmd before
- * __split_huge_page_splitting() freezes the parent pmd (so if
- * we fail to prevent copy_huge_pmd() from running until the
- * whole __split_huge_page() is complete), we will still see
- * the newly established pmd of the child later during the
- * walk, to be able to set it as pmd_trans_splitting too.
- */
- if (mapcount != page_mapcount(page)) {
- pr_err("mapcount %d page_mapcount %d\n",
- mapcount, page_mapcount(page));
- BUG();
- }
-
- __split_huge_page_refcount(page, list);
-
- mapcount2 = 0;
- anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
- struct vm_area_struct *vma = avc->vma;
- unsigned long addr = vma_address(page, vma);
- BUG_ON(is_vma_temporary_stack(vma));
- mapcount2 += __split_huge_page_map(page, vma, addr);
- }
- if (mapcount != mapcount2) {
- pr_err("mapcount %d mapcount2 %d page_mapcount %d\n",
- mapcount, mapcount2, page_mapcount(page));
- BUG();
- }
-}
-
-/*
- * Split a hugepage into normal pages. This doesn't change the position of head
- * page. If @list is null, tail pages will be added to LRU list, otherwise, to
- * @list. Both head page and tail pages will inherit mapping, flags, and so on
- * from the hugepage.
- * Return 0 if the hugepage is split successfully otherwise return 1.
- */
-int split_huge_page_to_list(struct page *page, struct list_head *list)
-{
- struct anon_vma *anon_vma;
- int ret = 1;
-
- BUG_ON(is_huge_zero_page(page));
- BUG_ON(!PageAnon(page));
-
- /*
- * The caller does not necessarily hold an mmap_sem that would prevent
- * the anon_vma disappearing so we first we take a reference to it
- * and then lock the anon_vma for write. This is similar to
- * page_lock_anon_vma_read except the write lock is taken to serialise
- * against parallel split or collapse operations.
- */
- anon_vma = page_get_anon_vma(page);
- if (!anon_vma)
- goto out;
- anon_vma_lock_write(anon_vma);
-
- ret = 0;
- if (!PageCompound(page))
- goto out_unlock;
-
- BUG_ON(!PageSwapBacked(page));
- __split_huge_page(page, anon_vma, list);
- count_vm_event(THP_SPLIT);
-
- BUG_ON(PageCompound(page));
-out_unlock:
- anon_vma_unlock_write(anon_vma);
- put_anon_vma(anon_vma);
-out:
- return ret;
+ return false;
}
#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
@@ -2198,26 +1887,33 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
unsigned long address,
pte_t *pte)
{
- struct page *page;
+ struct page *page = NULL;
pte_t *_pte;
- int none_or_zero = 0;
+ int none_or_zero = 0, result = 0;
bool referenced = false, writable = false;
+
for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (pte_none(pteval) || (pte_present(pteval) &&
is_zero_pfn(pte_pfn(pteval)))) {
if (!userfaultfd_armed(vma) &&
- ++none_or_zero <= khugepaged_max_ptes_none)
+ ++none_or_zero <= khugepaged_max_ptes_none) {
continue;
- else
+ } else {
+ result = SCAN_EXCEED_NONE_PTE;
goto out;
+ }
}
- if (!pte_present(pteval))
+ if (!pte_present(pteval)) {
+ result = SCAN_PTE_NON_PRESENT;
goto out;
+ }
page = vm_normal_page(vma, address, pteval);
- if (unlikely(!page))
+ if (unlikely(!page)) {
+ result = SCAN_PAGE_NULL;
goto out;
+ }
VM_BUG_ON_PAGE(PageCompound(page), page);
VM_BUG_ON_PAGE(!PageAnon(page), page);
@@ -2229,8 +1925,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
* is needed to serialize against split_huge_page
* when invoked from the VM.
*/
- if (!trylock_page(page))
+ if (!trylock_page(page)) {
+ result = SCAN_PAGE_LOCK;
goto out;
+ }
/*
* cannot use mapcount: can't collapse if there's a gup pin.
@@ -2239,6 +1937,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
*/
if (page_count(page) != 1 + !!PageSwapCache(page)) {
unlock_page(page);
+ result = SCAN_PAGE_COUNT;
goto out;
}
if (pte_write(pteval)) {
@@ -2246,6 +1945,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
} else {
if (PageSwapCache(page) && !reuse_swap_page(page)) {
unlock_page(page);
+ result = SCAN_SWAP_CACHE_PAGE;
goto out;
}
/*
@@ -2260,6 +1960,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
*/
if (isolate_lru_page(page)) {
unlock_page(page);
+ result = SCAN_DEL_PAGE_LRU;
goto out;
}
/* 0 stands for page_is_file_cache(page) == false */
@@ -2273,10 +1974,21 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
mmu_notifier_test_young(vma->vm_mm, address))
referenced = true;
}
- if (likely(referenced && writable))
- return 1;
+ if (likely(writable)) {
+ if (likely(referenced)) {
+ result = SCAN_SUCCEED;
+ trace_mm_collapse_huge_page_isolate(page, none_or_zero,
+ referenced, writable, result);
+ return 1;
+ }
+ } else {
+ result = SCAN_PAGE_RO;
+ }
+
out:
release_pte_pages(pte, _pte);
+ trace_mm_collapse_huge_page_isolate(page, none_or_zero,
+ referenced, writable, result);
return 0;
}
@@ -2321,7 +2033,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
* superfluous.
*/
pte_clear(vma->vm_mm, address, _pte);
- page_remove_rmap(src_page);
+ page_remove_rmap(src_page, false);
spin_unlock(ptl);
free_page_and_swap_cache(src_page);
}
@@ -2431,6 +2143,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
return NULL;
}
+ prep_transhuge_page(*hpage);
count_vm_event(THP_COLLAPSE_ALLOC);
return *hpage;
}
@@ -2442,8 +2155,12 @@ static int khugepaged_find_target_node(void)
static inline struct page *alloc_hugepage(int defrag)
{
- return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
- HPAGE_PMD_ORDER);
+ struct page *page;
+
+ page = alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER);
+ if (page)
+ prep_transhuge_page(page);
+ return page;
}
static struct page *khugepaged_alloc_hugepage(bool *wait)
@@ -2493,7 +2210,6 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
(vma->vm_flags & VM_NOHUGEPAGE))
return false;
-
if (!vma->anon_vma || vma->vm_ops)
return false;
if (is_vma_temporary_stack(vma))
@@ -2502,6 +2218,44 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
return true;
}
+/*
+ * Bring missing pages in from swap, to complete THP collapse.
+ * Only done if khugepaged_scan_pmd believes it is worthwhile.
+ *
+ * Called and returns without pte mapped or spinlocks held,
+ * but with mmap_sem held to protect against vma changes.
+ */
+
+static void __collapse_huge_page_swapin(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd)
+{
+ unsigned long _address;
+ pte_t *pte, pteval;
+ int swapped_in = 0, ret = 0;
+
+ pte = pte_offset_map(pmd, address);
+ for (_address = address; _address < address + HPAGE_PMD_NR*PAGE_SIZE;
+ pte++, _address += PAGE_SIZE) {
+ pteval = *pte;
+ if (!is_swap_pte(pteval))
+ continue;
+ swapped_in++;
+ ret = do_swap_page(mm, vma, _address, pte, pmd,
+ FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_RETRY_NOWAIT,
+ pteval);
+ if (ret & VM_FAULT_ERROR) {
+ trace_mm_collapse_huge_page_swapin(mm, swapped_in, 0);
+ return;
+ }
+ /* pte is unmapped now, we need to map it */
+ pte = pte_offset_map(pmd, _address);
+ }
+ pte--;
+ pte_unmap(pte);
+ trace_mm_collapse_huge_page_swapin(mm, swapped_in, 1);
+}
+
static void collapse_huge_page(struct mm_struct *mm,
unsigned long address,
struct page **hpage,
@@ -2513,7 +2267,7 @@ static void collapse_huge_page(struct mm_struct *mm,
pgtable_t pgtable;
struct page *new_page;
spinlock_t *pmd_ptl, *pte_ptl;
- int isolated;
+ int isolated = 0, result = 0;
unsigned long hstart, hend;
struct mem_cgroup *memcg;
unsigned long mmun_start; /* For mmu_notifiers */
@@ -2528,12 +2282,15 @@ static void collapse_huge_page(struct mm_struct *mm,
/* release the mmap_sem read lock. */
new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node);
- if (!new_page)
- return;
+ if (!new_page) {
+ result = SCAN_ALLOC_HUGE_PAGE_FAIL;
+ goto out_nolock;
+ }
- if (unlikely(mem_cgroup_try_charge(new_page, mm,
- gfp, &memcg)))
- return;
+ if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
+ result = SCAN_CGROUP_CHARGE_FAIL;
+ goto out_nolock;
+ }
/*
* Prevent all access to pagetables with the exception of
@@ -2541,21 +2298,33 @@ static void collapse_huge_page(struct mm_struct *mm,
* handled by the anon_vma lock + PG_lock.
*/
down_write(&mm->mmap_sem);
- if (unlikely(khugepaged_test_exit(mm)))
+ if (unlikely(khugepaged_test_exit(mm))) {
+ result = SCAN_ANY_PROCESS;
goto out;
+ }
vma = find_vma(mm, address);
- if (!vma)
+ if (!vma) {
+ result = SCAN_VMA_NULL;
goto out;
+ }
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
- if (address < hstart || address + HPAGE_PMD_SIZE > hend)
+ if (address < hstart || address + HPAGE_PMD_SIZE > hend) {
+ result = SCAN_ADDRESS_RANGE;
goto out;
- if (!hugepage_vma_check(vma))
+ }
+ if (!hugepage_vma_check(vma)) {
+ result = SCAN_VMA_CHECK;
goto out;
+ }
pmd = mm_find_pmd(mm, address);
- if (!pmd)
+ if (!pmd) {
+ result = SCAN_PMD_NULL;
goto out;
+ }
+
+ __collapse_huge_page_swapin(mm, vma, address, pmd);
anon_vma_lock_write(vma->anon_vma);
@@ -2592,6 +2361,7 @@ static void collapse_huge_page(struct mm_struct *mm,
pmd_populate(mm, pmd, pmd_pgtable(_pmd));
spin_unlock(pmd_ptl);
anon_vma_unlock_write(vma->anon_vma);
+ result = SCAN_FAIL;
goto out;
}
@@ -2618,8 +2388,8 @@ static void collapse_huge_page(struct mm_struct *mm,
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
- page_add_new_anon_rmap(new_page, vma, address);
- mem_cgroup_commit_charge(new_page, memcg, false);
+ page_add_new_anon_rmap(new_page, vma, address, true);
+ mem_cgroup_commit_charge(new_page, memcg, false, true);
lru_cache_add_active_or_unevictable(new_page, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
@@ -2629,12 +2399,14 @@ static void collapse_huge_page(struct mm_struct *mm,
*hpage = NULL;
khugepaged_pages_collapsed++;
+ result = SCAN_SUCCEED;
out_up_write:
up_write(&mm->mmap_sem);
+out_nolock:
+ trace_mm_collapse_huge_page(mm, isolated, result);
return;
-
out:
- mem_cgroup_cancel_charge(new_page, memcg);
+ mem_cgroup_cancel_charge(new_page, memcg, true);
goto out_up_write;
}
@@ -2645,39 +2417,62 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
{
pmd_t *pmd;
pte_t *pte, *_pte;
- int ret = 0, none_or_zero = 0;
- struct page *page;
+ int ret = 0, none_or_zero = 0, result = 0;
+ struct page *page = NULL;
unsigned long _address;
spinlock_t *ptl;
- int node = NUMA_NO_NODE;
+ int node = NUMA_NO_NODE, unmapped = 0;
bool writable = false, referenced = false;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
pmd = mm_find_pmd(mm, address);
- if (!pmd)
+ if (!pmd) {
+ result = SCAN_PMD_NULL;
goto out;
+ }
memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, _address += PAGE_SIZE) {
pte_t pteval = *_pte;
+ if (is_swap_pte(pteval)) {
+ if (++unmapped <= khugepaged_max_ptes_swap) {
+ continue;
+ } else {
+ result = SCAN_EXCEED_SWAP_PTE;
+ goto out_unmap;
+ }
+ }
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
if (!userfaultfd_armed(vma) &&
- ++none_or_zero <= khugepaged_max_ptes_none)
+ ++none_or_zero <= khugepaged_max_ptes_none) {
continue;
- else
+ } else {
+ result = SCAN_EXCEED_NONE_PTE;
goto out_unmap;
+ }
}
- if (!pte_present(pteval))
+ if (!pte_present(pteval)) {
+ result = SCAN_PTE_NON_PRESENT;
goto out_unmap;
+ }
if (pte_write(pteval))
writable = true;
page = vm_normal_page(vma, _address, pteval);
- if (unlikely(!page))
+ if (unlikely(!page)) {
+ result = SCAN_PAGE_NULL;
+ goto out_unmap;
+ }
+
+ /* TODO: teach khugepaged to collapse THP mapped with pte */
+ if (PageCompound(page)) {
+ result = SCAN_PAGE_COMPOUND;
goto out_unmap;
+ }
+
/*
* Record which node the original page is from and save this
* information to khugepaged_node_load[].
@@ -2685,26 +2480,48 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
* hit record.
*/
node = page_to_nid(page);
- if (khugepaged_scan_abort(node))
+ if (khugepaged_scan_abort(node)) {
+ result = SCAN_SCAN_ABORT;
goto out_unmap;
+ }
khugepaged_node_load[node]++;
- VM_BUG_ON_PAGE(PageCompound(page), page);
- if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
+ if (!PageLRU(page)) {
+ result = SCAN_SCAN_ABORT;
goto out_unmap;
+ }
+ if (PageLocked(page)) {
+ result = SCAN_PAGE_LOCK;
+ goto out_unmap;
+ }
+ if (!PageAnon(page)) {
+ result = SCAN_PAGE_ANON;
+ goto out_unmap;
+ }
+
/*
* cannot use mapcount: can't collapse if there's a gup pin.
* The page must only be referenced by the scanned process
* and page swap cache.
*/
- if (page_count(page) != 1 + !!PageSwapCache(page))
+ if (page_count(page) != 1 + !!PageSwapCache(page)) {
+ result = SCAN_PAGE_COUNT;
goto out_unmap;
+ }
if (pte_young(pteval) ||
page_is_young(page) || PageReferenced(page) ||
mmu_notifier_test_young(vma->vm_mm, address))
referenced = true;
}
- if (referenced && writable)
- ret = 1;
+ if (writable) {
+ if (referenced) {
+ result = SCAN_SUCCEED;
+ ret = 1;
+ } else {
+ result = SCAN_NO_REFERENCED_PAGE;
+ }
+ } else {
+ result = SCAN_PAGE_RO;
+ }
out_unmap:
pte_unmap_unlock(pte, ptl);
if (ret) {
@@ -2713,6 +2530,8 @@ out_unmap:
collapse_huge_page(mm, address, hpage, vma, node);
}
out:
+ trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
+ none_or_zero, result, unmapped);
return ret;
}
@@ -2938,8 +2757,8 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
pmd_t _pmd;
int i;
- pmdp_huge_clear_flush_notify(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
+ pmdp_huge_clear_flush_notify(vma, haddr, pmd);
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
@@ -2958,66 +2777,149 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
put_huge_zero_page();
}
-void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmd)
+static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long haddr, bool freeze)
{
- spinlock_t *ptl;
- struct page *page = NULL;
struct mm_struct *mm = vma->vm_mm;
- unsigned long haddr = address & HPAGE_PMD_MASK;
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
+ struct page *page;
+ pgtable_t pgtable;
+ pmd_t _pmd;
+ bool young, write;
+ int i;
- BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);
+ VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
+ VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
+ VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
+ VM_BUG_ON(!pmd_trans_huge(*pmd));
+
+ count_vm_event(THP_SPLIT_PMD);
- mmun_start = haddr;
- mmun_end = haddr + HPAGE_PMD_SIZE;
-again:
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- ptl = pmd_lock(mm, pmd);
- if (unlikely(!pmd_trans_huge(*pmd)))
- goto unlock;
if (vma_is_dax(vma)) {
pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
if (is_huge_zero_pmd(_pmd))
put_huge_zero_page();
+ return;
} else if (is_huge_zero_pmd(*pmd)) {
- __split_huge_zero_page_pmd(vma, haddr, pmd);
- } else {
- page = pmd_page(*pmd);
- VM_BUG_ON_PAGE(!page_count(page), page);
- get_page(page);
+ return __split_huge_zero_page_pmd(vma, haddr, pmd);
}
- unlock:
- spin_unlock(ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
- if (!page)
- return;
+ page = pmd_page(*pmd);
+ VM_BUG_ON_PAGE(!page_count(page), page);
+ atomic_add(HPAGE_PMD_NR - 1, &page->_count);
+ write = pmd_write(*pmd);
+ young = pmd_young(*pmd);
- split_huge_page(page);
- put_page(page);
+ pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+ pmd_populate(mm, &_pmd, pgtable);
+
+ for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ pte_t entry, *pte;
+ /*
+ * Note that NUMA hinting access restrictions are not
+ * transferred to avoid any possibility of altering
+ * permissions across VMAs.
+ */
+ if (freeze) {
+ swp_entry_t swp_entry;
+ swp_entry = make_migration_entry(page + i, write);
+ entry = swp_entry_to_pte(swp_entry);
+ } else {
+ entry = mk_pte(page + i, vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (!write)
+ entry = pte_wrprotect(entry);
+ if (!young)
+ entry = pte_mkold(entry);
+ }
+ pte = pte_offset_map(&_pmd, haddr);
+ BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, haddr, pte, entry);
+ atomic_inc(&page[i]._mapcount);
+ pte_unmap(pte);
+ }
/*
- * We don't always have down_write of mmap_sem here: a racing
- * do_huge_pmd_wp_page() might have copied-on-write to another
- * huge page before our split_huge_page() got the anon_vma lock.
+ * Set PG_double_map before dropping compound_mapcount to avoid
+ * false-negative page_mapped().
*/
- if (unlikely(pmd_trans_huge(*pmd)))
- goto again;
+ if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) {
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ atomic_inc(&page[i]._mapcount);
+ }
+
+ if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
+ /* Last compound_mapcount is gone. */
+ __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+ if (TestClearPageDoubleMap(page)) {
+ /* No need in mapcount reference anymore */
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ atomic_dec(&page[i]._mapcount);
+ }
+ }
+
+ smp_wmb(); /* make pte visible before pmd */
+ /*
+ * Up to this point the pmd is present and huge and userland has the
+ * whole access to the hugepage during the split (which happens in
+ * place). If we overwrite the pmd with the not-huge version pointing
+ * to the pte here (which of course we could if all CPUs were bug
+ * free), userland could trigger a small page size TLB miss on the
+ * small sized TLB while the hugepage TLB entry is still established in
+ * the huge TLB. Some CPU doesn't like that.
+ * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
+ * 383 on page 93. Intel should be safe but is also warns that it's
+ * only safe if the permission and cache attributes of the two entries
+ * loaded in the two TLB is identical (which should be the case here).
+ * But it is generally safer to never allow small and huge TLB entries
+ * for the same virtual address to be loaded simultaneously. So instead
+ * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
+ * current pmd notpresent (atomically because here the pmd_trans_huge
+ * and pmd_trans_splitting must remain set at all times on the pmd
+ * until the split is complete for this pmd), then we flush the SMP TLB
+ * and finally we write the non-huge version of the pmd entry with
+ * pmd_populate.
+ */
+ pmdp_invalidate(vma, haddr, pmd);
+ pmd_populate(mm, pmd, pgtable);
+
+ if (freeze) {
+ for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ page_remove_rmap(page + i, false);
+ put_page(page + i);
+ }
+ }
}
-void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
- pmd_t *pmd)
+void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long address)
{
- struct vm_area_struct *vma;
+ spinlock_t *ptl;
+ struct mm_struct *mm = vma->vm_mm;
+ struct page *page = NULL;
+ unsigned long haddr = address & HPAGE_PMD_MASK;
- vma = find_vma(mm, address);
- BUG_ON(vma == NULL);
- split_huge_page_pmd(vma, address, pmd);
+ mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
+ ptl = pmd_lock(mm, pmd);
+ if (unlikely(!pmd_trans_huge(*pmd)))
+ goto out;
+ page = pmd_page(*pmd);
+ __split_huge_pmd_locked(vma, pmd, haddr, false);
+ if (PageMlocked(page))
+ get_page(page);
+ else
+ page = NULL;
+out:
+ spin_unlock(ptl);
+ mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE);
+ if (page) {
+ lock_page(page);
+ munlock_vma_page(page);
+ unlock_page(page);
+ put_page(page);
+ }
}
-static void split_huge_page_address(struct mm_struct *mm,
+static void split_huge_pmd_address(struct vm_area_struct *vma,
unsigned long address)
{
pgd_t *pgd;
@@ -3026,7 +2928,7 @@ static void split_huge_page_address(struct mm_struct *mm,
VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
- pgd = pgd_offset(mm, address);
+ pgd = pgd_offset(vma->vm_mm, address);
if (!pgd_present(*pgd))
return;
@@ -3035,13 +2937,13 @@ static void split_huge_page_address(struct mm_struct *mm,
return;
pmd = pmd_offset(pud, address);
- if (!pmd_present(*pmd))
+ if (!pmd_present(*pmd) || !pmd_trans_huge(*pmd))
return;
/*
* Caller holds the mmap_sem write mode, so a huge pmd cannot
* materialize from under us.
*/
- split_huge_page_pmd_mm(mm, address, pmd);
+ split_huge_pmd(vma, pmd, address);
}
void vma_adjust_trans_huge(struct vm_area_struct *vma,
@@ -3057,7 +2959,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
if (start & ~HPAGE_PMD_MASK &&
(start & HPAGE_PMD_MASK) >= vma->vm_start &&
(start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
- split_huge_page_address(vma->vm_mm, start);
+ split_huge_pmd_address(vma, start);
/*
* If the new end address isn't hpage aligned and it could
@@ -3067,7 +2969,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
if (end & ~HPAGE_PMD_MASK &&
(end & HPAGE_PMD_MASK) >= vma->vm_start &&
(end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
- split_huge_page_address(vma->vm_mm, end);
+ split_huge_pmd_address(vma, end);
/*
* If we're also updating the vma->vm_next->vm_start, if the new
@@ -3081,6 +2983,427 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
if (nstart & ~HPAGE_PMD_MASK &&
(nstart & HPAGE_PMD_MASK) >= next->vm_start &&
(nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
- split_huge_page_address(next->vm_mm, nstart);
+ split_huge_pmd_address(next, nstart);
}
}
+
+static void freeze_page_vma(struct vm_area_struct *vma, struct page *page,
+ unsigned long address)
+{
+ spinlock_t *ptl;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ int i;
+
+ pgd = pgd_offset(vma->vm_mm, address);
+ if (!pgd_present(*pgd))
+ return;
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ return;
+ pmd = pmd_offset(pud, address);
+ ptl = pmd_lock(vma->vm_mm, pmd);
+ if (!pmd_present(*pmd)) {
+ spin_unlock(ptl);
+ return;
+ }
+ if (pmd_trans_huge(*pmd)) {
+ if (page == pmd_page(*pmd))
+ __split_huge_pmd_locked(vma, pmd, address, true);
+ spin_unlock(ptl);
+ return;
+ }
+ spin_unlock(ptl);
+
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
+ for (i = 0; i < HPAGE_PMD_NR; i++, address += PAGE_SIZE, page++) {
+ pte_t entry, swp_pte;
+ swp_entry_t swp_entry;
+
+ if (!pte_present(pte[i]))
+ continue;
+ if (page_to_pfn(page) != pte_pfn(pte[i]))
+ continue;
+ flush_cache_page(vma, address, page_to_pfn(page));
+ entry = ptep_clear_flush(vma, address, pte + i);
+ swp_entry = make_migration_entry(page, pte_write(entry));
+ swp_pte = swp_entry_to_pte(swp_entry);
+ if (pte_soft_dirty(entry))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ set_pte_at(vma->vm_mm, address, pte + i, swp_pte);
+ page_remove_rmap(page, false);
+ put_page(page);
+ }
+ pte_unmap_unlock(pte, ptl);
+}
+
+static void freeze_page(struct anon_vma *anon_vma, struct page *page)
+{
+ struct anon_vma_chain *avc;
+ pgoff_t pgoff = page_to_pgoff(page);
+
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff,
+ pgoff + HPAGE_PMD_NR - 1) {
+ unsigned long haddr;
+
+ haddr = __vma_address(page, avc->vma) & HPAGE_PMD_MASK;
+ mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
+ haddr, haddr + HPAGE_PMD_SIZE);
+ freeze_page_vma(avc->vma, page, haddr);
+ mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
+ haddr, haddr + HPAGE_PMD_SIZE);
+ }
+}
+
+static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
+ unsigned long address)
+{
+ spinlock_t *ptl;
+ pmd_t *pmd;
+ pte_t *pte, entry;
+ swp_entry_t swp_entry;
+ int i;
+
+ pmd = mm_find_pmd(vma->vm_mm, address);
+ if (!pmd)
+ return;
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
+ for (i = 0; i < HPAGE_PMD_NR; i++, address += PAGE_SIZE, page++) {
+ if (!is_swap_pte(pte[i]))
+ continue;
+
+ swp_entry = pte_to_swp_entry(pte[i]);
+ if (!is_migration_entry(swp_entry))
+ continue;
+ if (migration_entry_to_page(swp_entry) != page)
+ continue;
+
+ get_page(page);
+ page_add_anon_rmap(page, vma, address, false);
+
+ entry = pte_mkold(mk_pte(page, vma->vm_page_prot));
+ entry = pte_mkdirty(entry);
+ if (is_write_migration_entry(swp_entry))
+ entry = maybe_mkwrite(entry, vma);
+
+ flush_dcache_page(page);
+ set_pte_at(vma->vm_mm, address, pte + i, entry);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(vma, address, pte + i);
+ }
+ pte_unmap_unlock(pte, ptl);
+}
+
+static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
+{
+ struct anon_vma_chain *avc;
+ pgoff_t pgoff = page_to_pgoff(page);
+
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
+ pgoff, pgoff + HPAGE_PMD_NR - 1) {
+ unsigned long address = __vma_address(page, avc->vma);
+
+ mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
+ address, address + HPAGE_PMD_SIZE);
+ unfreeze_page_vma(avc->vma, page, address);
+ mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
+ address, address + HPAGE_PMD_SIZE);
+ }
+}
+
+static int __split_huge_page_tail(struct page *head, int tail,
+ struct lruvec *lruvec, struct list_head *list)
+{
+ int mapcount;
+ struct page *page_tail = head + tail;
+
+ mapcount = atomic_read(&page_tail->_mapcount) + 1;
+ VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail);
+
+ /*
+ * tail_page->_count is zero and not changing from under us. But
+ * get_page_unless_zero() may be running from under us on the
+ * tail_page. If we used atomic_set() below instead of atomic_add(), we
+ * would then run atomic_set() concurrently with
+ * get_page_unless_zero(), and atomic_set() is implemented in C not
+ * using locked ops. spin_unlock on x86 sometime uses locked ops
+ * because of PPro errata 66, 92, so unless somebody can guarantee
+ * atomic_set() here would be safe on all archs (and not only on x86),
+ * it's safer to use atomic_add().
+ */
+ atomic_add(mapcount + 1, &page_tail->_count);
+
+
+ page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ page_tail->flags |= (head->flags &
+ ((1L << PG_referenced) |
+ (1L << PG_swapbacked) |
+ (1L << PG_mlocked) |
+ (1L << PG_uptodate) |
+ (1L << PG_active) |
+ (1L << PG_locked) |
+ (1L << PG_unevictable)));
+ page_tail->flags |= (1L << PG_dirty);
+
+ /*
+ * After clearing PageTail the gup refcount can be released.
+ * Page flags also must be visible before we make the page non-compound.
+ */
+ smp_wmb();
+
+ clear_compound_head(page_tail);
+
+ if (page_is_young(head))
+ set_page_young(page_tail);
+ if (page_is_idle(head))
+ set_page_idle(page_tail);
+
+ /* ->mapping in first tail page is compound_mapcount */
+ VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
+ page_tail);
+ page_tail->mapping = head->mapping;
+
+ page_tail->index = head->index + tail;
+ page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
+ lru_add_page_tail(head, page_tail, lruvec, list);
+
+ return mapcount;
+}
+
+static void __split_huge_page(struct page *page, struct list_head *list)
+{
+ struct page *head = compound_head(page);
+ struct zone *zone = page_zone(head);
+ struct lruvec *lruvec;
+ int i, tail_mapcount;
+
+ /* prevent PageLRU to go away from under us, and freeze lru stats */
+ spin_lock_irq(&zone->lru_lock);
+ lruvec = mem_cgroup_page_lruvec(head, zone);
+
+ spin_lock(&split_queue_lock);
+ if (!list_empty(page_deferred_list(head))) {
+ split_queue_len--;
+ list_del(page_deferred_list(head));
+ }
+ spin_unlock(&split_queue_lock);
+
+ /* complete memcg works before add pages to LRU */
+ mem_cgroup_split_huge_fixup(head);
+
+ tail_mapcount = 0;
+ for (i = HPAGE_PMD_NR - 1; i >= 1; i--)
+ tail_mapcount += __split_huge_page_tail(head, i, lruvec, list);
+ atomic_sub(tail_mapcount, &head->_count);
+
+ ClearPageCompound(head);
+ spin_unlock_irq(&zone->lru_lock);
+
+ unfreeze_page(page_anon_vma(head), head);
+
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ struct page *subpage = head + i;
+ if (subpage == page)
+ continue;
+ unlock_page(subpage);
+
+ /*
+ * Subpages may be freed if there wasn't any mapping
+ * like if add_to_swap() is running on a lru page that
+ * had its mapping zapped. And freeing these pages
+ * requires taking the lru_lock so we do the put_page
+ * of the tail pages after the split is complete.
+ */
+ put_page(subpage);
+ }
+}
+
+int total_mapcount(struct page *page)
+{
+ int i, ret;
+
+ VM_BUG_ON_PAGE(PageTail(page), page);
+
+ if (likely(!PageCompound(page)))
+ return atomic_read(&page->_mapcount) + 1;
+
+ ret = compound_mapcount(page);
+ if (PageHuge(page))
+ return ret;
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ ret += atomic_read(&page[i]._mapcount) + 1;
+ if (PageDoubleMap(page))
+ ret -= HPAGE_PMD_NR;
+ return ret;
+}
+
+/*
+ * This function splits huge page into normal pages. @page can point to any
+ * subpage of huge page to split. Split doesn't change the position of @page.
+ *
+ * Only caller must hold pin on the @page, otherwise split fails with -EBUSY.
+ * The huge page must be locked.
+ *
+ * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
+ *
+ * Both head page and tail pages will inherit mapping, flags, and so on from
+ * the hugepage.
+ *
+ * GUP pin and PG_locked transferred to @page. Rest subpages can be freed if
+ * they are not mapped.
+ *
+ * Returns 0 if the hugepage is split successfully.
+ * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
+ * us.
+ */
+int split_huge_page_to_list(struct page *page, struct list_head *list)
+{
+ struct page *head = compound_head(page);
+ struct anon_vma *anon_vma;
+ int count, mapcount, ret;
+
+ VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
+ VM_BUG_ON_PAGE(!PageAnon(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
+ VM_BUG_ON_PAGE(!PageCompound(page), page);
+
+ /*
+ * The caller does not necessarily hold an mmap_sem that would prevent
+ * the anon_vma disappearing so we first we take a reference to it
+ * and then lock the anon_vma for write. This is similar to
+ * page_lock_anon_vma_read except the write lock is taken to serialise
+ * against parallel split or collapse operations.
+ */
+ anon_vma = page_get_anon_vma(head);
+ if (!anon_vma) {
+ ret = -EBUSY;
+ goto out;
+ }
+ anon_vma_lock_write(anon_vma);
+
+ /*
+ * Racy check if we can split the page, before freeze_page() will
+ * split PMDs
+ */
+ if (total_mapcount(head) != page_count(head) - 1) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ freeze_page(anon_vma, head);
+ VM_BUG_ON_PAGE(compound_mapcount(head), head);
+
+ count = page_count(head);
+ mapcount = total_mapcount(head);
+ if (mapcount == count - 1) {
+ __split_huge_page(page, list);
+ ret = 0;
+ } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount > count - 1) {
+ pr_alert("total_mapcount: %u, page_count(): %u\n",
+ mapcount, count);
+ if (PageTail(page))
+ dump_page(head, NULL);
+ dump_page(page, "total_mapcount(head) > page_count(head) - 1");
+ BUG();
+ } else {
+ unfreeze_page(anon_vma, head);
+ ret = -EBUSY;
+ }
+
+out_unlock:
+ anon_vma_unlock_write(anon_vma);
+ put_anon_vma(anon_vma);
+out:
+ count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
+ return ret;
+}
+
+void free_transhuge_page(struct page *page)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&split_queue_lock, flags);
+ if (!list_empty(page_deferred_list(page))) {
+ split_queue_len--;
+ list_del(page_deferred_list(page));
+ }
+ spin_unlock_irqrestore(&split_queue_lock, flags);
+ free_compound_page(page);
+}
+
+void deferred_split_huge_page(struct page *page)
+{
+ unsigned long flags;
+
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+
+ spin_lock_irqsave(&split_queue_lock, flags);
+ if (list_empty(page_deferred_list(page))) {
+ list_add_tail(page_deferred_list(page), &split_queue);
+ split_queue_len++;
+ }
+ spin_unlock_irqrestore(&split_queue_lock, flags);
+}
+
+static unsigned long deferred_split_count(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ /*
+ * Split a page from split_queue will free up at least one page,
+ * at most HPAGE_PMD_NR - 1. We don't track exact number.
+ * Let's use HPAGE_PMD_NR / 2 as ballpark.
+ */
+ return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2;
+}
+
+static unsigned long deferred_split_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ unsigned long flags;
+ LIST_HEAD(list), *pos, *next;
+ struct page *page;
+ int split = 0;
+
+ spin_lock_irqsave(&split_queue_lock, flags);
+ list_splice_init(&split_queue, &list);
+
+ /* Take pin on all head pages to avoid freeing them under us */
+ list_for_each_safe(pos, next, &list) {
+ page = list_entry((void *)pos, struct page, mapping);
+ page = compound_head(page);
+ /* race with put_compound_page() */
+ if (!get_page_unless_zero(page)) {
+ list_del_init(page_deferred_list(page));
+ split_queue_len--;
+ }
+ }
+ spin_unlock_irqrestore(&split_queue_lock, flags);
+
+ list_for_each_safe(pos, next, &list) {
+ page = list_entry((void *)pos, struct page, mapping);
+ lock_page(page);
+ /* split_huge_page() removes page from list on success */
+ if (!split_huge_page(page))
+ split++;
+ unlock_page(page);
+ put_page(page);
+ }
+
+ spin_lock_irqsave(&split_queue_lock, flags);
+ list_splice_tail(&list, &split_queue);
+ spin_unlock_irqrestore(&split_queue_lock, flags);
+
+ return split * HPAGE_PMD_NR / 2;
+}
+
+static struct shrinker deferred_split_shrinker = {
+ .count_objects = deferred_split_count,
+ .scan_objects = deferred_split_scan,
+ .seeks = DEFAULT_SEEKS,
+};
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 827bb02a43a4..1101ccd94d46 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1258,8 +1258,8 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order)
/* we rely on prep_new_huge_page to set the destructor */
set_compound_order(page, order);
- __SetPageHead(page);
__ClearPageReserved(page);
+ __SetPageHead(page);
for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
/*
* For gigantic hugepages allocated through bootmem at
@@ -1886,7 +1886,10 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
page = __alloc_buddy_huge_page_with_mpol(h, vma, addr);
if (!page)
goto out_uncharge_cgroup;
-
+ if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
+ SetPagePrivate(page);
+ h->resv_huge_pages--;
+ }
spin_lock(&hugetlb_lock);
list_move(&page->lru, &h->hugepage_activelist);
/* Fall through */
@@ -3126,7 +3129,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
entry = huge_ptep_get(src_pte);
ptepage = pte_page(entry);
get_page(ptepage);
- page_dup_rmap(ptepage);
+ page_dup_rmap(ptepage, true);
set_huge_pte_at(dst, addr, dst_pte, entry);
hugetlb_count_add(pages_per_huge_page(h), dst);
}
@@ -3210,7 +3213,7 @@ again:
set_page_dirty(page);
hugetlb_count_sub(pages_per_huge_page(h), mm);
- page_remove_rmap(page);
+ page_remove_rmap(page, true);
force_flush = !__tlb_remove_page(tlb, page);
if (force_flush) {
address += sz;
@@ -3439,7 +3442,7 @@ retry_avoidcopy:
mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
set_huge_pte_at(mm, address, ptep,
make_huge_pte(vma, new_page, 1));
- page_remove_rmap(old_page);
+ page_remove_rmap(old_page, true);
hugepage_add_new_anon_rmap(new_page, vma, address);
/* Make the old page be freed below */
new_page = old_page;
@@ -3609,7 +3612,7 @@ retry:
ClearPagePrivate(page);
hugepage_add_new_anon_rmap(page, vma, address);
} else
- page_dup_rmap(page);
+ page_dup_rmap(page, true);
new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
set_huge_pte_at(mm, address, ptep, new_pte);
@@ -3889,7 +3892,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
same_page:
if (pages) {
pages[i] = mem_map_offset(page, pfn_offset);
- get_page_foll(pages[i]);
+ get_page(pages[i]);
}
if (vmas)
diff --git a/mm/internal.h b/mm/internal.h
index 38e24b89e4c4..4ae7b7c7462b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -13,6 +13,7 @@
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/pagemap.h>
/*
* The set of flags that only affect watermark checking and reclaim
@@ -33,6 +34,10 @@
/* Do not use these with a slab allocator */
#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
+extern int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ unsigned int flags, pte_t orig_pte);
+
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
@@ -66,50 +71,6 @@ static inline void set_page_refcounted(struct page *page)
set_page_count(page, 1);
}
-static inline void __get_page_tail_foll(struct page *page,
- bool get_page_head)
-{
- /*
- * If we're getting a tail page, the elevated page->_count is
- * required only in the head page and we will elevate the head
- * page->_count and tail page->_mapcount.
- *
- * We elevate page_tail->_mapcount for tail pages to force
- * page_tail->_count to be zero at all times to avoid getting
- * false positives from get_page_unless_zero() with
- * speculative page access (like in
- * page_cache_get_speculative()) on tail pages.
- */
- VM_BUG_ON_PAGE(atomic_read(&compound_head(page)->_count) <= 0, page);
- if (get_page_head)
- atomic_inc(&compound_head(page)->_count);
- get_huge_page_tail(page);
-}
-
-/*
- * This is meant to be called as the FOLL_GET operation of
- * follow_page() and it must be called while holding the proper PT
- * lock while the pte (or pmd_trans_huge) is still mapping the page.
- */
-static inline void get_page_foll(struct page *page)
-{
- if (unlikely(PageTail(page)))
- /*
- * This is safe only because
- * __split_huge_page_refcount() can't run under
- * get_page_foll() because we hold the proper PT lock.
- */
- __get_page_tail_foll(page, true);
- else {
- /*
- * Getting a normal page or the head of a compound page
- * requires to already have an elevated page->_count.
- */
- VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
- atomic_inc(&page->_count);
- }
-}
-
extern unsigned long highest_memmap_pfn;
/*
@@ -309,10 +270,27 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-extern unsigned long vma_address(struct page *page,
- struct vm_area_struct *vma);
-#endif
+/*
+ * At what user virtual address is page expected in @vma?
+ */
+static inline unsigned long
+__vma_address(struct page *page, struct vm_area_struct *vma)
+{
+ pgoff_t pgoff = page_to_pgoff(page);
+ return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+}
+
+static inline unsigned long
+vma_address(struct page *page, struct vm_area_struct *vma)
+{
+ unsigned long address = __vma_address(page, vma);
+
+ /* page should be within @vma mapping range */
+ VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
+
+ return address;
+}
+
#else /* !CONFIG_MMU */
static inline void clear_page_mlock(struct page *page) { }
static inline void mlock_vma_page(struct page *page) { }
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 19423a45d7d7..25c0ad36fe38 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -122,8 +122,7 @@
#define BYTES_PER_POINTER sizeof(void *)
/* GFP bitmask for kmemleak internal allocations */
-#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC | \
- __GFP_NOACCOUNT)) | \
+#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \
__GFP_NORETRY | __GFP_NOMEMALLOC | \
__GFP_NOWARN)
diff --git a/mm/ksm.c b/mm/ksm.c
index b5cd647daa52..30cb0f753e19 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -441,20 +441,6 @@ static void break_cow(struct rmap_item *rmap_item)
up_read(&mm->mmap_sem);
}
-static struct page *page_trans_compound_anon(struct page *page)
-{
- if (PageTransCompound(page)) {
- struct page *head = compound_head(page);
- /*
- * head may actually be splitted and freed from under
- * us but it's ok here.
- */
- if (PageAnon(head))
- return head;
- }
- return NULL;
-}
-
static struct page *get_mergeable_page(struct rmap_item *rmap_item)
{
struct mm_struct *mm = rmap_item->mm;
@@ -470,7 +456,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
page = follow_page(vma, addr, FOLL_GET);
if (IS_ERR_OR_NULL(page))
goto out;
- if (PageAnon(page) || page_trans_compound_anon(page)) {
+ if (PageAnon(page)) {
flush_anon_page(vma, page, addr);
flush_dcache_page(page);
} else {
@@ -958,13 +944,13 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
}
get_page(kpage);
- page_add_anon_rmap(kpage, vma, addr);
+ page_add_anon_rmap(kpage, vma, addr, false);
flush_cache_page(vma, addr, pte_pfn(*ptep));
ptep_clear_flush_notify(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
- page_remove_rmap(page);
+ page_remove_rmap(page, false);
if (!page_mapped(page))
try_to_free_swap(page);
put_page(page);
@@ -977,33 +963,6 @@ out:
return err;
}
-static int page_trans_compound_anon_split(struct page *page)
-{
- int ret = 0;
- struct page *transhuge_head = page_trans_compound_anon(page);
- if (transhuge_head) {
- /* Get the reference on the head to split it. */
- if (get_page_unless_zero(transhuge_head)) {
- /*
- * Recheck we got the reference while the head
- * was still anonymous.
- */
- if (PageAnon(transhuge_head))
- ret = split_huge_page(transhuge_head);
- else
- /*
- * Retry later if split_huge_page run
- * from under us.
- */
- ret = 1;
- put_page(transhuge_head);
- } else
- /* Retry later if split_huge_page run from under us. */
- ret = 1;
- }
- return ret;
-}
-
/*
* try_to_merge_one_page - take two pages and merge them into one
* @vma: the vma that holds the pte pointing to page
@@ -1022,9 +981,6 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
if (page == kpage) /* ksm page forked */
return 0;
- if (PageTransCompound(page) && page_trans_compound_anon_split(page))
- goto out;
- BUG_ON(PageTransCompound(page));
if (!PageAnon(page))
goto out;
@@ -1037,6 +993,13 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
*/
if (!trylock_page(page))
goto out;
+
+ if (PageTransCompound(page)) {
+ err = split_huge_page(page);
+ if (err)
+ goto out_unlock;
+ }
+
/*
* If this anonymous page is mapped only here, its pte may need
* to be write-protected. If it's mapped elsewhere, all of its
@@ -1067,6 +1030,7 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
}
}
+out_unlock:
unlock_page(page);
out:
return err;
@@ -1639,8 +1603,7 @@ next_mm:
cond_resched();
continue;
}
- if (PageAnon(*page) ||
- page_trans_compound_anon(*page)) {
+ if (PageAnon(*page)) {
flush_anon_page(vma, *page, ksm_scan.address);
flush_dcache_page(*page);
rmap_item = get_next_rmap_item(slot,
@@ -1903,7 +1866,7 @@ struct page *ksm_might_need_to_copy(struct page *page,
SetPageDirty(new_page);
__SetPageUptodate(new_page);
- __set_page_locked(new_page);
+ __SetPageLocked(new_page);
}
return new_page;
diff --git a/mm/memblock.c b/mm/memblock.c
index d300f1329814..1ab7b9e1b0fc 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1509,12 +1509,12 @@ static int __init_memblock memblock_search(struct memblock_type *type, phys_addr
return -1;
}
-int __init memblock_is_reserved(phys_addr_t addr)
+bool __init memblock_is_reserved(phys_addr_t addr)
{
return memblock_search(&memblock.reserved, addr) != -1;
}
-int __init_memblock memblock_is_memory(phys_addr_t addr)
+bool __init_memblock memblock_is_memory(phys_addr_t addr)
{
return memblock_search(&memblock.memory, addr) != -1;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9acfb165eb52..79a29d564bff 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -316,7 +316,7 @@ void sock_update_memcg(struct sock *sk)
rcu_read_lock();
memcg = mem_cgroup_from_task(current);
cg_proto = sk->sk_prot->proto_cgroup(memcg);
- if (cg_proto && test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags) &&
+ if (cg_proto && cg_proto->active &&
css_tryget_online(&memcg->css)) {
sk->sk_cgrp = cg_proto;
}
@@ -696,7 +696,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
struct page *page,
- int nr_pages)
+ bool compound, int nr_pages)
{
/*
* Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
@@ -709,9 +709,11 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
nr_pages);
- if (PageTransHuge(page))
+ if (compound) {
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
nr_pages);
+ }
/* pagein of a big page is an event. So, ignore page size */
if (nr_pages > 0)
@@ -2128,7 +2130,7 @@ done_restock:
*/
do {
if (page_counter_read(&memcg->memory) > memcg->high) {
- current->memcg_nr_pages_over_high += nr_pages;
+ current->memcg_nr_pages_over_high += batch;
set_notify_resume(current);
break;
}
@@ -2332,7 +2334,7 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
* Can't be called in interrupt context or from kernel threads.
* This function needs to be called with rcu_read_lock() held.
*/
-struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
+struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
{
struct mem_cgroup *memcg;
struct kmem_cache *memcg_cachep;
@@ -2340,6 +2342,12 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
VM_BUG_ON(!is_root_cache(cachep));
+ if (cachep->flags & SLAB_ACCOUNT)
+ gfp |= __GFP_ACCOUNT;
+
+ if (!(gfp & __GFP_ACCOUNT))
+ return cachep;
+
if (current->memcg_kmem_skip_account)
return cachep;
@@ -2435,9 +2443,7 @@ void __memcg_kmem_uncharge(struct page *page, int order)
/*
* Because tail pages are not marked as "used", set it. We're under
- * zone->lru_lock, 'splitting on pmd' and compound_lock.
- * charge/uncharge will be never happen and move_account() is done under
- * compound_lock(), so we don't have to take care of races.
+ * zone->lru_lock and migration entries setup in all page mappings.
*/
void mem_cgroup_split_huge_fixup(struct page *head)
{
@@ -4499,38 +4505,30 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
* @from: mem_cgroup which the page is moved from.
* @to: mem_cgroup which the page is moved to. @from != @to.
*
- * The caller must confirm following.
- * - page is not on LRU (isolate_page() is useful.)
- * - compound_lock is held when nr_pages > 1
+ * The caller must make sure the page is not on LRU (isolate_page() is useful.)
*
* This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
* from old cgroup.
*/
static int mem_cgroup_move_account(struct page *page,
- unsigned int nr_pages,
+ bool compound,
struct mem_cgroup *from,
struct mem_cgroup *to)
{
unsigned long flags;
+ unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
int ret;
bool anon;
VM_BUG_ON(from == to);
VM_BUG_ON_PAGE(PageLRU(page), page);
- /*
- * The page is isolated from LRU. So, collapse function
- * will not handle this page. But page splitting can happen.
- * Do this check under compound_page_lock(). The caller should
- * hold it.
- */
- ret = -EBUSY;
- if (nr_pages > 1 && !PageTransHuge(page))
- goto out;
+ VM_BUG_ON(compound && !PageTransHuge(page));
/*
* Prevent mem_cgroup_replace_page() from looking at
* page->mem_cgroup of its source page while we change it.
*/
+ ret = -EBUSY;
if (!trylock_page(page))
goto out;
@@ -4585,9 +4583,9 @@ static int mem_cgroup_move_account(struct page *page,
ret = 0;
local_irq_disable();
- mem_cgroup_charge_statistics(to, page, nr_pages);
+ mem_cgroup_charge_statistics(to, page, compound, nr_pages);
memcg_check_events(to, page);
- mem_cgroup_charge_statistics(from, page, -nr_pages);
+ mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
memcg_check_events(from, page);
local_irq_enable();
out_unlock:
@@ -4677,7 +4675,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
pte_t *pte;
spinlock_t *ptl;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
mc.precharge += HPAGE_PMD_NR;
spin_unlock(ptl);
@@ -4861,17 +4859,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
union mc_target target;
struct page *page;
- /*
- * We don't take compound_lock() here but no race with splitting thp
- * happens because:
- * - if pmd_trans_huge_lock() returns 1, the relevant thp is not
- * under splitting, which means there's no concurrent thp split,
- * - if another thread runs into split_huge_page() just after we
- * entered this if-block, the thread must wait for page table lock
- * to be unlocked in __split_huge_page_splitting(), where the main
- * part of thp split is not executed yet.
- */
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
if (mc.precharge < HPAGE_PMD_NR) {
spin_unlock(ptl);
return 0;
@@ -4880,7 +4868,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
if (target_type == MC_TARGET_PAGE) {
page = target.page;
if (!isolate_lru_page(page)) {
- if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
+ if (!mem_cgroup_move_account(page, true,
mc.from, mc.to)) {
mc.precharge -= HPAGE_PMD_NR;
mc.moved_charge += HPAGE_PMD_NR;
@@ -4909,7 +4897,8 @@ retry:
page = target.page;
if (isolate_lru_page(page))
goto put;
- if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) {
+ if (!mem_cgroup_move_account(page, false,
+ mc.from, mc.to)) {
mc.precharge--;
/* we uncharge from mc.from later. */
mc.moved_charge++;
@@ -5250,10 +5239,11 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
* with mem_cgroup_cancel_charge() in case page instantiation fails.
*/
int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcgp)
+ gfp_t gfp_mask, struct mem_cgroup **memcgp,
+ bool compound)
{
struct mem_cgroup *memcg = NULL;
- unsigned int nr_pages = 1;
+ unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
int ret = 0;
if (mem_cgroup_disabled())
@@ -5283,11 +5273,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
}
}
- if (PageTransHuge(page)) {
- nr_pages <<= compound_order(page);
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- }
-
if (!memcg)
memcg = get_mem_cgroup_from_mm(mm);
@@ -5316,9 +5301,9 @@ out:
* Use mem_cgroup_cancel_charge() to cancel the transaction instead.
*/
void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
- bool lrucare)
+ bool lrucare, bool compound)
{
- unsigned int nr_pages = 1;
+ unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
VM_BUG_ON_PAGE(!page->mapping, page);
VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
@@ -5335,13 +5320,8 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
commit_charge(page, memcg, lrucare);
- if (PageTransHuge(page)) {
- nr_pages <<= compound_order(page);
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- }
-
local_irq_disable();
- mem_cgroup_charge_statistics(memcg, page, nr_pages);
+ mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
memcg_check_events(memcg, page);
local_irq_enable();
@@ -5363,9 +5343,10 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
*
* Cancel a charge transaction started by mem_cgroup_try_charge().
*/
-void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
+void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
+ bool compound)
{
- unsigned int nr_pages = 1;
+ unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
if (mem_cgroup_disabled())
return;
@@ -5377,11 +5358,6 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
if (!memcg)
return;
- if (PageTransHuge(page)) {
- nr_pages <<= compound_order(page);
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- }
-
cancel_charge(memcg, nr_pages);
}
@@ -5627,7 +5603,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
* only synchronisation we have for udpating the per-CPU variables.
*/
VM_BUG_ON(!irqs_disabled());
- mem_cgroup_charge_statistics(memcg, page, -1);
+ mem_cgroup_charge_statistics(memcg, page, false, -1);
memcg_check_events(memcg, page);
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 8424b64711ac..1b99403d76f2 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -882,15 +882,7 @@ int get_hwpoison_page(struct page *page)
{
struct page *head = compound_head(page);
- if (PageHuge(head))
- return get_page_unless_zero(head);
-
- /*
- * Thp tail page has special refcounting rule (refcount of tail pages
- * is stored in ->_mapcount,) so we can't call get_page_unless_zero()
- * directly for tail pages.
- */
- if (PageTransHuge(head)) {
+ if (!PageHuge(head) && PageTransHuge(head)) {
/*
* Non anonymous thp exists only in allocation/free time. We
* can't handle such a case correctly, so let's give it up.
@@ -902,41 +894,12 @@ int get_hwpoison_page(struct page *page)
page_to_pfn(page));
return 0;
}
-
- if (get_page_unless_zero(head)) {
- if (PageTail(page))
- get_page(page);
- return 1;
- } else {
- return 0;
- }
}
- return get_page_unless_zero(page);
+ return get_page_unless_zero(head);
}
EXPORT_SYMBOL_GPL(get_hwpoison_page);
-/**
- * put_hwpoison_page() - Put refcount for memory error handling:
- * @page: raw error page (hit by memory error)
- */
-void put_hwpoison_page(struct page *page)
-{
- struct page *head = compound_head(page);
-
- if (PageHuge(head)) {
- put_page(head);
- return;
- }
-
- if (PageTransHuge(head))
- if (page != head)
- put_page(head);
-
- put_page(page);
-}
-EXPORT_SYMBOL_GPL(put_hwpoison_page);
-
/*
* Do all that is necessary to remove user space mappings. Unmap
* the pages and send SIGBUS to the processes if the data was dirty.
@@ -1149,7 +1112,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
}
if (!PageHuge(p) && PageTransHuge(hpage)) {
+ lock_page(hpage);
if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
+ unlock_page(hpage);
if (!PageAnon(hpage))
pr_err("MCE: %#lx: non anonymous thp\n", pfn);
else
@@ -1159,6 +1124,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
put_hwpoison_page(p);
return -EBUSY;
}
+ unlock_page(hpage);
+ get_hwpoison_page(p);
+ put_hwpoison_page(hpage);
VM_BUG_ON_PAGE(!page_count(p), p);
hpage = compound_head(p);
}
@@ -1166,7 +1134,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
/*
* We ignore non-LRU pages for good reasons.
* - PG_locked is only well defined for LRU pages and a few others
- * - to avoid races with __set_page_locked()
+ * - to avoid races with __SetPageLocked()
* - to avoid races with __SetPageSlab*() (and more non-atomic ops)
* The check (unnecessarily) ignores LRU pages being isolated and
* walked by the page reclaim code, however that's not a big loss.
@@ -1572,7 +1540,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
* Did it turn free?
*/
ret = __get_any_page(page, pfn, 0);
- if (!PageLRU(page)) {
+ if (ret == 1 && !PageLRU(page)) {
/* Drop page reference which is from __get_any_page() */
put_hwpoison_page(page);
pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
@@ -1750,21 +1718,28 @@ int soft_offline_page(struct page *page, int flags)
put_hwpoison_page(page);
return -EBUSY;
}
- if (!PageHuge(page) && PageTransHuge(hpage)) {
- if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
- pr_info("soft offline: %#lx: failed to split THP\n",
- pfn);
- if (flags & MF_COUNT_INCREASED)
- put_hwpoison_page(page);
- return -EBUSY;
- }
- }
get_online_mems();
-
ret = get_any_page(page, pfn, flags);
put_online_mems();
+
if (ret > 0) { /* for in-use pages */
+ if (!PageHuge(page) && PageTransHuge(hpage)) {
+ lock_page(hpage);
+ ret = split_huge_page(hpage);
+ unlock_page(hpage);
+ if (unlikely(ret || PageTransCompound(page) ||
+ !PageAnon(page))) {
+ pr_info("soft offline: %#lx: failed to split THP\n",
+ pfn);
+ if (flags & MF_COUNT_INCREASED)
+ put_hwpoison_page(hpage);
+ return -EBUSY;
+ }
+ get_hwpoison_page(page);
+ put_hwpoison_page(hpage);
+ }
+
if (PageHuge(page))
ret = soft_offline_huge_page(page, flags);
else
diff --git a/mm/memory.c b/mm/memory.c
index c387430f06c3..ed00801ce0c7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -566,7 +566,6 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
{
spinlock_t *ptl;
pgtable_t new = pte_alloc_one(mm, address);
- int wait_split_huge_page;
if (!new)
return -ENOMEM;
@@ -586,18 +585,14 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
ptl = pmd_lock(mm, pmd);
- wait_split_huge_page = 0;
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
atomic_long_inc(&mm->nr_ptes);
pmd_populate(mm, pmd, new);
new = NULL;
- } else if (unlikely(pmd_trans_splitting(*pmd)))
- wait_split_huge_page = 1;
+ }
spin_unlock(ptl);
if (new)
pte_free(mm, new);
- if (wait_split_huge_page)
- wait_split_huge_page(vma->anon_vma, pmd);
return 0;
}
@@ -613,8 +608,7 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
pmd_populate_kernel(&init_mm, pmd, new);
new = NULL;
- } else
- VM_BUG_ON(pmd_trans_splitting(*pmd));
+ }
spin_unlock(&init_mm.page_table_lock);
if (new)
pte_free_kernel(&init_mm, new);
@@ -832,10 +826,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
} else if (is_migration_entry(entry)) {
page = migration_entry_to_page(entry);
- if (PageAnon(page))
- rss[MM_ANONPAGES]++;
- else
- rss[MM_FILEPAGES]++;
+ rss[mm_counter(page)]++;
if (is_write_migration_entry(entry) &&
is_cow_mapping(vm_flags)) {
@@ -873,11 +864,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
page = vm_normal_page(vma, addr, pte);
if (page) {
get_page(page);
- page_dup_rmap(page);
- if (PageAnon(page))
- rss[MM_ANONPAGES]++;
- else
- rss[MM_FILEPAGES]++;
+ page_dup_rmap(page, false);
+ rss[mm_counter(page)]++;
}
out_set_pte:
@@ -1113,9 +1101,8 @@ again:
tlb_remove_tlb_entry(tlb, pte, addr);
if (unlikely(!page))
continue;
- if (PageAnon(page))
- rss[MM_ANONPAGES]--;
- else {
+
+ if (!PageAnon(page)) {
if (pte_dirty(ptent)) {
force_flush = 1;
set_page_dirty(page);
@@ -1123,9 +1110,9 @@ again:
if (pte_young(ptent) &&
likely(!(vma->vm_flags & VM_SEQ_READ)))
mark_page_accessed(page);
- rss[MM_FILEPAGES]--;
}
- page_remove_rmap(page);
+ rss[mm_counter(page)]--;
+ page_remove_rmap(page, false);
if (unlikely(page_mapcount(page) < 0))
print_bad_pte(vma, addr, ptent, page);
if (unlikely(!__tlb_remove_page(tlb, page))) {
@@ -1146,11 +1133,7 @@ again:
struct page *page;
page = migration_entry_to_page(entry);
-
- if (PageAnon(page))
- rss[MM_ANONPAGES]--;
- else
- rss[MM_FILEPAGES]--;
+ rss[mm_counter(page)]--;
}
if (unlikely(!free_swap_and_cache(entry)))
print_bad_pte(vma, addr, ptent, NULL);
@@ -1204,7 +1187,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
BUG();
}
#endif
- split_huge_page_pmd(vma, addr, pmd);
+ split_huge_pmd(vma, pmd, addr);
} else if (zap_huge_pmd(tlb, vma, pmd, addr))
goto next;
/* fall through */
@@ -1460,7 +1443,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
/* Ok, finally just insert the thing.. */
get_page(page);
- inc_mm_counter_fast(mm, MM_FILEPAGES);
+ inc_mm_counter_fast(mm, mm_counter_file(page));
page_add_file_rmap(page);
set_pte_at(mm, addr, pte, mk_pte(page, prot));
@@ -2083,7 +2066,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
cow_user_page(new_page, old_page, address, vma);
}
- if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
+ if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
goto oom_free_new;
__SetPageUptodate(new_page);
@@ -2097,7 +2080,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
if (likely(pte_same(*page_table, orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
- dec_mm_counter_fast(mm, MM_FILEPAGES);
+ dec_mm_counter_fast(mm,
+ mm_counter_file(old_page));
inc_mm_counter_fast(mm, MM_ANONPAGES);
}
} else {
@@ -2113,8 +2097,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
* thread doing COW.
*/
ptep_clear_flush_notify(vma, address, page_table);
- page_add_new_anon_rmap(new_page, vma, address);
- mem_cgroup_commit_charge(new_page, memcg, false);
+ page_add_new_anon_rmap(new_page, vma, address, false);
+ mem_cgroup_commit_charge(new_page, memcg, false, false);
lru_cache_add_active_or_unevictable(new_page, vma);
/*
* We call the notify macro here because, when using secondary
@@ -2146,14 +2130,14 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
* mapcount is visible. So transitively, TLBs to
* old page will be flushed before it can be reused.
*/
- page_remove_rmap(old_page);
+ page_remove_rmap(old_page, false);
}
/* Free the old page.. */
new_page = old_page;
page_copied = 1;
} else {
- mem_cgroup_cancel_charge(new_page, memcg);
+ mem_cgroup_cancel_charge(new_page, memcg, false);
}
if (new_page)
@@ -2168,7 +2152,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
*/
if (page_copied && (vma->vm_flags & VM_LOCKED)) {
lock_page(old_page); /* LRU manipulation */
- munlock_vma_page(old_page);
+ if (PageMlocked(old_page))
+ munlock_vma_page(old_page);
unlock_page(old_page);
}
page_cache_release(old_page);
@@ -2443,7 +2428,7 @@ EXPORT_SYMBOL(unmap_mapping_range);
* We return with the mmap_sem locked or unlocked in the same cases
* as does filemap_fault().
*/
-static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
+int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags, pte_t orig_pte)
{
@@ -2528,7 +2513,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto out_page;
}
- if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) {
+ if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) {
ret = VM_FAULT_OOM;
goto out_page;
}
@@ -2562,7 +2547,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
flags &= ~FAULT_FLAG_WRITE;
ret |= VM_FAULT_WRITE;
- exclusive = 1;
+ exclusive = RMAP_EXCLUSIVE;
}
flush_icache_page(vma, page);
if (pte_swp_soft_dirty(orig_pte))
@@ -2570,10 +2555,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
set_pte_at(mm, address, page_table, pte);
if (page == swapcache) {
do_page_add_anon_rmap(page, vma, address, exclusive);
- mem_cgroup_commit_charge(page, memcg, true);
+ mem_cgroup_commit_charge(page, memcg, true, false);
} else { /* ksm created a completely new copy */
- page_add_new_anon_rmap(page, vma, address);
- mem_cgroup_commit_charge(page, memcg, false);
+ page_add_new_anon_rmap(page, vma, address, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
}
@@ -2608,7 +2593,7 @@ unlock:
out:
return ret;
out_nomap:
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
pte_unmap_unlock(page_table, ptl);
out_page:
unlock_page(page);
@@ -2702,7 +2687,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (!page)
goto oom;
- if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
+ if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false))
goto oom_free_page;
/*
@@ -2723,15 +2708,15 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(page_table, ptl);
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
page_cache_release(page);
return handle_userfault(vma, address, flags,
VM_UFFD_MISSING);
}
inc_mm_counter_fast(mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, address);
- mem_cgroup_commit_charge(page, memcg, false);
+ page_add_new_anon_rmap(page, vma, address, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
setpte:
set_pte_at(mm, address, page_table, entry);
@@ -2742,7 +2727,7 @@ unlock:
pte_unmap_unlock(page_table, ptl);
return 0;
release:
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
page_cache_release(page);
goto unlock;
oom_free_page:
@@ -2818,9 +2803,9 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address,
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (anon) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, address);
+ page_add_new_anon_rmap(page, vma, address, false);
} else {
- inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES);
+ inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page);
}
set_pte_at(vma->vm_mm, address, pte, entry);
@@ -2993,7 +2978,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (!new_page)
return VM_FAULT_OOM;
- if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) {
+ if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) {
page_cache_release(new_page);
return VM_FAULT_OOM;
}
@@ -3022,7 +3007,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
goto uncharge_out;
}
do_set_pte(vma, address, new_page, pte, true, true);
- mem_cgroup_commit_charge(new_page, memcg, false);
+ mem_cgroup_commit_charge(new_page, memcg, false, false);
lru_cache_add_active_or_unevictable(new_page, vma);
pte_unmap_unlock(pte, ptl);
if (fault_page) {
@@ -3037,7 +3022,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
return ret;
uncharge_out:
- mem_cgroup_cancel_charge(new_page, memcg);
+ mem_cgroup_cancel_charge(new_page, memcg, false);
page_cache_release(new_page);
return ret;
}
@@ -3089,7 +3074,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* pinned by vma->vm_file's reference. We rely on unlock_page()'s
* release semantics to prevent the compiler from undoing this copying.
*/
- mapping = fault_page->mapping;
+ mapping = page_rmapping(fault_page);
unlock_page(fault_page);
if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
/*
@@ -3191,6 +3176,12 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
return 0;
}
+ /* TODO: handle PTE-mapped THP */
+ if (PageCompound(page)) {
+ pte_unmap_unlock(ptep, ptl);
+ return 0;
+ }
+
/*
* Avoid grouping on RO pages in general. RO pages shouldn't hurt as
* much anyway since they can be in shared cache state. This misses
@@ -3366,14 +3357,6 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (pmd_trans_huge(orig_pmd)) {
unsigned int dirty = flags & FAULT_FLAG_WRITE;
- /*
- * If the pmd is splitting, return and retry the
- * the fault. Alternative: wait until the split
- * is done, and goto retry.
- */
- if (pmd_trans_splitting(orig_pmd))
- return 0;
-
if (pmd_protnone(orig_pmd))
return do_huge_pmd_numa_page(mm, vma, address,
orig_pmd, pmd);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 87a177917cb2..496214bd82e2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -489,14 +489,31 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
struct page *page;
struct queue_pages *qp = walk->private;
unsigned long flags = qp->flags;
- int nid;
+ int nid, ret;
pte_t *pte;
spinlock_t *ptl;
- split_huge_page_pmd(vma, addr, pmd);
- if (pmd_trans_unstable(pmd))
- return 0;
+ if (pmd_trans_huge(*pmd)) {
+ ptl = pmd_lock(walk->mm, pmd);
+ if (pmd_trans_huge(*pmd)) {
+ page = pmd_page(*pmd);
+ if (is_huge_zero_page(page)) {
+ spin_unlock(ptl);
+ split_huge_pmd(vma, pmd, addr);
+ } else {
+ get_page(page);
+ spin_unlock(ptl);
+ lock_page(page);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ if (ret)
+ return 0;
+ }
+ }
+ }
+retry:
pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE) {
if (!pte_present(*pte))
@@ -513,6 +530,21 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
nid = page_to_nid(page);
if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
continue;
+ if (PageTail(page) && PageAnon(page)) {
+ get_page(page);
+ pte_unmap_unlock(pte, ptl);
+ lock_page(page);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ /* Failed to split -- skip. */
+ if (ret) {
+ pte = pte_offset_map_lock(walk->mm, pmd,
+ addr, &ptl);
+ continue;
+ }
+ goto retry;
+ }
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
migrate_page_add(page, qp->pagelist, flags);
@@ -2142,12 +2174,14 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
*
* Remember policies even when nobody has shared memory mapped.
* The policies are kept in Red-Black tree linked from the inode.
- * They are protected by the sp->lock spinlock, which should be held
+ * They are protected by the sp->lock rwlock, which should be held
* for any accesses to the tree.
*/
-/* lookup first element intersecting start-end */
-/* Caller holds sp->lock */
+/*
+ * lookup first element intersecting start-end. Caller holds sp->lock for
+ * reading or for writing
+ */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
@@ -2178,8 +2212,10 @@ sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
return rb_entry(n, struct sp_node, nd);
}
-/* Insert a new shared policy into the list. */
-/* Caller holds sp->lock */
+/*
+ * Insert a new shared policy into the list. Caller holds sp->lock for
+ * writing.
+ */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
struct rb_node **p = &sp->root.rb_node;
@@ -2211,13 +2247,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
if (!sp->root.rb_node)
return NULL;
- spin_lock(&sp->lock);
+ read_lock(&sp->lock);
sn = sp_lookup(sp, idx, idx+1);
if (sn) {
mpol_get(sn->policy);
pol = sn->policy;
}
- spin_unlock(&sp->lock);
+ read_unlock(&sp->lock);
return pol;
}
@@ -2360,7 +2396,7 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
int ret = 0;
restart:
- spin_lock(&sp->lock);
+ write_lock(&sp->lock);
n = sp_lookup(sp, start, end);
/* Take care of old policies in the same range. */
while (n && n->start < end) {
@@ -2393,7 +2429,7 @@ restart:
}
if (new)
sp_insert(sp, new);
- spin_unlock(&sp->lock);
+ write_unlock(&sp->lock);
ret = 0;
err_out:
@@ -2405,7 +2441,7 @@ err_out:
return ret;
alloc_new:
- spin_unlock(&sp->lock);
+ write_unlock(&sp->lock);
ret = -ENOMEM;
n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
if (!n_new)
@@ -2431,7 +2467,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
int ret;
sp->root = RB_ROOT; /* empty tree == default mempolicy */
- spin_lock_init(&sp->lock);
+ rwlock_init(&sp->lock);
if (mpol) {
struct vm_area_struct pvma;
@@ -2497,14 +2533,14 @@ void mpol_free_shared_policy(struct shared_policy *p)
if (!p->root.rb_node)
return;
- spin_lock(&p->lock);
+ write_lock(&p->lock);
next = rb_first(&p->root);
while (next) {
n = rb_entry(next, struct sp_node, nd);
next = rb_next(&n->nd);
sp_delete(p, n);
}
- spin_unlock(&p->lock);
+ write_unlock(&p->lock);
}
#ifdef CONFIG_NUMA_BALANCING
diff --git a/mm/migrate.c b/mm/migrate.c
index 7890d0bb5e23..12e9ab9de446 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -38,6 +38,7 @@
#include <linux/balloon_compaction.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
+#include <linux/page_owner.h>
#include <asm/tlbflush.h>
@@ -46,6 +47,16 @@
#include "internal.h"
+char *migrate_reason_names[MR_TYPES] = {
+ "compaction",
+ "memory_failure",
+ "memory_hotplug",
+ "syscall_or_cpuset",
+ "mempolicy_mbind",
+ "numa_misplaced",
+ "cma",
+};
+
/*
* migrate_prep() needs to be called before we start compiling a list of pages
* to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
@@ -165,9 +176,9 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
if (PageAnon(new))
hugepage_add_anon_rmap(new, vma, addr);
else
- page_dup_rmap(new);
+ page_dup_rmap(new, true);
} else if (PageAnon(new))
- page_add_anon_rmap(new, vma, addr);
+ page_add_anon_rmap(new, vma, addr, false);
else
page_add_file_rmap(new);
@@ -578,6 +589,8 @@ void migrate_page_copy(struct page *newpage, struct page *page)
*/
if (PageWriteback(newpage))
end_page_writeback(newpage);
+
+ copy_page_owner(page, newpage);
}
/************************************************************
@@ -943,13 +956,19 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
goto out;
}
- if (unlikely(PageTransHuge(page)))
- if (unlikely(split_huge_page(page)))
+ if (unlikely(PageTransHuge(page))) {
+ lock_page(page);
+ rc = split_huge_page(page);
+ unlock_page(page);
+ if (rc)
goto out;
+ }
rc = __unmap_and_move(page, newpage, force, mode);
- if (rc == MIGRATEPAGE_SUCCESS)
+ if (rc == MIGRATEPAGE_SUCCESS) {
put_new_page = NULL;
+ set_page_owner_migrate_reason(newpage, reason);
+ }
out:
if (rc != -EAGAIN) {
@@ -1014,7 +1033,7 @@ out:
static int unmap_and_move_huge_page(new_page_t get_new_page,
free_page_t put_new_page, unsigned long private,
struct page *hpage, int force,
- enum migrate_mode mode)
+ enum migrate_mode mode, int reason)
{
int rc = -EAGAIN;
int *result = NULL;
@@ -1072,6 +1091,7 @@ put_anon:
if (rc == MIGRATEPAGE_SUCCESS) {
hugetlb_cgroup_migrate(hpage, new_hpage);
put_new_page = NULL;
+ set_page_owner_migrate_reason(new_hpage, reason);
}
unlock_page(hpage);
@@ -1144,7 +1164,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
if (PageHuge(page))
rc = unmap_and_move_huge_page(get_new_page,
put_new_page, private, page,
- pass > 2, mode);
+ pass > 2, mode, reason);
else
rc = unmap_and_move(get_new_page, put_new_page,
private, page, pass > 2, mode,
@@ -1756,6 +1776,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
HPAGE_PMD_ORDER);
if (!new_page)
goto out_fail;
+ prep_transhuge_page(new_page);
isolated = numamigrate_isolate_page(pgdat, page);
if (!isolated) {
@@ -1767,7 +1788,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
flush_tlb_range(vma, mmun_start, mmun_end);
/* Prepare a page as a migration target */
- __set_page_locked(new_page);
+ __SetPageLocked(new_page);
SetPageSwapBacked(new_page);
/* anon mapping, we can simply copy page->mapping to the new page: */
@@ -1815,7 +1836,7 @@ fail_putback:
* guarantee the copy is visible before the pagetable update.
*/
flush_cache_range(vma, mmun_start, mmun_end);
- page_add_anon_rmap(new_page, vma, mmun_start);
+ page_add_anon_rmap(new_page, vma, mmun_start, true);
pmdp_huge_clear_flush_notify(vma, mmun_start, pmd);
set_pmd_at(mm, mmun_start, pmd, entry);
flush_tlb_range(vma, mmun_start, mmun_end);
@@ -1826,14 +1847,15 @@ fail_putback:
flush_tlb_range(vma, mmun_start, mmun_end);
mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
update_mmu_cache_pmd(vma, address, &entry);
- page_remove_rmap(new_page);
+ page_remove_rmap(new_page, true);
goto fail_putback;
}
mlock_migrate_page(new_page, page);
set_page_memcg(new_page, page_memcg(page));
set_page_memcg(page, NULL);
- page_remove_rmap(page);
+ page_remove_rmap(page, true);
+ set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
diff --git a/mm/mincore.c b/mm/mincore.c
index 14bb9fb37f0c..2a565ed8bb49 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -117,7 +117,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
unsigned char *vec = walk->private;
int nr = (end - addr) >> PAGE_SHIFT;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
memset(vec, 1, nr);
spin_unlock(ptl);
goto out;
diff --git a/mm/mlock.c b/mm/mlock.c
index 339d9e0949b6..af421d8bd6da 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -82,6 +82,9 @@ void mlock_vma_page(struct page *page)
/* Serialize with page migration */
BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
+
if (!TestSetPageMlocked(page)) {
mod_zone_page_state(page_zone(page), NR_MLOCK,
hpage_nr_pages(page));
@@ -178,6 +181,8 @@ unsigned int munlock_vma_page(struct page *page)
/* For try_to_munlock() and to serialize with page migration */
BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(PageTail(page), page);
+
/*
* Serialize with any parallel __split_huge_page_refcount() which
* might otherwise copy PageMlocked to part of the tail pages before
@@ -425,7 +430,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
while (start < end) {
- struct page *page = NULL;
+ struct page *page;
unsigned int page_mask;
unsigned long page_increm;
struct pagevec pvec;
@@ -444,7 +449,10 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
&page_mask);
if (page && !IS_ERR(page)) {
- if (PageTransHuge(page)) {
+ if (PageTransTail(page)) {
+ VM_BUG_ON_PAGE(PageMlocked(page), page);
+ put_page(page); /* follow_page_mask() */
+ } else if (PageTransHuge(page)) {
lock_page(page);
/*
* Any THP page found by follow_page_mask() may
@@ -477,8 +485,6 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
goto next;
}
}
- /* It's a bug to munlock in the middle of a THP page */
- VM_BUG_ON((start >> PAGE_SHIFT) & page_mask);
page_increm = 1 + page_mask;
start += page_increm * PAGE_SIZE;
next:
diff --git a/mm/mmap.c b/mm/mmap.c
index 2ce04a649f6b..b513f209f650 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -58,6 +58,18 @@
#define arch_rebalance_pgtables(addr, len) (addr)
#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
+int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
+int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX;
+int mmap_rnd_bits = CONFIG_ARCH_MMAP_RND_BITS;
+#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
+int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN;
+int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
+int mmap_rnd_compat_bits = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
+#endif
+
+
static void unmap_region(struct mm_struct *mm,
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end);
@@ -1551,9 +1563,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
* MAP_FIXED may remove pages of mappings that intersects with
* requested mapping. Account for the pages it would unmap.
*/
- if (!(vm_flags & MAP_FIXED))
- return -ENOMEM;
-
nr_pages = count_vma_pages_range(mm, addr, addr + len);
if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages))
@@ -2988,14 +2997,7 @@ out:
*/
int may_expand_vm(struct mm_struct *mm, unsigned long npages)
{
- unsigned long cur = mm->total_vm; /* pages */
- unsigned long lim;
-
- lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT;
-
- if (cur + npages > lim)
- return 0;
- return 1;
+ return mm->total_vm + npages <= rlimit(RLIMIT_AS) >> PAGE_SHIFT;
}
static int special_mapping_fault(struct vm_area_struct *vma,
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 7d87ebb0d632..52687fb4de6f 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -72,16 +72,16 @@ struct zoneref *next_zones_zonelist(struct zoneref *z,
}
#ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL
-int memmap_valid_within(unsigned long pfn,
+bool memmap_valid_within(unsigned long pfn,
struct page *page, struct zone *zone)
{
if (page_to_pfn(page) != pfn)
- return 0;
+ return false;
if (page_zone(page) != zone)
- return 0;
+ return false;
- return 1;
+ return true;
}
#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
diff --git a/mm/mprotect.c b/mm/mprotect.c
index ef5be8eaab00..9c1445dc8a4c 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -160,7 +160,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
if (pmd_trans_huge(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE)
- split_huge_page_pmd(vma, addr, pmd);
+ split_huge_pmd(vma, pmd, addr);
else {
int nr_ptes = change_huge_pmd(vma, pmd, addr,
newprot, prot_numa);
diff --git a/mm/mremap.c b/mm/mremap.c
index c25bc6268e46..9acf51a4f682 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -192,25 +192,24 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
if (!new_pmd)
break;
if (pmd_trans_huge(*old_pmd)) {
- int err = 0;
if (extent == HPAGE_PMD_SIZE) {
+ bool moved;
VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma,
vma);
/* See comment in move_ptes() */
if (need_rmap_locks)
anon_vma_lock_write(vma->anon_vma);
- err = move_huge_pmd(vma, new_vma, old_addr,
+ moved = move_huge_pmd(vma, new_vma, old_addr,
new_addr, old_end,
old_pmd, new_pmd);
if (need_rmap_locks)
anon_vma_unlock_write(vma->anon_vma);
+ if (moved) {
+ need_flush = true;
+ continue;
+ }
}
- if (err > 0) {
- need_flush = true;
- continue;
- } else if (!err) {
- split_huge_page_pmd(vma, old_addr, old_pmd);
- }
+ split_huge_pmd(vma, old_pmd, old_addr);
VM_BUG_ON(pmd_trans_huge(*old_pmd));
}
if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma,
diff --git a/mm/nommu.c b/mm/nommu.c
index 92be862c859b..fbf6f0f1d6c9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -560,7 +560,7 @@ void __init mmap_init(void)
ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
VM_BUG_ON(ret);
- vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
+ vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT);
}
/*
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d13a33918fa2..542d56c93209 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -386,10 +386,12 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
static void dump_header(struct oom_control *oc, struct task_struct *p,
struct mem_cgroup *memcg)
{
- pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
- "oom_score_adj=%hd\n",
- current->comm, oc->gfp_mask, oc->order,
- current->signal->oom_score_adj);
+ pr_warning("%s invoked oom-killer: order=%d, oom_score_adj=%hd, "
+ "gfp_mask=0x%x",
+ current->comm, oc->order, current->signal->oom_score_adj,
+ oc->gfp_mask);
+ dump_gfpflag_names(oc->gfp_mask);
+
cpuset_print_current_mems_allowed();
dump_stack();
if (memcg)
@@ -585,10 +587,11 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
*/
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
mark_oom_victim(victim);
- pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
+ pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
K(get_mm_counter(victim->mm, MM_ANONPAGES)),
- K(get_mm_counter(victim->mm, MM_FILEPAGES)));
+ K(get_mm_counter(victim->mm, MM_FILEPAGES)),
+ K(get_mm_counter(victim->mm, MM_SHMEMPAGES)));
task_unlock(victim);
/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 17a3c66639a9..d6d7c97c0b28 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -229,13 +229,28 @@ static char * const zone_names[MAX_NR_ZONES] = {
#endif
};
-static void free_compound_page(struct page *page);
+char * const migratetype_names[MIGRATE_TYPES] = {
+ "Unmovable",
+ "Movable",
+ "Reclaimable",
+ "HighAtomic",
+#ifdef CONFIG_CMA
+ "CMA",
+#endif
+#ifdef CONFIG_MEMORY_ISOLATION
+ "Isolate",
+#endif
+};
+
compound_page_dtor * const compound_page_dtors[] = {
NULL,
free_compound_page,
#ifdef CONFIG_HUGETLB_PAGE
free_huge_page,
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ free_transhuge_page,
+#endif
};
int min_free_kbytes = 1024;
@@ -433,6 +448,7 @@ static void bad_page(struct page *page, const char *reason,
printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
current->comm, page_to_pfn(page));
dump_page_badflags(page, reason, bad_flags);
+ dump_page_owner(page);
print_modules();
dump_stack();
@@ -457,7 +473,7 @@ out:
* This usage means that zero-order pages may not be compound.
*/
-static void free_compound_page(struct page *page)
+void free_compound_page(struct page *page)
{
__free_pages_ok(page, compound_order(page));
}
@@ -473,8 +489,10 @@ void prep_compound_page(struct page *page, unsigned int order)
for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
set_page_count(p, 0);
+ p->mapping = TAIL_MAPPING;
set_compound_head(p, page);
}
+ atomic_set(compound_mapcount_ptr(page), -1);
}
#ifdef CONFIG_DEBUG_PAGEALLOC
@@ -739,7 +757,7 @@ static inline int free_pages_check(struct page *page)
const char *bad_reason = NULL;
unsigned long bad_flags = 0;
- if (unlikely(page_mapcount(page)))
+ if (unlikely(atomic_read(&page->_mapcount) != -1))
bad_reason = "nonzero mapcount";
if (unlikely(page->mapping != NULL))
bad_reason = "non-NULL mapping";
@@ -863,6 +881,27 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
ret = 0;
goto out;
}
+ switch (page - head_page) {
+ case 1:
+ /* the first tail page: ->mapping is compound_mapcount() */
+ if (unlikely(compound_mapcount(page))) {
+ bad_page(page, "nonzero compound_mapcount", 0);
+ goto out;
+ }
+ break;
+ case 2:
+ /*
+ * the second tail page: ->mapping is
+ * page_deferred_list().next -- ignore value.
+ */
+ break;
+ default:
+ if (page->mapping != TAIL_MAPPING) {
+ bad_page(page, "corrupted mapping in tail page", 0);
+ goto out;
+ }
+ break;
+ }
if (unlikely(!PageTail(page))) {
bad_page(page, "PageTail not set", 0);
goto out;
@@ -873,6 +912,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
}
ret = 0;
out:
+ page->mapping = NULL;
clear_compound_head(page);
return ret;
}
@@ -1336,7 +1376,7 @@ static inline int check_new_page(struct page *page)
const char *bad_reason = NULL;
unsigned long bad_flags = 0;
- if (unlikely(page_mapcount(page)))
+ if (unlikely(atomic_read(&page->_mapcount) != -1))
bad_reason = "nonzero mapcount";
if (unlikely(page->mapping != NULL))
bad_reason = "non-NULL mapping";
@@ -2671,9 +2711,9 @@ void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...)
va_end(args);
}
- pr_warn("%s: page allocation failure: order:%u, mode:0x%x\n",
+ pr_warn("%s: page allocation failure: order:%u, mode:0x%x",
current->comm, order, gfp_mask);
-
+ dump_gfpflag_names(gfp_mask);
dump_stack();
if (!should_suppress_show_mem())
show_mem(filter);
@@ -2876,28 +2916,6 @@ retry:
return page;
}
-/*
- * This is called in the allocator slow-path if the allocation request is of
- * sufficient urgency to ignore watermarks and take other desperate measures
- */
-static inline struct page *
-__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
- const struct alloc_context *ac)
-{
- struct page *page;
-
- do {
- page = get_page_from_freelist(gfp_mask, order,
- ALLOC_NO_WATERMARKS, ac);
-
- if (!page && gfp_mask & __GFP_NOFAIL)
- wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC,
- HZ/50);
- } while (!page && (gfp_mask & __GFP_NOFAIL));
-
- return page;
-}
-
static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
{
struct zoneref *z;
@@ -3042,28 +3060,36 @@ retry:
* allocations are system rather than user orientated
*/
ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
-
- page = __alloc_pages_high_priority(gfp_mask, order, ac);
-
- if (page) {
+ page = get_page_from_freelist(gfp_mask, order,
+ ALLOC_NO_WATERMARKS, ac);
+ if (page)
goto got_pg;
- }
}
/* Caller is not willing to reclaim, we can't balance anything */
if (!can_direct_reclaim) {
/*
- * All existing users of the deprecated __GFP_NOFAIL are
- * blockable, so warn of any new users that actually allow this
- * type of allocation to fail.
+ * All existing users of the __GFP_NOFAIL are blockable, so warn
+ * of any new users that actually allow this type of allocation
+ * to fail.
*/
WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
goto nopage;
}
/* Avoid recursion of direct reclaim */
- if (current->flags & PF_MEMALLOC)
+ if (current->flags & PF_MEMALLOC) {
+ /*
+ * __GFP_NOFAIL request from this context is rather bizarre
+ * because we cannot reclaim anything and only can loop waiting
+ * for somebody to do a work for us.
+ */
+ if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
+ cond_resched();
+ goto retry;
+ }
goto nopage;
+ }
/* Avoid allocations with no watermarks from looping endlessly */
if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
@@ -3402,7 +3428,8 @@ EXPORT_SYMBOL(__free_page_frag);
/*
* alloc_kmem_pages charges newly allocated pages to the kmem resource counter
- * of the current memory cgroup.
+ * of the current memory cgroup if __GFP_ACCOUNT is set, other than that it is
+ * equivalent to alloc_pages.
*
* It should be used when the caller would like to use kmalloc, but since the
* allocation is large, it has to fall back to the page allocator.
@@ -3647,8 +3674,9 @@ static void show_migration_types(unsigned char type)
{
static const char types[MIGRATE_TYPES] = {
[MIGRATE_UNMOVABLE] = 'U',
- [MIGRATE_RECLAIMABLE] = 'E',
[MIGRATE_MOVABLE] = 'M',
+ [MIGRATE_RECLAIMABLE] = 'E',
+ [MIGRATE_HIGHATOMIC] = 'H',
#ifdef CONFIG_CMA
[MIGRATE_CMA] = 'C',
#endif
@@ -6723,8 +6751,12 @@ int alloc_contig_range(unsigned long start, unsigned long end,
if (ret)
return ret;
+ /*
+ * In case of -EBUSY, we'd like to know which page causes problem.
+ * So, just fall through. We will check it in test_pages_isolated().
+ */
ret = __alloc_contig_migrate_range(&cc, start, end);
- if (ret)
+ if (ret && ret != -EBUSY)
goto done;
/*
@@ -6751,12 +6783,25 @@ int alloc_contig_range(unsigned long start, unsigned long end,
outer_start = start;
while (!PageBuddy(pfn_to_page(outer_start))) {
if (++order >= MAX_ORDER) {
- ret = -EBUSY;
- goto done;
+ outer_start = start;
+ break;
}
outer_start &= ~0UL << order;
}
+ if (outer_start != start) {
+ order = page_order(pfn_to_page(outer_start));
+
+ /*
+ * outer_start page could be small order buddy page and
+ * it doesn't include start page. Adjust outer_start
+ * in this case to report failed page properly
+ * on tracepoint in test_pages_isolated()
+ */
+ if (outer_start + (1UL << order) <= start)
+ outer_start = start;
+ }
+
/* Make sure the range is really isolated. */
if (test_pages_isolated(outer_start, end, false)) {
pr_info("%s: [%lx, %lx) PFNs busy\n",
diff --git a/mm/page_idle.c b/mm/page_idle.c
index d5dd79041484..374931f32ebc 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -55,25 +55,22 @@ static int page_idle_clear_pte_refs_one(struct page *page,
unsigned long addr, void *arg)
{
struct mm_struct *mm = vma->vm_mm;
- spinlock_t *ptl;
pmd_t *pmd;
pte_t *pte;
+ spinlock_t *ptl;
bool referenced = false;
- if (unlikely(PageTransHuge(page))) {
- pmd = page_check_address_pmd(page, mm, addr,
- PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
- if (pmd) {
- referenced = pmdp_clear_young_notify(vma, addr, pmd);
- spin_unlock(ptl);
- }
- } else {
- pte = page_check_address(page, mm, addr, &ptl, 0);
- if (pte) {
- referenced = ptep_clear_young_notify(vma, addr, pte);
- pte_unmap_unlock(pte, ptl);
- }
- }
+ if (!page_check_address_transhuge(page, mm, addr, &pmd, &pte, &ptl))
+ return SWAP_AGAIN;
+
+ if (pte) {
+ referenced = ptep_clear_young_notify(vma, addr, pte);
+ pte_unmap(pte);
+ } else
+ referenced = pmdp_clear_young_notify(vma, addr, pmd);
+
+ spin_unlock(ptl);
+
if (referenced) {
clear_page_idle(page);
/*
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 4568fd58f70a..f484b9300fe3 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -9,6 +9,9 @@
#include <linux/hugetlb.h>
#include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/page_isolation.h>
+
static int set_migratetype_isolate(struct page *page,
bool skip_hwpoisoned_pages)
{
@@ -212,7 +215,7 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
*
* Returns 1 if all pages in the range are isolated.
*/
-static int
+static unsigned long
__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
bool skip_hwpoisoned_pages)
{
@@ -237,9 +240,8 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
else
break;
}
- if (pfn < end_pfn)
- return 0;
- return 1;
+
+ return pfn;
}
int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
@@ -248,7 +250,6 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
unsigned long pfn, flags;
struct page *page;
struct zone *zone;
- int ret;
/*
* Note: pageblock_nr_pages != MAX_ORDER. Then, chunks of free pages
@@ -266,10 +267,13 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
/* Check all pages are free or marked as ISOLATED */
zone = page_zone(page);
spin_lock_irqsave(&zone->lock, flags);
- ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn,
+ pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn,
skip_hwpoisoned_pages);
spin_unlock_irqrestore(&zone->lock, flags);
- return ret ? 0 : -EBUSY;
+
+ trace_test_pages_isolated(start_pfn, end_pfn, pfn);
+
+ return pfn < end_pfn ? -EBUSY : 0;
}
struct page *alloc_migrate_target(struct page *page, unsigned long private,
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 983c3a10fa07..a81cfa0c13c3 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -5,10 +5,12 @@
#include <linux/bootmem.h>
#include <linux/stacktrace.h>
#include <linux/page_owner.h>
+#include <linux/jump_label.h>
+#include <linux/migrate.h>
#include "internal.h"
static bool page_owner_disabled = true;
-bool page_owner_inited __read_mostly;
+DEFINE_STATIC_KEY_FALSE(page_owner_inited);
static void init_early_allocated_pages(void);
@@ -37,7 +39,7 @@ static void init_page_owner(void)
if (page_owner_disabled)
return;
- page_owner_inited = true;
+ static_branch_enable(&page_owner_inited);
init_early_allocated_pages();
}
@@ -72,10 +74,18 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
page_ext->order = order;
page_ext->gfp_mask = gfp_mask;
page_ext->nr_entries = trace.nr_entries;
+ page_ext->last_migrate_reason = -1;
__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
}
+void __set_page_owner_migrate_reason(struct page *page, int reason)
+{
+ struct page_ext *page_ext = lookup_page_ext(page);
+
+ page_ext->last_migrate_reason = reason;
+}
+
gfp_t __get_page_owner_gfp(struct page *page)
{
struct page_ext *page_ext = lookup_page_ext(page);
@@ -83,6 +93,31 @@ gfp_t __get_page_owner_gfp(struct page *page)
return page_ext->gfp_mask;
}
+void __copy_page_owner(struct page *oldpage, struct page *newpage)
+{
+ struct page_ext *old_ext = lookup_page_ext(oldpage);
+ struct page_ext *new_ext = lookup_page_ext(newpage);
+ int i;
+
+ new_ext->order = old_ext->order;
+ new_ext->gfp_mask = old_ext->gfp_mask;
+ new_ext->nr_entries = old_ext->nr_entries;
+
+ for (i = 0; i < ARRAY_SIZE(new_ext->trace_entries); i++)
+ new_ext->trace_entries[i] = old_ext->trace_entries[i];
+
+ /*
+ * We don't clear the bit on the oldpage as it's going to be freed
+ * after migration. Until then, the info can be useful in case of
+ * a bug, and the overal stats will be off a bit only temporarily.
+ * Also, migrate_misplaced_transhuge_page() can still fail the
+ * migration and then we want the oldpage to retain the info. But
+ * in that case we also don't need to explicitly clear the info from
+ * the new page, which will be freed.
+ */
+ __set_bit(PAGE_EXT_OWNER, &new_ext->flags);
+}
+
static ssize_t
print_page_owner(char __user *buf, size_t count, unsigned long pfn,
struct page *page, struct page_ext *page_ext)
@@ -110,11 +145,11 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
pageblock_mt = get_pfnblock_migratetype(page, pfn);
page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
ret += snprintf(kbuf + ret, count - ret,
- "PFN %lu Block %lu type %d %s Flags %s%s%s%s%s%s%s%s%s%s%s%s\n",
+ "PFN %lu type %s Block %lu type %s Flags %s%s%s%s%s%s%s%s%s%s%s%s\n",
pfn,
+ migratetype_names[page_mt],
pfn >> pageblock_order,
- pageblock_mt,
- pageblock_mt != page_mt ? "Fallback" : " ",
+ migratetype_names[pageblock_mt],
PageLocked(page) ? "K" : " ",
PageError(page) ? "E" : " ",
PageReferenced(page) ? "R" : " ",
@@ -135,6 +170,14 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
if (ret >= count)
goto err;
+ if (page_ext->last_migrate_reason != -1) {
+ ret += snprintf(kbuf + ret, count - ret,
+ "Page has been migrated, last migrate reason: %s\n",
+ migrate_reason_names[page_ext->last_migrate_reason]);
+ if (ret >= count)
+ goto err;
+ }
+
ret += snprintf(kbuf + ret, count - ret, "\n");
if (ret >= count)
goto err;
@@ -150,6 +193,31 @@ err:
return -ENOMEM;
}
+void __dump_page_owner(struct page *page)
+{
+ struct page_ext *page_ext = lookup_page_ext(page);
+ struct stack_trace trace = {
+ .nr_entries = page_ext->nr_entries,
+ .entries = &page_ext->trace_entries[0],
+ };
+ gfp_t gfp_mask = page_ext->gfp_mask;
+ int mt = gfpflags_to_migratetype(gfp_mask);
+
+ if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
+ pr_alert("page_owner info is not active (free page?)\n");
+ return;
+ }
+ ;
+ pr_alert("page allocated via order %u, migratetype %s, gfp_mask 0x%x",
+ page_ext->order, migratetype_names[mt], gfp_mask);
+ dump_gfpflag_names(gfp_mask);
+ print_stack_trace(&trace, 0);
+
+ if (page_ext->last_migrate_reason != -1)
+ pr_alert("page has been migrated, last migrate reason: %s\n",
+ migrate_reason_names[page_ext->last_migrate_reason]);
+}
+
static ssize_t
read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
@@ -157,7 +225,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
struct page *page;
struct page_ext *page_ext;
- if (!page_owner_inited)
+ if (!static_branch_unlikely(&page_owner_inited))
return -EINVAL;
page = NULL;
@@ -305,7 +373,7 @@ static int __init pageowner_init(void)
{
struct dentry *dentry;
- if (!page_owner_inited) {
+ if (!static_branch_unlikely(&page_owner_inited)) {
pr_info("page_owner is disabled\n");
return 0;
}
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 29f2f8b853ae..207244489a68 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -58,7 +58,7 @@ again:
if (!walk->pte_entry)
continue;
- split_huge_page_pmd_mm(walk->mm, addr, pmd);
+ split_huge_pmd(walk->vma, pmd, addr);
if (pmd_trans_unstable(pmd))
goto again;
err = walk_pte_range(pmd, addr, next, walk);
diff --git a/mm/percpu.c b/mm/percpu.c
index 8a943b97a053..998607adf6eb 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -305,16 +305,12 @@ static void *pcpu_mem_zalloc(size_t size)
/**
* pcpu_mem_free - free memory
* @ptr: memory to free
- * @size: size of the area
*
* Free @ptr. @ptr should have been allocated using pcpu_mem_zalloc().
*/
-static void pcpu_mem_free(void *ptr, size_t size)
+static void pcpu_mem_free(void *ptr)
{
- if (size <= PAGE_SIZE)
- kfree(ptr);
- else
- vfree(ptr);
+ kvfree(ptr);
}
/**
@@ -463,8 +459,8 @@ out_unlock:
* pcpu_mem_free() might end up calling vfree() which uses
* IRQ-unsafe lock and thus can't be called under pcpu_lock.
*/
- pcpu_mem_free(old, old_size);
- pcpu_mem_free(new, new_size);
+ pcpu_mem_free(old);
+ pcpu_mem_free(new);
return 0;
}
@@ -732,7 +728,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *
sizeof(chunk->map[0]));
if (!chunk->map) {
- pcpu_mem_free(chunk, pcpu_chunk_struct_size);
+ pcpu_mem_free(chunk);
return NULL;
}
@@ -753,8 +749,8 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
{
if (!chunk)
return;
- pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
- pcpu_mem_free(chunk, pcpu_chunk_struct_size);
+ pcpu_mem_free(chunk->map);
+ pcpu_mem_free(chunk);
}
/**
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 7d3db0247983..c311a2ec6fea 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -139,18 +139,6 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
}
#endif
-#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp)
-{
- pmd_t pmd = pmd_mksplitting(*pmdp);
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- set_pmd_at(vma->vm_mm, address, pmdp, pmd);
- /* tlb flush only to serialize against gup-fast */
- flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
-}
-#endif
-
#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable)
@@ -176,13 +164,10 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
/* FIFO */
pgtable = pmd_huge_pte(mm, pmdp);
- if (list_empty(&pgtable->lru))
- pmd_huge_pte(mm, pmdp) = NULL;
- else {
- pmd_huge_pte(mm, pmdp) = list_entry(pgtable->lru.next,
- struct page, lru);
+ pmd_huge_pte(mm, pmdp) = list_first_entry_or_null(&pgtable->lru,
+ struct page, lru);
+ if (pmd_huge_pte(mm, pmdp))
list_del(&pgtable->lru);
- }
return pgtable;
}
#endif
diff --git a/mm/rmap.c b/mm/rmap.c
index b577fbb98d4b..6f371261dd12 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -428,8 +428,10 @@ static void anon_vma_ctor(void *data)
void __init anon_vma_init(void)
{
anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
- 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor);
- anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC);
+ 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
+ anon_vma_ctor);
+ anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
+ SLAB_PANIC|SLAB_ACCOUNT);
}
/*
@@ -565,27 +567,6 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
anon_vma_unlock_read(anon_vma);
}
-/*
- * At what user virtual address is page expected in @vma?
- */
-static inline unsigned long
-__vma_address(struct page *page, struct vm_area_struct *vma)
-{
- pgoff_t pgoff = page_to_pgoff(page);
- return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-}
-
-inline unsigned long
-vma_address(struct page *page, struct vm_area_struct *vma)
-{
- unsigned long address = __vma_address(page, vma);
-
- /* page should be within @vma mapping range */
- VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
-
- return address;
-}
-
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
static void percpu_flush_tlb_batch_pages(void *data)
{
@@ -817,6 +798,96 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
return 1;
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * Check that @page is mapped at @address into @mm. In contrast to
+ * page_check_address(), this function can handle transparent huge pages.
+ *
+ * On success returns true with pte mapped and locked. For PMD-mapped
+ * transparent huge pages *@ptep is set to NULL.
+ */
+bool page_check_address_transhuge(struct page *page, struct mm_struct *mm,
+ unsigned long address, pmd_t **pmdp,
+ pte_t **ptep, spinlock_t **ptlp)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ spinlock_t *ptl;
+
+ if (unlikely(PageHuge(page))) {
+ /* when pud is not present, pte will be NULL */
+ pte = huge_pte_offset(mm, address);
+ if (!pte)
+ return false;
+
+ ptl = huge_pte_lockptr(page_hstate(page), mm, pte);
+ pmd = NULL;
+ goto check_pte;
+ }
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ return false;
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ return false;
+ pmd = pmd_offset(pud, address);
+
+ if (pmd_trans_huge(*pmd)) {
+ ptl = pmd_lock(mm, pmd);
+ if (!pmd_present(*pmd))
+ goto unlock_pmd;
+ if (unlikely(!pmd_trans_huge(*pmd))) {
+ spin_unlock(ptl);
+ goto map_pte;
+ }
+
+ if (pmd_page(*pmd) != page)
+ goto unlock_pmd;
+
+ pte = NULL;
+ goto found;
+unlock_pmd:
+ spin_unlock(ptl);
+ return false;
+ } else {
+ pmd_t pmde = *pmd;
+
+ barrier();
+ if (!pmd_present(pmde) || pmd_trans_huge(pmde))
+ return false;
+ }
+map_pte:
+ pte = pte_offset_map(pmd, address);
+ if (!pte_present(*pte)) {
+ pte_unmap(pte);
+ return false;
+ }
+
+ ptl = pte_lockptr(mm, pmd);
+check_pte:
+ spin_lock(ptl);
+
+ if (!pte_present(*pte)) {
+ pte_unmap_unlock(pte, ptl);
+ return false;
+ }
+
+ /* THP can be referenced by any subpage */
+ if (pte_pfn(*pte) - page_to_pfn(page) >= hpage_nr_pages(page)) {
+ pte_unmap_unlock(pte, ptl);
+ return false;
+ }
+found:
+ *ptep = pte;
+ *pmdp = pmd;
+ *ptlp = ptl;
+ return true;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
struct page_referenced_arg {
int mapcount;
int referenced;
@@ -830,49 +901,24 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
unsigned long address, void *arg)
{
struct mm_struct *mm = vma->vm_mm;
+ struct page_referenced_arg *pra = arg;
+ pmd_t *pmd;
+ pte_t *pte;
spinlock_t *ptl;
int referenced = 0;
- struct page_referenced_arg *pra = arg;
- if (unlikely(PageTransHuge(page))) {
- pmd_t *pmd;
+ if (!page_check_address_transhuge(page, mm, address, &pmd, &pte, &ptl))
+ return SWAP_AGAIN;
- /*
- * rmap might return false positives; we must filter
- * these out using page_check_address_pmd().
- */
- pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
- if (!pmd)
- return SWAP_AGAIN;
-
- if (vma->vm_flags & VM_LOCKED) {
- spin_unlock(ptl);
- pra->vm_flags |= VM_LOCKED;
- return SWAP_FAIL; /* To break the loop */
- }
-
- /* go ahead even if the pmd is pmd_trans_splitting() */
- if (pmdp_clear_flush_young_notify(vma, address, pmd))
- referenced++;
+ if (vma->vm_flags & VM_LOCKED) {
+ if (pte)
+ pte_unmap(pte);
spin_unlock(ptl);
- } else {
- pte_t *pte;
-
- /*
- * rmap might return false positives; we must filter
- * these out using page_check_address().
- */
- pte = page_check_address(page, mm, address, &ptl, 0);
- if (!pte)
- return SWAP_AGAIN;
-
- if (vma->vm_flags & VM_LOCKED) {
- pte_unmap_unlock(pte, ptl);
- pra->vm_flags |= VM_LOCKED;
- return SWAP_FAIL; /* To break the loop */
- }
+ pra->vm_flags |= VM_LOCKED;
+ return SWAP_FAIL; /* To break the loop */
+ }
+ if (pte) {
if (ptep_clear_flush_young_notify(vma, address, pte)) {
/*
* Don't treat a reference through a sequentially read
@@ -884,8 +930,12 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
if (likely(!(vma->vm_flags & VM_SEQ_READ)))
referenced++;
}
- pte_unmap_unlock(pte, ptl);
+ pte_unmap(pte);
+ } else {
+ if (pmdp_clear_flush_young_notify(vma, address, pmd))
+ referenced++;
}
+ spin_unlock(ptl);
if (referenced)
clear_page_idle(page);
@@ -933,7 +983,7 @@ int page_referenced(struct page *page,
int ret;
int we_locked = 0;
struct page_referenced_arg pra = {
- .mapcount = page_mapcount(page),
+ .mapcount = total_mapcount(page),
.memcg = memcg,
};
struct rmap_walk_control rwc = {
@@ -1122,7 +1172,7 @@ static void __page_check_anon_rmap(struct page *page,
* over the call to page_add_new_anon_rmap.
*/
BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
- BUG_ON(page->index != linear_page_index(vma, address));
+ BUG_ON(page_to_pgoff(page) != linear_page_index(vma, address));
#endif
}
@@ -1131,6 +1181,7 @@ static void __page_check_anon_rmap(struct page *page,
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
+ * @compound: charge the page as compound or small page
*
* The caller needs to hold the pte lock, and the page must be locked in
* the anon_vma case: to serialize mapping,index checking after setting,
@@ -1138,9 +1189,9 @@ static void __page_check_anon_rmap(struct page *page,
* (but PageKsm is never downgraded to PageAnon).
*/
void page_add_anon_rmap(struct page *page,
- struct vm_area_struct *vma, unsigned long address)
+ struct vm_area_struct *vma, unsigned long address, bool compound)
{
- do_page_add_anon_rmap(page, vma, address, 0);
+ do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0);
}
/*
@@ -1149,29 +1200,44 @@ void page_add_anon_rmap(struct page *page,
* Everybody else should continue to use page_add_anon_rmap above.
*/
void do_page_add_anon_rmap(struct page *page,
- struct vm_area_struct *vma, unsigned long address, int exclusive)
+ struct vm_area_struct *vma, unsigned long address, int flags)
{
- int first = atomic_inc_and_test(&page->_mapcount);
+ bool compound = flags & RMAP_COMPOUND;
+ bool first;
+
+ if (compound) {
+ atomic_t *mapcount;
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+ mapcount = compound_mapcount_ptr(page);
+ first = atomic_inc_and_test(mapcount);
+ } else {
+ first = atomic_inc_and_test(&page->_mapcount);
+ }
+
if (first) {
+ int nr = compound ? hpage_nr_pages(page) : 1;
/*
* We use the irq-unsafe __{inc|mod}_zone_page_stat because
* these counters are not modified in interrupt context, and
* pte lock(a spinlock) is held, which implies preemption
* disabled.
*/
- if (PageTransHuge(page))
+ if (compound) {
__inc_zone_page_state(page,
NR_ANON_TRANSPARENT_HUGEPAGES);
- __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
- hpage_nr_pages(page));
+ }
+ __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr);
}
if (unlikely(PageKsm(page)))
return;
VM_BUG_ON_PAGE(!PageLocked(page), page);
+
/* address might be in next vma when migration races vma_adjust */
if (first)
- __page_set_anon_rmap(page, vma, address, exclusive);
+ __page_set_anon_rmap(page, vma, address,
+ flags & RMAP_EXCLUSIVE);
else
__page_check_anon_rmap(page, vma, address);
}
@@ -1181,21 +1247,31 @@ void do_page_add_anon_rmap(struct page *page,
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
+ * @compound: charge the page as compound or small page
*
* Same as page_add_anon_rmap but must only be called on *new* pages.
* This means the inc-and-test can be bypassed.
* Page does not have to be locked.
*/
void page_add_new_anon_rmap(struct page *page,
- struct vm_area_struct *vma, unsigned long address)
+ struct vm_area_struct *vma, unsigned long address, bool compound)
{
+ int nr = compound ? hpage_nr_pages(page) : 1;
+
VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
SetPageSwapBacked(page);
- atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
- if (PageTransHuge(page))
+ if (compound) {
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+ /* increment count (starts at -1) */
+ atomic_set(compound_mapcount_ptr(page), 0);
__inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
- __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
- hpage_nr_pages(page));
+ } else {
+ /* Anon THP always mapped first with PMD */
+ VM_BUG_ON_PAGE(PageTransCompound(page), page);
+ /* increment count (starts at -1) */
+ atomic_set(&page->_mapcount, 0);
+ }
+ __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr);
__page_set_anon_rmap(page, vma, address, 1);
}
@@ -1223,12 +1299,15 @@ static void page_remove_file_rmap(struct page *page)
memcg = mem_cgroup_begin_page_stat(page);
- /* page still mapped by someone else? */
- if (!atomic_add_negative(-1, &page->_mapcount))
+ /* Hugepages are not counted in NR_FILE_MAPPED for now. */
+ if (unlikely(PageHuge(page))) {
+ /* hugetlb pages are always mapped with pmds */
+ atomic_dec(compound_mapcount_ptr(page));
goto out;
+ }
- /* Hugepages are not counted in NR_FILE_MAPPED for now. */
- if (unlikely(PageHuge(page)))
+ /* page still mapped by someone else? */
+ if (!atomic_add_negative(-1, &page->_mapcount))
goto out;
/*
@@ -1245,41 +1324,76 @@ out:
mem_cgroup_end_page_stat(memcg);
}
+static void page_remove_anon_compound_rmap(struct page *page)
+{
+ int i, nr;
+
+ if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
+ return;
+
+ /* Hugepages are not counted in NR_ANON_PAGES for now. */
+ if (unlikely(PageHuge(page)))
+ return;
+
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+ return;
+
+ __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+
+ if (TestClearPageDoubleMap(page)) {
+ /*
+ * Subpages can be mapped with PTEs too. Check how many of
+ * themi are still mapped.
+ */
+ for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
+ if (atomic_add_negative(-1, &page[i]._mapcount))
+ nr++;
+ }
+ } else {
+ nr = HPAGE_PMD_NR;
+ }
+
+ if (nr) {
+ __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, -nr);
+ deferred_split_huge_page(page);
+ }
+}
+
/**
* page_remove_rmap - take down pte mapping from a page
- * @page: page to remove mapping from
+ * @page: page to remove mapping from
+ * @compound: uncharge the page as compound or small page
*
* The caller needs to hold the pte lock.
*/
-void page_remove_rmap(struct page *page)
+void page_remove_rmap(struct page *page, bool compound)
{
if (!PageAnon(page)) {
+ VM_BUG_ON_PAGE(compound && !PageHuge(page), page);
page_remove_file_rmap(page);
return;
}
+ if (compound)
+ return page_remove_anon_compound_rmap(page);
+
/* page still mapped by someone else? */
if (!atomic_add_negative(-1, &page->_mapcount))
return;
- /* Hugepages are not counted in NR_ANON_PAGES for now. */
- if (unlikely(PageHuge(page)))
- return;
-
/*
* We use the irq-unsafe __{inc|mod}_zone_page_stat because
* these counters are not modified in interrupt context, and
* pte lock(a spinlock) is held, which implies preemption disabled.
*/
- if (PageTransHuge(page))
- __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
-
- __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
- -hpage_nr_pages(page));
+ __dec_zone_page_state(page, NR_ANON_PAGES);
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
+ if (PageTransCompound(page))
+ deferred_split_huge_page(compound_head(page));
+
/*
* It would be tidy to reset the PageAnon mapping here,
* but that might overwrite a racing page_add_anon_rmap
@@ -1362,10 +1476,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if (PageHuge(page)) {
hugetlb_count_sub(1 << compound_order(page), mm);
} else {
- if (PageAnon(page))
- dec_mm_counter(mm, MM_ANONPAGES);
- else
- dec_mm_counter(mm, MM_FILEPAGES);
+ dec_mm_counter(mm, mm_counter(page));
}
set_pte_at(mm, address, pte,
swp_entry_to_pte(make_hwpoison_entry(page)));
@@ -1375,10 +1486,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* interest anymore. Simply discard the pte, vmscan
* will take care of the rest.
*/
- if (PageAnon(page))
- dec_mm_counter(mm, MM_ANONPAGES);
- else
- dec_mm_counter(mm, MM_FILEPAGES);
+ dec_mm_counter(mm, mm_counter(page));
} else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION)) {
swp_entry_t entry;
pte_t swp_pte;
@@ -1418,9 +1526,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
swp_pte = pte_swp_mksoft_dirty(swp_pte);
set_pte_at(mm, address, pte, swp_pte);
} else
- dec_mm_counter(mm, MM_FILEPAGES);
+ dec_mm_counter(mm, mm_counter_file(page));
- page_remove_rmap(page);
+ page_remove_rmap(page, PageHuge(page));
page_cache_release(page);
out_unmap:
@@ -1702,7 +1810,7 @@ void hugepage_add_anon_rmap(struct page *page,
BUG_ON(!PageLocked(page));
BUG_ON(!anon_vma);
/* address might be in next vma when migration races vma_adjust */
- first = atomic_inc_and_test(&page->_mapcount);
+ first = atomic_inc_and_test(compound_mapcount_ptr(page));
if (first)
__hugepage_set_anon_rmap(page, vma, address, 0);
}
@@ -1711,7 +1819,7 @@ void hugepage_add_new_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
- atomic_set(&page->_mapcount, 0);
+ atomic_set(compound_mapcount_ptr(page), 0);
__hugepage_set_anon_rmap(page, vma, address, 1);
}
#endif /* CONFIG_HUGETLB_PAGE */
diff --git a/mm/shmem.c b/mm/shmem.c
index 9187eee4128b..9b051115a100 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -360,6 +360,87 @@ static int shmem_free_swap(struct address_space *mapping,
}
/*
+ * Determine (in bytes) how many of the shmem object's pages mapped by the
+ * given offsets are swapped out.
+ *
+ * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU,
+ * as long as the inode doesn't go away and racy results are not a problem.
+ */
+unsigned long shmem_partial_swap_usage(struct address_space *mapping,
+ pgoff_t start, pgoff_t end)
+{
+ struct radix_tree_iter iter;
+ void **slot;
+ struct page *page;
+ unsigned long swapped = 0;
+
+ rcu_read_lock();
+
+restart:
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ if (iter.index >= end)
+ break;
+
+ page = radix_tree_deref_slot(slot);
+
+ /*
+ * This should only be possible to happen at index 0, so we
+ * don't need to reset the counter, nor do we risk infinite
+ * restarts.
+ */
+ if (radix_tree_deref_retry(page))
+ goto restart;
+
+ if (radix_tree_exceptional_entry(page))
+ swapped++;
+
+ if (need_resched()) {
+ cond_resched_rcu();
+ start = iter.index + 1;
+ goto restart;
+ }
+ }
+
+ rcu_read_unlock();
+
+ return swapped << PAGE_SHIFT;
+}
+
+/*
+ * Determine (in bytes) how many of the shmem object's pages mapped by the
+ * given vma is swapped out.
+ *
+ * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU,
+ * as long as the inode doesn't go away and racy results are not a problem.
+ */
+unsigned long shmem_swap_usage(struct vm_area_struct *vma)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct address_space *mapping = inode->i_mapping;
+ unsigned long swapped;
+
+ /* Be careful as we don't hold info->lock */
+ swapped = READ_ONCE(info->swapped);
+
+ /*
+ * The easier cases are when the shmem object has nothing in swap, or
+ * the vma maps it whole. Then we can simply use the stats that we
+ * already track.
+ */
+ if (!swapped)
+ return 0;
+
+ if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
+ return swapped << PAGE_SHIFT;
+
+ /* Here comes the more involved part */
+ return shmem_partial_swap_usage(mapping,
+ linear_page_index(vma, vma->vm_start),
+ linear_page_index(vma, vma->vm_end));
+}
+
+/*
* SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
*/
void shmem_unlock_mapping(struct address_space *mapping)
@@ -729,7 +810,8 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
* the shmem_swaplist_mutex which might hold up shmem_writepage().
* Charged back to the user (not to caller) when swap account is used.
*/
- error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg);
+ error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg,
+ false);
if (error)
goto out;
/* No radix_tree_preload: swap entry keeps a place for page in tree */
@@ -752,9 +834,9 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
if (error) {
if (error != -ENOMEM)
error = 0;
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
} else
- mem_cgroup_commit_charge(page, memcg, true);
+ mem_cgroup_commit_charge(page, memcg, true, false);
out:
unlock_page(page);
page_cache_release(page);
@@ -1004,7 +1086,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
copy_highpage(newpage, oldpage);
flush_dcache_page(newpage);
- __set_page_locked(newpage);
+ __SetPageLocked(newpage);
SetPageUptodate(newpage);
SetPageSwapBacked(newpage);
set_page_private(newpage, swap_index);
@@ -1137,7 +1219,8 @@ repeat:
goto failed;
}
- error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
+ error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg,
+ false);
if (!error) {
error = shmem_add_to_page_cache(page, mapping, index,
swp_to_radix_entry(swap));
@@ -1154,14 +1237,14 @@ repeat:
* "repeat": reading a hole and writing should succeed.
*/
if (error) {
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
delete_from_swap_cache(page);
}
}
if (error)
goto failed;
- mem_cgroup_commit_charge(page, memcg, true);
+ mem_cgroup_commit_charge(page, memcg, true, false);
spin_lock(&info->lock);
info->swapped--;
@@ -1196,11 +1279,12 @@ repeat:
}
__SetPageSwapBacked(page);
- __set_page_locked(page);
+ __SetPageLocked(page);
if (sgp == SGP_WRITE)
__SetPageReferenced(page);
- error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
+ error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg,
+ false);
if (error)
goto decused;
error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
@@ -1210,10 +1294,10 @@ repeat:
radix_tree_preload_end();
}
if (error) {
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
goto decused;
}
- mem_cgroup_commit_charge(page, memcg, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_anon(page);
spin_lock(&info->lock);
@@ -3109,7 +3193,7 @@ static int shmem_init_inodecache(void)
{
shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
sizeof(struct shmem_inode_info),
- 0, SLAB_PANIC, shmem_init_inode);
+ 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
return 0;
}
diff --git a/mm/slab.h b/mm/slab.h
index 7b6087197997..c63b8699cfa3 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -128,10 +128,11 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
#if defined(CONFIG_SLAB)
#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \
- SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | SLAB_NOTRACK)
+ SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \
+ SLAB_NOTRACK | SLAB_ACCOUNT)
#elif defined(CONFIG_SLUB)
#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
- SLAB_TEMPORARY | SLAB_NOTRACK)
+ SLAB_TEMPORARY | SLAB_NOTRACK | SLAB_ACCOUNT)
#else
#define SLAB_CACHE_FLAGS (0)
#endif
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 3c6a86b4ec25..e016178063e1 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -37,7 +37,8 @@ struct kmem_cache *kmem_cache;
SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
SLAB_FAILSLAB)
-#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | SLAB_NOTRACK)
+#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
+ SLAB_NOTRACK | SLAB_ACCOUNT)
/*
* Merge control. If this is set then no merging of slab caches will occur.
diff --git a/mm/slub.c b/mm/slub.c
index 46997517406e..b21fd24b08b1 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -338,11 +338,13 @@ static inline int oo_objects(struct kmem_cache_order_objects x)
*/
static __always_inline void slab_lock(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
bit_spin_lock(PG_locked, &page->flags);
}
static __always_inline void slab_unlock(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
__bit_spin_unlock(PG_locked, &page->flags);
}
@@ -5362,6 +5364,8 @@ static char *create_unique_id(struct kmem_cache *s)
*p++ = 'F';
if (!(s->flags & SLAB_NOTRACK))
*p++ = 't';
+ if (s->flags & SLAB_ACCOUNT)
+ *p++ = 'A';
if (p != name + 1)
*p++ = '-';
p += sprintf(p, "%07d", s->size);
diff --git a/mm/swap.c b/mm/swap.c
index 39395fb549c0..abffc33bb975 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -89,260 +89,14 @@ static void __put_compound_page(struct page *page)
(*dtor)(page);
}
-/**
- * Two special cases here: we could avoid taking compound_lock_irqsave
- * and could skip the tail refcounting(in _mapcount).
- *
- * 1. Hugetlbfs page:
- *
- * PageHeadHuge will remain true until the compound page
- * is released and enters the buddy allocator, and it could
- * not be split by __split_huge_page_refcount().
- *
- * So if we see PageHeadHuge set, and we have the tail page pin,
- * then we could safely put head page.
- *
- * 2. Slab THP page:
- *
- * PG_slab is cleared before the slab frees the head page, and
- * tail pin cannot be the last reference left on the head page,
- * because the slab code is free to reuse the compound page
- * after a kfree/kmem_cache_free without having to check if
- * there's any tail pin left. In turn all tail pinsmust be always
- * released while the head is still pinned by the slab code
- * and so we know PG_slab will be still set too.
- *
- * So if we see PageSlab set, and we have the tail page pin,
- * then we could safely put head page.
- */
-static __always_inline
-void put_unrefcounted_compound_page(struct page *page_head, struct page *page)
-{
- /*
- * If @page is a THP tail, we must read the tail page
- * flags after the head page flags. The
- * __split_huge_page_refcount side enforces write memory barriers
- * between clearing PageTail and before the head page
- * can be freed and reallocated.
- */
- smp_rmb();
- if (likely(PageTail(page))) {
- /*
- * __split_huge_page_refcount cannot race
- * here, see the comment above this function.
- */
- VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
- if (put_page_testzero(page_head)) {
- /*
- * If this is the tail of a slab THP page,
- * the tail pin must not be the last reference
- * held on the page, because the PG_slab cannot
- * be cleared before all tail pins (which skips
- * the _mapcount tail refcounting) have been
- * released.
- *
- * If this is the tail of a hugetlbfs page,
- * the tail pin may be the last reference on
- * the page instead, because PageHeadHuge will
- * not go away until the compound page enters
- * the buddy allocator.
- */
- VM_BUG_ON_PAGE(PageSlab(page_head), page_head);
- __put_compound_page(page_head);
- }
- } else
- /*
- * __split_huge_page_refcount run before us,
- * @page was a THP tail. The split @page_head
- * has been freed and reallocated as slab or
- * hugetlbfs page of smaller order (only
- * possible if reallocated as slab on x86).
- */
- if (put_page_testzero(page))
- __put_single_page(page);
-}
-
-static __always_inline
-void put_refcounted_compound_page(struct page *page_head, struct page *page)
-{
- if (likely(page != page_head && get_page_unless_zero(page_head))) {
- unsigned long flags;
-
- /*
- * @page_head wasn't a dangling pointer but it may not
- * be a head page anymore by the time we obtain the
- * lock. That is ok as long as it can't be freed from
- * under us.
- */
- flags = compound_lock_irqsave(page_head);
- if (unlikely(!PageTail(page))) {
- /* __split_huge_page_refcount run before us */
- compound_unlock_irqrestore(page_head, flags);
- if (put_page_testzero(page_head)) {
- /*
- * The @page_head may have been freed
- * and reallocated as a compound page
- * of smaller order and then freed
- * again. All we know is that it
- * cannot have become: a THP page, a
- * compound page of higher order, a
- * tail page. That is because we
- * still hold the refcount of the
- * split THP tail and page_head was
- * the THP head before the split.
- */
- if (PageHead(page_head))
- __put_compound_page(page_head);
- else
- __put_single_page(page_head);
- }
-out_put_single:
- if (put_page_testzero(page))
- __put_single_page(page);
- return;
- }
- VM_BUG_ON_PAGE(page_head != compound_head(page), page);
- /*
- * We can release the refcount taken by
- * get_page_unless_zero() now that
- * __split_huge_page_refcount() is blocked on the
- * compound_lock.
- */
- if (put_page_testzero(page_head))
- VM_BUG_ON_PAGE(1, page_head);
- /* __split_huge_page_refcount will wait now */
- VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page);
- atomic_dec(&page->_mapcount);
- VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head);
- VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
- compound_unlock_irqrestore(page_head, flags);
-
- if (put_page_testzero(page_head)) {
- if (PageHead(page_head))
- __put_compound_page(page_head);
- else
- __put_single_page(page_head);
- }
- } else {
- /* @page_head is a dangling pointer */
- VM_BUG_ON_PAGE(PageTail(page), page);
- goto out_put_single;
- }
-}
-
-static void put_compound_page(struct page *page)
-{
- struct page *page_head;
-
- /*
- * We see the PageCompound set and PageTail not set, so @page maybe:
- * 1. hugetlbfs head page, or
- * 2. THP head page.
- */
- if (likely(!PageTail(page))) {
- if (put_page_testzero(page)) {
- /*
- * By the time all refcounts have been released
- * split_huge_page cannot run anymore from under us.
- */
- if (PageHead(page))
- __put_compound_page(page);
- else
- __put_single_page(page);
- }
- return;
- }
-
- /*
- * We see the PageCompound set and PageTail set, so @page maybe:
- * 1. a tail hugetlbfs page, or
- * 2. a tail THP page, or
- * 3. a split THP page.
- *
- * Case 3 is possible, as we may race with
- * __split_huge_page_refcount tearing down a THP page.
- */
- page_head = compound_head(page);
- if (!__compound_tail_refcounted(page_head))
- put_unrefcounted_compound_page(page_head, page);
- else
- put_refcounted_compound_page(page_head, page);
-}
-
-void put_page(struct page *page)
+void __put_page(struct page *page)
{
if (unlikely(PageCompound(page)))
- put_compound_page(page);
- else if (put_page_testzero(page))
+ __put_compound_page(page);
+ else
__put_single_page(page);
}
-EXPORT_SYMBOL(put_page);
-
-/*
- * This function is exported but must not be called by anything other
- * than get_page(). It implements the slow path of get_page().
- */
-bool __get_page_tail(struct page *page)
-{
- /*
- * This takes care of get_page() if run on a tail page
- * returned by one of the get_user_pages/follow_page variants.
- * get_user_pages/follow_page itself doesn't need the compound
- * lock because it runs __get_page_tail_foll() under the
- * proper PT lock that already serializes against
- * split_huge_page().
- */
- unsigned long flags;
- bool got;
- struct page *page_head = compound_head(page);
-
- /* Ref to put_compound_page() comment. */
- if (!__compound_tail_refcounted(page_head)) {
- smp_rmb();
- if (likely(PageTail(page))) {
- /*
- * This is a hugetlbfs page or a slab
- * page. __split_huge_page_refcount
- * cannot race here.
- */
- VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
- __get_page_tail_foll(page, true);
- return true;
- } else {
- /*
- * __split_huge_page_refcount run
- * before us, "page" was a THP
- * tail. The split page_head has been
- * freed and reallocated as slab or
- * hugetlbfs page of smaller order
- * (only possible if reallocated as
- * slab on x86).
- */
- return false;
- }
- }
-
- got = false;
- if (likely(page != page_head && get_page_unless_zero(page_head))) {
- /*
- * page_head wasn't a dangling pointer but it
- * may not be a head page anymore by the time
- * we obtain the lock. That is ok as long as it
- * can't be freed from under us.
- */
- flags = compound_lock_irqsave(page_head);
- /* here __split_huge_page_refcount won't run anymore */
- if (likely(PageTail(page))) {
- __get_page_tail_foll(page, false);
- got = true;
- }
- compound_unlock_irqrestore(page_head, flags);
- if (unlikely(!got))
- put_page(page_head);
- }
- return got;
-}
-EXPORT_SYMBOL(__get_page_tail);
+EXPORT_SYMBOL(__put_page);
/**
* put_pages_list() - release a list of pages
@@ -604,6 +358,7 @@ static void __lru_cache_activate_page(struct page *page)
*/
void mark_page_accessed(struct page *page)
{
+ page = compound_head(page);
if (!PageActive(page) && !PageUnevictable(page) &&
PageReferenced(page)) {
@@ -918,15 +673,6 @@ void release_pages(struct page **pages, int nr, bool cold)
for (i = 0; i < nr; i++) {
struct page *page = pages[i];
- if (unlikely(PageCompound(page))) {
- if (zone) {
- spin_unlock_irqrestore(&zone->lru_lock, flags);
- zone = NULL;
- }
- put_compound_page(page);
- continue;
- }
-
/*
* Make sure the IRQ-safe lock-holding time does not get
* excessive with a continuous string of pages from the
@@ -937,9 +683,19 @@ void release_pages(struct page **pages, int nr, bool cold)
zone = NULL;
}
+ page = compound_head(page);
if (!put_page_testzero(page))
continue;
+ if (PageCompound(page)) {
+ if (zone) {
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+ zone = NULL;
+ }
+ __put_compound_page(page);
+ continue;
+ }
+
if (PageLRU(page)) {
struct zone *pagezone = page_zone(page);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index d504adb7fa5f..d783872d746c 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -353,7 +353,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
}
/* May fail (-ENOMEM) if radix-tree node allocation failed. */
- __set_page_locked(new_page);
+ __SetPageLocked(new_page);
SetPageSwapBacked(new_page);
err = __add_to_swap_cache(new_page, entry);
if (likely(!err)) {
@@ -367,7 +367,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
}
radix_tree_preload_end();
ClearPageSwapBacked(new_page);
- __clear_page_locked(new_page);
+ __ClearPageLocked(new_page);
/*
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
* clear SWAP_HAS_CACHE flag.
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 58877312cf6b..7073faecb38f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -929,6 +929,9 @@ int reuse_swap_page(struct page *page)
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (unlikely(PageKsm(page)))
return 0;
+ /* The page is part of THP and cannot be reused */
+ if (PageTransCompound(page))
+ return 0;
count = page_mapcount(page);
if (count <= 1 && PageSwapCache(page)) {
count += page_swapcount(page);
@@ -1145,14 +1148,15 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
if (unlikely(!page))
return -ENOMEM;
- if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg)) {
+ if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
+ &memcg, false)) {
ret = -ENOMEM;
goto out_nolock;
}
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) {
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
ret = 0;
goto out;
}
@@ -1163,11 +1167,11 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
set_pte_at(vma->vm_mm, addr, pte,
pte_mkold(mk_pte(page, vma->vm_page_prot)));
if (page == swapcache) {
- page_add_anon_rmap(page, vma, addr);
- mem_cgroup_commit_charge(page, memcg, true);
+ page_add_anon_rmap(page, vma, addr, false);
+ mem_cgroup_commit_charge(page, memcg, true, false);
} else { /* ksm created a completely new copy */
- page_add_new_anon_rmap(page, vma, addr);
- mem_cgroup_commit_charge(page, memcg, false);
+ page_add_new_anon_rmap(page, vma, addr, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
}
swap_free(entry);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 77fee9325a57..806b0c758c5b 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -63,7 +63,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
__SetPageUptodate(page);
ret = -ENOMEM;
- if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg))
+ if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false))
goto out_release;
_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
@@ -76,8 +76,8 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
goto out_release_uncharge_unlock;
inc_mm_counter(dst_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, dst_vma, dst_addr);
- mem_cgroup_commit_charge(page, memcg, false);
+ page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, dst_vma);
set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
@@ -91,7 +91,7 @@ out:
return ret;
out_release_uncharge_unlock:
pte_unmap_unlock(dst_pte, ptl);
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
out_release:
page_cache_release(page);
goto out;
diff --git a/mm/util.c b/mm/util.c
index 9af1c12b310c..5be2a4bdf76b 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -3,6 +3,7 @@
#include <linux/string.h>
#include <linux/compiler.h>
#include <linux/export.h>
+#include <linux/ctype.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/security.h>
@@ -100,6 +101,35 @@ char *kstrndup(const char *s, size_t max, gfp_t gfp)
EXPORT_SYMBOL(kstrndup);
/**
+ * kstrimdup - Trim and copy a %NUL terminated string.
+ * @s: the string to trim and duplicate
+ * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Returns an address, which the caller must kfree, containing
+ * a duplicate of the passed string with leading and/or trailing
+ * whitespace (as defined by isspace) removed.
+ */
+char *kstrimdup(const char *s, gfp_t gfp)
+{
+ char *buf;
+ char *begin = skip_spaces(s);
+ size_t len = strlen(begin);
+
+ while (len && isspace(begin[len - 1]))
+ len--;
+
+ buf = kmalloc_track_caller(len + 1, gfp);
+ if (!buf)
+ return NULL;
+
+ memcpy(buf, begin, len);
+ buf[len] = '\0';
+
+ return buf;
+}
+EXPORT_SYMBOL(kstrimdup);
+
+/**
* kmemdup - duplicate region of memory
*
* @src: memory region to duplicate
@@ -355,7 +385,9 @@ struct anon_vma *page_anon_vma(struct page *page)
struct address_space *page_mapping(struct page *page)
{
- unsigned long mapping;
+ struct address_space *mapping;
+
+ page = compound_head(page);
/* This happens if someone calls flush_dcache_page on slab page */
if (unlikely(PageSlab(page)))
@@ -368,11 +400,25 @@ struct address_space *page_mapping(struct page *page)
return swap_address_space(entry);
}
- mapping = (unsigned long)page->mapping;
- if (mapping & PAGE_MAPPING_FLAGS)
+ mapping = page->mapping;
+ if ((unsigned long)mapping & PAGE_MAPPING_FLAGS)
return NULL;
- return page->mapping;
+ return mapping;
+}
+
+/* Slow path of page_mapcount() for compound pages */
+int __page_mapcount(struct page *page)
+{
+ int ret;
+
+ ret = atomic_read(&page->_mapcount) + 1;
+ page = compound_head(page);
+ ret += atomic_read(compound_mapcount_ptr(page)) + 1;
+ if (PageDoubleMap(page))
+ ret--;
+ return ret;
}
+EXPORT_SYMBOL_GPL(__page_mapcount);
int overcommit_ratio_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 8e3c9c5a3042..58ceeb107960 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -441,8 +441,7 @@ nocache:
if (list_is_last(&first->list, &vmap_area_list))
goto found;
- first = list_entry(first->list.next,
- struct vmap_area, list);
+ first = list_next_entry(first, list);
}
found:
@@ -1477,13 +1476,10 @@ static void __vunmap(const void *addr, int deallocate_pages)
struct page *page = area->pages[i];
BUG_ON(!page);
- __free_page(page);
+ __free_kmem_pages(page, 0);
}
- if (area->flags & VM_VPAGES)
- vfree(area->pages);
- else
- kfree(area->pages);
+ kvfree(area->pages);
}
kfree(area);
@@ -1593,7 +1589,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
if (array_size > PAGE_SIZE) {
pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
PAGE_KERNEL, node, area->caller);
- area->flags |= VM_VPAGES;
} else {
pages = kmalloc_node(array_size, nested_gfp, node);
}
@@ -1608,9 +1603,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
struct page *page;
if (node == NUMA_NO_NODE)
- page = alloc_page(alloc_mask);
+ page = alloc_kmem_pages(alloc_mask, order);
else
- page = alloc_pages_node(node, alloc_mask, order);
+ page = alloc_kmem_pages_node(node, alloc_mask, order);
if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vunmap() */
@@ -2559,10 +2554,10 @@ static void *s_start(struct seq_file *m, loff_t *pos)
struct vmap_area *va;
spin_lock(&vmap_area_lock);
- va = list_entry((&vmap_area_list)->next, typeof(*va), list);
+ va = list_first_entry(&vmap_area_list, typeof(*va), list);
while (n > 0 && &va->list != &vmap_area_list) {
n--;
- va = list_entry(va->list.next, typeof(*va), list);
+ va = list_next_entry(va, list);
}
if (!n && &va->list != &vmap_area_list)
return va;
@@ -2576,7 +2571,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
struct vmap_area *va = p, *next;
++*pos;
- next = list_entry(va->list.next, typeof(*va), list);
+ next = list_next_entry(va, list);
if (&next->list != &vmap_area_list)
return next;
@@ -2651,7 +2646,7 @@ static int s_show(struct seq_file *m, void *p)
if (v->flags & VM_USERMAP)
seq_puts(m, " user");
- if (v->flags & VM_VPAGES)
+ if (is_vmalloc_addr(v->pages))
seq_puts(m, " vpages");
show_numa_info(m, v);
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index c5afd573d7da..4c25e621a40b 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -38,7 +38,7 @@
* TODO: Make the window size depend on machine size, as we do for vmstat
* thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
*/
-static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
+static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 2;
/*
* These thresholds are used when we account memory pressure through
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2aec4241b42a..4589cfdbe405 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -197,11 +197,13 @@ static unsigned long zone_reclaimable_pages(struct zone *zone)
unsigned long nr;
nr = zone_page_state(zone, NR_ACTIVE_FILE) +
- zone_page_state(zone, NR_INACTIVE_FILE);
+ zone_page_state(zone, NR_INACTIVE_FILE) +
+ zone_page_state(zone, NR_ISOLATED_FILE);
if (get_nr_swap_pages() > 0)
nr += zone_page_state(zone, NR_ACTIVE_ANON) +
- zone_page_state(zone, NR_INACTIVE_ANON);
+ zone_page_state(zone, NR_INACTIVE_ANON) +
+ zone_page_state(zone, NR_ISOLATED_ANON);
return nr;
}
@@ -594,7 +596,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
/* synchronous write or broken a_ops? */
ClearPageReclaim(page);
}
- trace_mm_vmscan_writepage(page, trace_reclaim_flags(page));
+ trace_mm_vmscan_writepage(page);
inc_zone_page_state(page, NR_VMSCAN_WRITE);
return PAGE_SUCCESS;
}
@@ -1184,7 +1186,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* we obviously don't have to worry about waking up a process
* waiting on the page lock, because there are no references.
*/
- __clear_page_locked(page);
+ __ClearPageLocked(page);
free_it:
nr_reclaimed++;
@@ -1691,11 +1693,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
current_may_throttle())
wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
- trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
- zone_idx(zone),
- nr_scanned, nr_reclaimed,
- sc->priority,
- trace_shrink_flags(file));
+ trace_mm_vmscan_lru_shrink_inactive(zone, nr_scanned, nr_reclaimed,
+ sc->priority, file);
return nr_reclaimed;
}
@@ -2046,10 +2045,16 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness,
}
/*
- * There is enough inactive page cache, do not reclaim
- * anything from the anonymous working set right now.
+ * If there is enough inactive page cache, i.e. if the size of the
+ * inactive list is greater than that of the active list *and* the
+ * inactive list actually has some pages to scan on this priority, we
+ * do not reclaim anything from the anonymous working set right now.
+ * Without the second condition we could end up never scanning an
+ * lruvec even if it has plenty of old anonymous pages unless the
+ * system is under heavy pressure.
*/
- if (!inactive_file_is_low(lruvec)) {
+ if (!inactive_file_is_low(lruvec) &&
+ get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
scan_balance = SCAN_FILE;
goto out;
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 879a2be23325..d13cd8eebf70 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -842,7 +842,9 @@ const char * const vmstat_text[] = {
"thp_fault_fallback",
"thp_collapse_alloc",
"thp_collapse_alloc_failed",
- "thp_split",
+ "thp_split_page",
+ "thp_split_page_failed",
+ "thp_split_pmd",
"thp_zero_page_alloc",
"thp_zero_page_alloc_failed",
#endif
@@ -919,19 +921,6 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
#endif
#ifdef CONFIG_PROC_FS
-static char * const migratetype_names[MIGRATE_TYPES] = {
- "Unmovable",
- "Reclaimable",
- "Movable",
- "HighAtomic",
-#ifdef CONFIG_CMA
- "CMA",
-#endif
-#ifdef CONFIG_MEMORY_ISOLATION
- "Isolate",
-#endif
-};
-
static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
struct zone *zone)
{
@@ -1128,7 +1117,7 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
#ifdef CONFIG_PAGE_OWNER
int mtype;
- if (!page_owner_inited)
+ if (!static_branch_unlikely(&page_owner_inited))
return;
drain_all_pages(NULL);
@@ -1379,6 +1368,7 @@ static const struct file_operations proc_vmstat_file_operations = {
#endif /* CONFIG_PROC_FS */
#ifdef CONFIG_SMP
+static struct workqueue_struct *vmstat_wq;
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
int sysctl_stat_interval __read_mostly = HZ;
static cpumask_var_t cpu_stat_off;
@@ -1391,7 +1381,7 @@ static void vmstat_update(struct work_struct *w)
* to occur in the future. Keep on running the
* update worker thread.
*/
- schedule_delayed_work_on(smp_processor_id(),
+ queue_delayed_work_on(smp_processor_id(), vmstat_wq,
this_cpu_ptr(&vmstat_work),
round_jiffies_relative(sysctl_stat_interval));
} else {
@@ -1460,7 +1450,7 @@ static void vmstat_shepherd(struct work_struct *w)
if (need_update(cpu) &&
cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
- schedule_delayed_work_on(cpu,
+ queue_delayed_work_on(cpu, vmstat_wq,
&per_cpu(vmstat_work, cpu), 0);
put_online_cpus();
@@ -1549,6 +1539,7 @@ static int __init setup_vmstat(void)
start_shepherd_timer();
cpu_notifier_register_done();
+ vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
#endif
#ifdef CONFIG_PROC_FS
proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 744e5936c10d..7aea0ccb6be6 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -289,10 +289,8 @@ static void __node_free_rcu(struct rcu_head *head)
if (!n->tn_bits)
kmem_cache_free(trie_leaf_kmem, n);
- else if (n->tn_bits <= TNODE_KMALLOC_MAX)
- kfree(n);
else
- vfree(n);
+ kvfree(n);
}
#define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu)
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index 2379c1b4efb2..d07579ada001 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -48,7 +48,7 @@ void tcp_destroy_cgroup(struct mem_cgroup *memcg)
percpu_counter_destroy(&cg_proto->sockets_allocated);
- if (test_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags))
+ if (cg_proto->active)
static_key_slow_dec(&memcg_socket_limit_enabled);
}
@@ -72,11 +72,9 @@ static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages)
cg_proto->sysctl_mem[i] = min_t(long, nr_pages,
sysctl_tcp_mem[i]);
- if (nr_pages == PAGE_COUNTER_MAX)
- clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
- else {
+ if (!cg_proto->active) {
/*
- * The active bit needs to be written after the static_key
+ * The active flag needs to be written after the static_key
* update. This is what guarantees that the socket activation
* function is the last one to run. See sock_update_memcg() for
* details, and note that we don't mark any socket as belonging
@@ -90,14 +88,9 @@ static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages)
* We never race with the readers in sock_update_memcg(),
* because when this value change, the code to process it is not
* patched in yet.
- *
- * The activated bit is used to guarantee that no two writers
- * will do the update in the same memcg. Without that, we can't
- * properly shutdown the static key.
*/
- if (!test_and_set_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags))
- static_key_slow_inc(&memcg_socket_limit_enabled);
- set_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
+ static_key_slow_inc(&memcg_socket_limit_enabled);
+ cg_proto->active = true;
}
return 0;
diff --git a/net/socket.c b/net/socket.c
index 456fadb3d819..abf24fb22c0c 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -293,7 +293,7 @@ static int init_inodecache(void)
0,
(SLAB_HWCACHE_ALIGN |
SLAB_RECLAIM_ACCOUNT |
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD | SLAB_ACCOUNT),
init_once);
if (sock_inode_cachep == NULL)
return -ENOMEM;
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index d81186d34558..14f45bf0410c 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -1500,7 +1500,7 @@ int register_rpc_pipefs(void)
rpc_inode_cachep = kmem_cache_create("rpc_inode_cache",
sizeof(struct rpc_inode),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (!rpc_inode_cachep)
return -ENOMEM;
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 2b3c22808c3b..d4960f7b353f 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -433,6 +433,28 @@ our @typeList = (
qr{${Ident}_handler_fn},
@typeListMisordered,
);
+
+our $C90_int_types = qr{(?x:
+ long\s+long\s+int\s+(?:un)?signed|
+ long\s+long\s+(?:un)?signed\s+int|
+ long\s+long\s+(?:un)?signed|
+ (?:(?:un)?signed\s+)?long\s+long\s+int|
+ (?:(?:un)?signed\s+)?long\s+long|
+ int\s+long\s+long\s+(?:un)?signed|
+ int\s+(?:(?:un)?signed\s+)?long\s+long|
+
+ long\s+int\s+(?:un)?signed|
+ long\s+(?:un)?signed\s+int|
+ long\s+(?:un)?signed|
+ (?:(?:un)?signed\s+)?long\s+int|
+ (?:(?:un)?signed\s+)?long|
+ int\s+long\s+(?:un)?signed|
+ int\s+(?:(?:un)?signed\s+)?long|
+
+ int\s+(?:un)?signed|
+ (?:(?:un)?signed\s+)?int
+)};
+
our @typeListFile = ();
our @typeListWithAttr = (
@typeList,
@@ -4517,7 +4539,7 @@ sub process {
#print "LINE<$lines[$ln-1]> len<" . length($lines[$ln-1]) . "\n";
$has_flow_statement = 1 if ($ctx =~ /\b(goto|return)\b/);
- $has_arg_concat = 1 if ($ctx =~ /\#\#/);
+ $has_arg_concat = 1 if ($ctx =~ /\#\#/ && $ctx !~ /\#\#\s*(?:__VA_ARGS__|args)\b/);
$dstat =~ s/^.\s*\#\s*define\s+$Ident(?:\([^\)]*\))?\s*//;
$dstat =~ s/$;//g;
@@ -4528,7 +4550,7 @@ sub process {
# Flatten any parentheses and braces
while ($dstat =~ s/\([^\(\)]*\)/1/ ||
$dstat =~ s/\{[^\{\}]*\}/1/ ||
- $dstat =~ s/\[[^\[\]]*\]/1/)
+ $dstat =~ s/.\[[^\[\]]*\]/1/)
{
}
@@ -4548,7 +4570,8 @@ sub process {
union|
struct|
\.$Ident\s*=\s*|
- ^\"|\"$
+ ^\"|\"$|
+ ^\[
}x;
#print "REST<$rest> dstat<$dstat> ctx<$ctx>\n";
if ($dstat ne '' &&
@@ -5241,6 +5264,26 @@ sub process {
}
}
+# check for cast of C90 native int or longer types constants
+ if ($line =~ /(\(\s*$C90_int_types\s*\)\s*)($Constant)\b/) {
+ my $cast = $1;
+ my $const = $2;
+ if (WARN("TYPECAST_INT_CONSTANT",
+ "Unnecessary typecast of c90 int constant\n" . $herecurr) &&
+ $fix) {
+ my $suffix = "";
+ my $newconst = $const;
+ $newconst =~ s/${Int_type}$//;
+ $suffix .= 'U' if ($cast =~ /\bunsigned\b/);
+ if ($cast =~ /\blong\s+long\b/) {
+ $suffix .= 'LL';
+ } elsif ($cast =~ /\blong\b/) {
+ $suffix .= 'L';
+ }
+ $fixed[$fixlinenr] =~ s/\Q$cast\E$const\b/$newconst$suffix/;
+ }
+ }
+
# check for sizeof(&)
if ($line =~ /\bsizeof\s*\(\s*\&/) {
WARN("SIZEOF_ADDRESS",
diff --git a/scripts/tags.sh b/scripts/tags.sh
index f69fa0c87312..3df83161da56 100755
--- a/scripts/tags.sh
+++ b/scripts/tags.sh
@@ -193,7 +193,6 @@ exuberant()
--regex-c++='/CLEARPAGEFLAG_NOOP\(([^,)]*).*/ClearPage\1/' \
--regex-c++='/__CLEARPAGEFLAG_NOOP\(([^,)]*).*/__ClearPage\1/' \
--regex-c++='/TESTCLEARFLAG_FALSE\(([^,)]*).*/TestClearPage\1/' \
- --regex-c++='/__TESTCLEARFLAG_FALSE\(([^,)]*).*/__TestClearPage\1/' \
--regex-c++='/_PE\(([^,)]*).*/PEVENT_ERRNO__\1/' \
--regex-c++='/TASK_PFA_TEST\([^,]*,\s*([^)]*)\)/task_\1/' \
--regex-c++='/TASK_PFA_SET\([^,]*,\s*([^)]*)\)/task_set_\1/' \
@@ -261,7 +260,6 @@ emacs()
--regex='/CLEARPAGEFLAG_NOOP(\([^,)]*\).*/ClearPage\1/' \
--regex='/__CLEARPAGEFLAG_NOOP(\([^,)]*\).*/__ClearPage\1/' \
--regex='/TESTCLEARFLAG_FALSE(\([^,)]*\).*/TestClearPage\1/' \
- --regex='/__TESTCLEARFLAG_FALSE(\([^,)]*\).*/__TestClearPage\1/' \
--regex='/TASK_PFA_TEST\([^,]*,\s*([^)]*)\)/task_\1/' \
--regex='/TASK_PFA_SET\([^,]*,\s*([^)]*)\)/task_set_\1/' \
--regex='/TASK_PFA_CLEAR\([^,]*,\s*([^)]*)\)/task_clear_\1/' \