aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2015-11-16 13:40:07 +1100
committerStephen Rothwell <sfr@canb.auug.org.au>2015-11-16 13:40:07 +1100
commit2e7f56375b6746cf11fe2e0057f6a0039dd7aa2a (patch)
tree1c7ab67fcb01ae06e699625965022e6bcef25a41
parentf3553a26bbf95e21f9153b10d412f4e00701edcc (diff)
parentdcbefa17259c026e13990b06c3e884202b4ffc75 (diff)
Merge branch 'akpm-current/current'
-rw-r--r--Documentation/ABI/testing/configfs-iio21
-rw-r--r--Documentation/features/vm/pmdp_splitting_flush/arch-support.txt40
-rw-r--r--Documentation/filesystems/vfat.txt10
-rw-r--r--Documentation/iio/iio_configfs.txt93
-rw-r--r--Documentation/vm/transhuge.txt151
-rw-r--r--arch/arc/mm/cache.c4
-rw-r--r--arch/arm/include/asm/pgtable-3level.h9
-rw-r--r--arch/arm/lib/uaccess_with_memcpy.c5
-rw-r--r--arch/arm/mm/flush.c17
-rw-r--r--arch/arm64/include/asm/pgtable.h8
-rw-r--r--arch/arm64/mm/flush.c16
-rw-r--r--arch/mips/include/asm/pgtable-bits.h10
-rw-r--r--arch/mips/include/asm/pgtable.h18
-rw-r--r--arch/mips/mm/c-r4k.c3
-rw-r--r--arch/mips/mm/cache.c2
-rw-r--r--arch/mips/mm/gup.c17
-rw-r--r--arch/mips/mm/init.c6
-rw-r--r--arch/mips/mm/pgtable-64.c14
-rw-r--r--arch/mips/mm/tlbex.c1
-rw-r--r--arch/powerpc/include/asm/pgtable-ppc64.h27
-rw-r--r--arch/powerpc/mm/hugepage-hash64.c3
-rw-r--r--arch/powerpc/mm/hugetlbpage.c17
-rw-r--r--arch/powerpc/mm/pgtable_64.c49
-rw-r--r--arch/powerpc/mm/subpage-prot.c2
-rw-r--r--arch/s390/include/asm/pgtable.h16
-rw-r--r--arch/s390/mm/gup.c24
-rw-r--r--arch/s390/mm/pgtable.c16
-rw-r--r--arch/sh/mm/cache-sh4.c2
-rw-r--r--arch/sh/mm/cache.c8
-rw-r--r--arch/sparc/include/asm/pgtable_64.h16
-rw-r--r--arch/sparc/mm/fault_64.c3
-rw-r--r--arch/sparc/mm/gup.c16
-rw-r--r--arch/tile/include/asm/pgtable.h10
-rw-r--r--arch/x86/include/asm/pgtable.h9
-rw-r--r--arch/x86/include/asm/pgtable_types.h2
-rw-r--r--arch/x86/kernel/machine_kexec_64.c1
-rw-r--r--arch/x86/kernel/vm86_32.c6
-rw-r--r--arch/x86/mm/gup.c17
-rw-r--r--arch/x86/mm/pgtable.c14
-rw-r--r--arch/xtensa/mm/tlb.c2
-rw-r--r--block/genhd.c2
-rw-r--r--drivers/iio/Kconfig16
-rw-r--r--drivers/iio/Makefile2
-rw-r--r--drivers/iio/industrialio-configfs.c50
-rw-r--r--drivers/iio/industrialio-sw-trigger.c181
-rw-r--r--drivers/iio/trigger/Kconfig10
-rw-r--r--drivers/iio/trigger/Makefile2
-rw-r--r--drivers/iio/trigger/iio-trig-hrtimer.c193
-rw-r--r--fs/cifs/file.c8
-rw-r--r--fs/configfs/dir.c110
-rw-r--r--fs/exofs/inode.c5
-rw-r--r--fs/ext4/fsync.c5
-rw-r--r--fs/fat/cache.c79
-rw-r--r--fs/fat/dir.c2
-rw-r--r--fs/fat/fat.h6
-rw-r--r--fs/fat/file.c61
-rw-r--r--fs/fat/inode.c75
-rw-r--r--fs/hugetlbfs/inode.c44
-rw-r--r--fs/nfs/objlayout/objio_osd.c5
-rw-r--r--fs/ocfs2/alloc.c105
-rw-r--r--fs/ocfs2/aops.c1120
-rw-r--r--fs/ocfs2/aops.h11
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c24
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c1
-rw-r--r--fs/ocfs2/file.c138
-rw-r--r--fs/ocfs2/inode.c3
-rw-r--r--fs/ocfs2/inode.h3
-rw-r--r--fs/ocfs2/journal.c8
-rw-r--r--fs/ocfs2/localalloc.c4
-rw-r--r--fs/ocfs2/mmap.c4
-rw-r--r--fs/ocfs2/ocfs2.h8
-rw-r--r--fs/ocfs2/ocfs2_trace.h16
-rw-r--r--fs/ocfs2/quota_global.c2
-rw-r--r--fs/ocfs2/resize.c2
-rw-r--r--fs/ocfs2/super.c38
-rw-r--r--fs/ocfs2/super.h2
-rw-r--r--fs/proc/page.c4
-rw-r--r--fs/proc/task_mmu.c55
-rw-r--r--include/asm-generic/dma-mapping-common.h2
-rw-r--r--include/asm-generic/pgtable.h9
-rw-r--r--include/linux/configfs.h10
-rw-r--r--include/linux/crc64_ecma.h56
-rw-r--r--include/linux/dma-debug.h6
-rw-r--r--include/linux/huge_mm.h61
-rw-r--r--include/linux/iio/sw_trigger.h71
-rw-r--r--include/linux/kexec.h2
-rw-r--r--include/linux/memcontrol.h16
-rw-r--r--include/linux/mm.h119
-rw-r--r--include/linux/mm_types.h20
-rw-r--r--include/linux/mmdebug.h6
-rw-r--r--include/linux/page-flags.h282
-rw-r--r--include/linux/pagemap.h38
-rw-r--r--include/linux/poison.h9
-rw-r--r--include/linux/rmap.h16
-rw-r--r--include/linux/slab.h43
-rw-r--r--include/linux/string.h1
-rw-r--r--include/linux/swap.h5
-rw-r--r--include/linux/vm_event_item.h4
-rw-r--r--include/trace/events/huge_memory.h166
-rw-r--r--ipc/msg.c5
-rw-r--r--kernel/events/uprobes.c11
-rw-r--r--kernel/futex.c63
-rw-r--r--lib/Kconfig7
-rw-r--r--lib/Kconfig.debug18
-rw-r--r--lib/Makefile1
-rw-r--r--lib/crc64_ecma.c341
-rw-r--r--lib/dma-debug.c6
-rw-r--r--lib/string_helpers.c63
-rw-r--r--mm/debug.c8
-rw-r--r--mm/filemap.c25
-rw-r--r--mm/gup.c114
-rw-r--r--mm/huge_memory.c1428
-rw-r--r--mm/hugetlb.c12
-rw-r--r--mm/internal.h74
-rw-r--r--mm/ksm.c63
-rw-r--r--mm/memcontrol.c84
-rw-r--r--mm/memory-failure.c77
-rw-r--r--mm/memory.c77
-rw-r--r--mm/mempolicy.c40
-rw-r--r--mm/migrate.c21
-rw-r--r--mm/mincore.c2
-rw-r--r--mm/mlock.c12
-rw-r--r--mm/mprotect.c2
-rw-r--r--mm/mremap.c15
-rw-r--r--mm/page_alloc.c34
-rw-r--r--mm/page_idle.c66
-rw-r--r--mm/pagewalk.c2
-rw-r--r--mm/pgtable-generic.c12
-rw-r--r--mm/rmap.c268
-rw-r--r--mm/shmem.c25
-rw-r--r--mm/slab.c87
-rw-r--r--mm/slub.c264
-rw-r--r--mm/swap.c274
-rw-r--r--mm/swap_state.c4
-rw-r--r--mm/swapfile.c16
-rw-r--r--mm/userfaultfd.c8
-rw-r--r--mm/util.c54
-rw-r--r--mm/vmpressure.c2
-rw-r--r--mm/vmscan.c2
-rw-r--r--mm/vmstat.c4
-rwxr-xr-xscripts/checkpatch.pl44
141 files changed, 4662 insertions, 3124 deletions
diff --git a/Documentation/ABI/testing/configfs-iio b/Documentation/ABI/testing/configfs-iio
new file mode 100644
index 000000000000..2483756fccf5
--- /dev/null
+++ b/Documentation/ABI/testing/configfs-iio
@@ -0,0 +1,21 @@
+What: /config/iio
+Date: October 2015
+KernelVersion: 4.4
+Contact: linux-iio@vger.kernel.org
+Description:
+ This represents Industrial IO configuration entry point
+ directory. It contains sub-groups corresponding to IIO
+ objects.
+
+What: /config/iio/triggers
+Date: October 2015
+KernelVersion: 4.4
+Description:
+ Industrial IO software triggers directory.
+
+What: /config/iio/triggers/hrtimers
+Date: October 2015
+KernelVersion: 4.4
+Description:
+ High resolution timers directory. Creating a directory here
+ will result in creating a hrtimer trigger in the IIO subsystem.
diff --git a/Documentation/features/vm/pmdp_splitting_flush/arch-support.txt b/Documentation/features/vm/pmdp_splitting_flush/arch-support.txt
deleted file mode 100644
index 26f74b457e0b..000000000000
--- a/Documentation/features/vm/pmdp_splitting_flush/arch-support.txt
+++ /dev/null
@@ -1,40 +0,0 @@
-#
-# Feature name: pmdp_splitting_flush
-# Kconfig: __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-# description: arch supports the pmdp_splitting_flush() VM API
-#
- -----------------------
- | arch |status|
- -----------------------
- | alpha: | TODO |
- | arc: | TODO |
- | arm: | ok |
- | arm64: | ok |
- | avr32: | TODO |
- | blackfin: | TODO |
- | c6x: | TODO |
- | cris: | TODO |
- | frv: | TODO |
- | h8300: | TODO |
- | hexagon: | TODO |
- | ia64: | TODO |
- | m32r: | TODO |
- | m68k: | TODO |
- | metag: | TODO |
- | microblaze: | TODO |
- | mips: | ok |
- | mn10300: | TODO |
- | nios2: | TODO |
- | openrisc: | TODO |
- | parisc: | TODO |
- | powerpc: | ok |
- | s390: | ok |
- | score: | TODO |
- | sh: | TODO |
- | sparc: | TODO |
- | tile: | TODO |
- | um: | TODO |
- | unicore32: | TODO |
- | x86: | ok |
- | xtensa: | TODO |
- -----------------------
diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt
index ce1126aceed8..223c32171dcc 100644
--- a/Documentation/filesystems/vfat.txt
+++ b/Documentation/filesystems/vfat.txt
@@ -180,6 +180,16 @@ dos1xfloppy -- If set, use a fallback default BIOS Parameter Block
<bool>: 0,1,yes,no,true,false
+LIMITATION
+---------------------------------------------------------------------
+* The fallocated region of file is discarded at umount/evict time
+ when using fallocate with FALLOC_FL_KEEP_SIZE.
+ So, User should assume that fallocated region can be discarded at
+ last close if there is memory pressure resulting in eviction of
+ the inode from the memory. As a result, for any dependency on
+ the fallocated region, user should make sure to recheck fallocate
+ after reopening the file.
+
TODO
----------------------------------------------------------------------
* Need to get rid of the raw scanning stuff. Instead, always use
diff --git a/Documentation/iio/iio_configfs.txt b/Documentation/iio/iio_configfs.txt
new file mode 100644
index 000000000000..f0add35cd52e
--- /dev/null
+++ b/Documentation/iio/iio_configfs.txt
@@ -0,0 +1,93 @@
+Industrial IIO configfs support
+
+1. Overview
+
+Configfs is a filesystem-based manager of kernel objects. IIO uses some
+objects that could be easily configured using configfs (e.g.: devices,
+triggers).
+
+See Documentation/filesystems/configfs/configfs.txt for more information
+about how configfs works.
+
+2. Usage
+
+In order to use configfs support in IIO we need to select it at compile
+time via CONFIG_IIO_CONFIGFS config option.
+
+Then, mount the configfs filesystem (usually under /config directory):
+
+$ mkdir /config
+$ mount -t configfs none /config
+
+At this point, all default IIO groups will be created and can be accessed
+under /config/iio. Next chapters will describe available IIO configuration
+objects.
+
+3. Software triggers
+
+One of the IIO default configfs groups is the "triggers" group. It is
+automagically accessible when the configfs is mounted and can be found
+under /config/iio/triggers.
+
+IIO software triggers implementation offers support for creating multiple
+trigger types. A new trigger type is usually implemented as a separate
+kernel module following the interface in include/linux/iio/sw_trigger.h:
+
+/*
+ * drivers/iio/trigger/iio-trig-sample.c
+ * sample kernel module implementing a new trigger type
+ */
+#include <linux/iio/sw_trigger.h>
+
+
+static struct iio_sw_trigger *iio_trig_sample_probe(const char *name)
+{
+ /*
+ * This allocates and registers an IIO trigger plus other
+ * trigger type specific initialization.
+ */
+}
+
+static int iio_trig_hrtimer_remove(struct iio_sw_trigger *swt)
+{
+ /*
+ * This undoes the actions in iio_trig_sample_probe
+ */
+}
+
+static const struct iio_sw_trigger_ops iio_trig_sample_ops = {
+ .probe = iio_trig_sample_probe,
+ .remove = iio_trig_sample_remove,
+};
+
+static struct iio_sw_trigger_type iio_trig_sample = {
+ .name = "trig-sample",
+ .owner = THIS_MODULE,
+ .ops = &iio_trig_sample_ops,
+};
+
+module_iio_sw_trigger_driver(iio_trig_sample);
+
+Each trigger type has its own directory under /config/iio/triggers. Loading
+iio-trig-sample module will create 'trig-sample' trigger type directory
+/config/iio/triggers/trig-sample.
+
+We support the following interrupt sources (trigger types):
+ * hrtimer, uses high resolution timers as interrupt source
+
+3.1 Hrtimer triggers creation and destruction
+
+Loading iio-trig-hrtimer module will register hrtimer trigger types allowing
+users to create hrtimer triggers under /config/iio/triggers/hrtimer.
+
+e.g:
+
+$ mkdir /config/triggers/hrtimer/instance1
+$ rmdir /config/triggers/hrtimer/instance1
+
+Each trigger can have one or more attributes specific to the trigger type.
+
+3.2 "hrtimer" trigger types attributes
+
+"hrtimer" trigger type doesn't have any configurable attribute from /config dir.
+It does introduce the sampling_frequency attribute to trigger directory.
diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
index 8a282687ee06..21cf34f3ddb2 100644
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -35,10 +35,10 @@ miss is going to run faster.
== Design ==
-- "graceful fallback": mm components which don't have transparent
- hugepage knowledge fall back to breaking a transparent hugepage and
- working on the regular pages and their respective regular pmd/pte
- mappings
+- "graceful fallback": mm components which don't have transparent hugepage
+ knowledge fall back to breaking huge pmd mapping into table of ptes and,
+ if necessary, split a transparent hugepage. Therefore these components
+ can continue working on the regular pages or regular pte mappings.
- if a hugepage allocation fails because of memory fragmentation,
regular pages should be gracefully allocated instead and mixed in
@@ -221,9 +221,18 @@ thp_collapse_alloc_failed is incremented if khugepaged found a range
of pages that should be collapsed into one huge page but failed
the allocation.
-thp_split is incremented every time a huge page is split into base
+thp_split_page is incremented every time a huge page is split into base
pages. This can happen for a variety of reasons but a common
reason is that a huge page is old and is being reclaimed.
+ This action implies splitting all PMD the page mapped with.
+
+thp_split_page_failed is is incremented if kernel fails to split huge
+ page. This can happen if the page was pinned by somebody.
+
+thp_split_pmd is incremented every time a PMD split into table of PTEs.
+ This can happen, for instance, when application calls mprotect() or
+ munmap() on part of huge page. It doesn't split huge page, only
+ page table entry.
thp_zero_page_alloc is incremented every time a huge zero page is
successfully allocated. It includes allocations which where
@@ -274,10 +283,8 @@ is complete, so they won't ever notice the fact the page is huge. But
if any driver is going to mangle over the page structure of the tail
page (like for checking page->mapping or other bits that are relevant
for the head page and not the tail page), it should be updated to jump
-to check head page instead (while serializing properly against
-split_huge_page() to avoid the head and tail pages to disappear from
-under it, see the futex code to see an example of that, hugetlbfs also
-needed special handling in futex code for similar reasons).
+to check head page instead. Taking reference on any head/tail page would
+prevent page from being split by anyone.
NOTE: these aren't new constraints to the GUP API, and they match the
same constrains that applies to hugetlbfs too, so any driver capable
@@ -312,9 +319,9 @@ unaffected. libhugetlbfs will also work fine as usual.
== Graceful fallback ==
Code walking pagetables but unware about huge pmds can simply call
-split_huge_page_pmd(vma, addr, pmd) where the pmd is the one returned by
+split_huge_pmd(vma, pmd, addr) where the pmd is the one returned by
pmd_offset. It's trivial to make the code transparent hugepage aware
-by just grepping for "pmd_offset" and adding split_huge_page_pmd where
+by just grepping for "pmd_offset" and adding split_huge_pmd where
missing after pmd_offset returns the pmd. Thanks to the graceful
fallback design, with a one liner change, you can avoid to write
hundred if not thousand of lines of complex code to make your code
@@ -323,7 +330,8 @@ hugepage aware.
If you're not walking pagetables but you run into a physical hugepage
but you can't handle it natively in your code, you can split it by
calling split_huge_page(page). This is what the Linux VM does before
-it tries to swapout the hugepage for example.
+it tries to swapout the hugepage for example. split_huge_page() can fail
+if the page is pinned and you must handle this correctly.
Example to make mremap.c transparent hugepage aware with a one liner
change:
@@ -335,14 +343,14 @@ diff --git a/mm/mremap.c b/mm/mremap.c
return NULL;
pmd = pmd_offset(pud, addr);
-+ split_huge_page_pmd(vma, addr, pmd);
++ split_huge_pmd(vma, pmd, addr);
if (pmd_none_or_clear_bad(pmd))
return NULL;
== Locking in hugepage aware code ==
We want as much code as possible hugepage aware, as calling
-split_huge_page() or split_huge_page_pmd() has a cost.
+split_huge_page() or split_huge_pmd() has a cost.
To make pagetable walks huge pmd aware, all you need to do is to call
pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the
@@ -351,47 +359,80 @@ created from under you by khugepaged (khugepaged collapse_huge_page
takes the mmap_sem in write mode in addition to the anon_vma lock). If
pmd_trans_huge returns false, you just fallback in the old code
paths. If instead pmd_trans_huge returns true, you have to take the
-mm->page_table_lock and re-run pmd_trans_huge. Taking the
-page_table_lock will prevent the huge pmd to be converted into a
-regular pmd from under you (split_huge_page can run in parallel to the
+page table lock (pmd_lock()) and re-run pmd_trans_huge. Taking the
+page table lock will prevent the huge pmd to be converted into a
+regular pmd from under you (split_huge_pmd can run in parallel to the
pagetable walk). If the second pmd_trans_huge returns false, you
-should just drop the page_table_lock and fallback to the old code as
-before. Otherwise you should run pmd_trans_splitting on the pmd. In
-case pmd_trans_splitting returns true, it means split_huge_page is
-already in the middle of splitting the page. So if pmd_trans_splitting
-returns true it's enough to drop the page_table_lock and call
-wait_split_huge_page and then fallback the old code paths. You are
-guaranteed by the time wait_split_huge_page returns, the pmd isn't
-huge anymore. If pmd_trans_splitting returns false, you can proceed to
-process the huge pmd and the hugepage natively. Once finished you can
-drop the page_table_lock.
-
-== compound_lock, get_user_pages and put_page ==
+should just drop the page table lock and fallback to the old code as
+before. Otherwise you can proceed to process the huge pmd and the
+hugepage natively. Once finished you can drop the page table lock.
+
+== Refcounts and transparent huge pages ==
+
+Refcounting on THP is mostly consistent with refcounting on other compound
+pages:
+
+ - get_page()/put_page() and GUP operate in head page's ->_count.
+
+ - ->_count in tail pages is always zero: get_page_unless_zero() never
+ succeed on tail pages.
+
+ - map/unmap of the pages with PTE entry increment/decrement ->_mapcount
+ on relevant sub-page of the compound page.
+
+ - map/unmap of the whole compound page accounted in compound_mapcount
+ (stored in first tail page).
+
+PageDoubleMap() indicates that ->_mapcount in all subpages is offset up by one.
+This additional reference is required to get race-free detection of unmap of
+subpages when we have them mapped with both PMDs and PTEs.
+
+This is optimization required to lower overhead of per-subpage mapcount
+tracking. The alternative is alter ->_mapcount in all subpages on each
+map/unmap of the whole compound page.
+
+We set PG_double_map when a PMD of the page got split for the first time,
+but still have PMD mapping. The addtional references go away with last
+compound_mapcount.
split_huge_page internally has to distribute the refcounts in the head
-page to the tail pages before clearing all PG_head/tail bits from the
-page structures. It can do that easily for refcounts taken by huge pmd
-mappings. But the GUI API as created by hugetlbfs (that returns head
-and tail pages if running get_user_pages on an address backed by any
-hugepage), requires the refcount to be accounted on the tail pages and
-not only in the head pages, if we want to be able to run
-split_huge_page while there are gup pins established on any tail
-page. Failure to be able to run split_huge_page if there's any gup pin
-on any tail page, would mean having to split all hugepages upfront in
-get_user_pages which is unacceptable as too many gup users are
-performance critical and they must work natively on hugepages like
-they work natively on hugetlbfs already (hugetlbfs is simpler because
-hugetlbfs pages cannot be split so there wouldn't be requirement of
-accounting the pins on the tail pages for hugetlbfs). If we wouldn't
-account the gup refcounts on the tail pages during gup, we won't know
-anymore which tail page is pinned by gup and which is not while we run
-split_huge_page. But we still have to add the gup pin to the head page
-too, to know when we can free the compound page in case it's never
-split during its lifetime. That requires changing not just
-get_page, but put_page as well so that when put_page runs on a tail
-page (and only on a tail page) it will find its respective head page,
-and then it will decrease the head page refcount in addition to the
-tail page refcount. To obtain a head page reliably and to decrease its
-refcount without race conditions, put_page has to serialize against
-__split_huge_page_refcount using a special per-page lock called
-compound_lock.
+page to the tail pages before clearing all PG_head/tail bits from the page
+structures. It can be done easily for refcounts taken by page table
+entries. But we don't have enough information on how to distribute any
+additional pins (i.e. from get_user_pages). split_huge_page() fails any
+requests to split pinned huge page: it expects page count to be equal to
+sum of mapcount of all sub-pages plus one (split_huge_page caller must
+have reference for head page).
+
+split_huge_page uses migration entries to stabilize page->_count and
+page->_mapcount.
+
+We safe against physical memory scanners too: the only legitimate way
+scanner can get reference to a page is get_page_unless_zero().
+
+All tail pages has zero ->_count until atomic_add(). It prevent scanner
+from geting reference to tail page up to the point. After the atomic_add()
+we don't care about ->_count value. We already known how many references
+with should uncharge from head page.
+
+For head page get_page_unless_zero() will succeed and we don't mind. It's
+clear where reference should go after split: it will stay on head page.
+
+Note that split_huge_pmd() doesn't have any limitation on refcounting:
+pmd can be split at any point and never fails.
+
+== Partial unmap and deferred_split_huge_page() ==
+
+Unmapping part of THP (with munmap() or other way) is not going to free
+memory immediately. Instead, we detect that a subpage of THP is not in use
+in page_remove_rmap() and queue the THP for splitting if memory pressure
+comes. Splitting will free up unused subpages.
+
+Splitting the page right away is not an option due to locking context in
+the place where we can detect partial unmap. It's also might be
+counterproductive since in many cases partial unmap unmap happens during
+exit(2) if an THP crosses VMA boundary.
+
+Function deferred_split_huge_page() is used to queue page for splitting.
+The splitting itself will happen when we get memory pressure via shrinker
+interface.
diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c
index ff7ff6cbb811..b65f797e9ad6 100644
--- a/arch/arc/mm/cache.c
+++ b/arch/arc/mm/cache.c
@@ -617,7 +617,7 @@ void flush_dcache_page(struct page *page)
*/
if (!mapping_mapped(mapping)) {
clear_bit(PG_dc_clean, &page->flags);
- } else if (page_mapped(page)) {
+ } else if (page_mapcount(page)) {
/* kernel reading from page with U-mapping */
phys_addr_t paddr = (unsigned long)page_address(page);
@@ -857,7 +857,7 @@ void copy_user_highpage(struct page *to, struct page *from,
* For !VIPT cache, all of this gets compiled out as
* addr_not_cache_congruent() is 0
*/
- if (page_mapped(from) && addr_not_cache_congruent(kfrom, u_vaddr)) {
+ if (page_mapcount(from) && addr_not_cache_congruent(kfrom, u_vaddr)) {
__flush_dcache_page((unsigned long)kfrom, u_vaddr);
clean_src_k_mappings = 1;
}
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index a745a2a53853..59d1457ca551 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -88,7 +88,6 @@
#define L_PMD_SECT_VALID (_AT(pmdval_t, 1) << 0)
#define L_PMD_SECT_DIRTY (_AT(pmdval_t, 1) << 55)
-#define L_PMD_SECT_SPLITTING (_AT(pmdval_t, 1) << 56)
#define L_PMD_SECT_NONE (_AT(pmdval_t, 1) << 57)
#define L_PMD_SECT_RDONLY (_AT(pteval_t, 1) << 58)
@@ -232,13 +231,6 @@ static inline pte_t pte_mkspecial(pte_t pte)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define pmd_trans_huge(pmd) (pmd_val(pmd) && !pmd_table(pmd))
-#define pmd_trans_splitting(pmd) (pmd_isset((pmd), L_PMD_SECT_SPLITTING))
-
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp);
-#endif
#endif
#define PMD_BIT_FUNC(fn,op) \
@@ -246,7 +238,6 @@ static inline pmd_t pmd_##fn(pmd_t pmd) { pmd_val(pmd) op; return pmd; }
PMD_BIT_FUNC(wrprotect, |= L_PMD_SECT_RDONLY);
PMD_BIT_FUNC(mkold, &= ~PMD_SECT_AF);
-PMD_BIT_FUNC(mksplitting, |= L_PMD_SECT_SPLITTING);
PMD_BIT_FUNC(mkwrite, &= ~L_PMD_SECT_RDONLY);
PMD_BIT_FUNC(mkdirty, |= L_PMD_SECT_DIRTY);
PMD_BIT_FUNC(mkyoung, |= PMD_SECT_AF);
diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c
index d72b90905132..96554afe1b83 100644
--- a/arch/arm/lib/uaccess_with_memcpy.c
+++ b/arch/arm/lib/uaccess_with_memcpy.c
@@ -52,14 +52,13 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
*
* Lock the page table for the destination and check
* to see that it's still huge and whether or not we will
- * need to fault on write, or if we have a splitting THP.
+ * need to fault on write.
*/
if (unlikely(pmd_thp_or_huge(*pmd))) {
ptl = &current->mm->page_table_lock;
spin_lock(ptl);
if (unlikely(!pmd_thp_or_huge(*pmd)
- || pmd_hugewillfault(*pmd)
- || pmd_trans_splitting(*pmd))) {
+ || pmd_hugewillfault(*pmd))) {
spin_unlock(ptl);
return 0;
}
diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c
index 1ec8e7590fc6..d0ba3551d49a 100644
--- a/arch/arm/mm/flush.c
+++ b/arch/arm/mm/flush.c
@@ -330,7 +330,7 @@ void flush_dcache_page(struct page *page)
mapping = page_mapping(page);
if (!cache_ops_need_broadcast() &&
- mapping && !page_mapped(page))
+ mapping && !page_mapcount(page))
clear_bit(PG_dcache_clean, &page->flags);
else {
__flush_dcache_page(mapping, page);
@@ -415,18 +415,3 @@ void __flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned l
*/
__cpuc_flush_dcache_area(page_address(page), PAGE_SIZE);
}
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp)
-{
- pmd_t pmd = pmd_mksplitting(*pmdp);
- VM_BUG_ON(address & ~PMD_MASK);
- set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-
- /* dummy IPI to serialise against fast_gup */
- kick_all_cpus_sync();
-}
-#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 9819a9426b69..fe2943396fdf 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -332,19 +332,11 @@ static inline pgprot_t mk_sect_prot(pgprot_t prot)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define pmd_trans_huge(pmd) (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT))
-#define pmd_trans_splitting(pmd) pte_special(pmd_pte(pmd))
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-struct vm_area_struct;
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp);
-#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
#define pmd_young(pmd) pte_young(pmd_pte(pmd))
#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd)))
-#define pmd_mksplitting(pmd) pte_pmd(pte_mkspecial(pmd_pte(pmd)))
#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd)))
#define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd)))
#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd)))
diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c
index c26b804015e8..3d59de89c042 100644
--- a/arch/arm64/mm/flush.c
+++ b/arch/arm64/mm/flush.c
@@ -99,19 +99,3 @@ EXPORT_SYMBOL(flush_dcache_page);
* Additional functions defined in assembly.
*/
EXPORT_SYMBOL(flush_icache_range);
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp)
-{
- pmd_t pmd = pmd_mksplitting(*pmdp);
-
- VM_BUG_ON(address & ~PMD_MASK);
- set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-
- /* dummy IPI to serialise against fast_gup */
- kick_all_cpus_sync();
-}
-#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/mips/include/asm/pgtable-bits.h b/arch/mips/include/asm/pgtable-bits.h
index ff7ad91c85db..97b313882678 100644
--- a/arch/mips/include/asm/pgtable-bits.h
+++ b/arch/mips/include/asm/pgtable-bits.h
@@ -131,14 +131,12 @@
/* Huge TLB page */
#define _PAGE_HUGE_SHIFT (_PAGE_MODIFIED_SHIFT + 1)
#define _PAGE_HUGE (1 << _PAGE_HUGE_SHIFT)
-#define _PAGE_SPLITTING_SHIFT (_PAGE_HUGE_SHIFT + 1)
-#define _PAGE_SPLITTING (1 << _PAGE_SPLITTING_SHIFT)
#endif /* CONFIG_64BIT && CONFIG_MIPS_HUGE_TLB_SUPPORT */
#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR6)
/* XI - page cannot be executed */
-#ifdef _PAGE_SPLITTING_SHIFT
-#define _PAGE_NO_EXEC_SHIFT (_PAGE_SPLITTING_SHIFT + 1)
+#ifdef _PAGE_HUGE_SHIFT
+#define _PAGE_NO_EXEC_SHIFT (_PAGE_HUGE_SHIFT + 1)
#else
#define _PAGE_NO_EXEC_SHIFT (_PAGE_MODIFIED_SHIFT + 1)
#endif
@@ -153,8 +151,8 @@
#if defined(_PAGE_NO_READ_SHIFT)
#define _PAGE_GLOBAL_SHIFT (_PAGE_NO_READ_SHIFT + 1)
-#elif defined(_PAGE_SPLITTING_SHIFT)
-#define _PAGE_GLOBAL_SHIFT (_PAGE_SPLITTING_SHIFT + 1)
+#elif defined(_PAGE_HUGE_SHIFT)
+#define _PAGE_GLOBAL_SHIFT (_PAGE_HUGE_SHIFT + 1)
#else
#define _PAGE_GLOBAL_SHIFT (_PAGE_MODIFIED_SHIFT + 1)
#endif
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index 8957f15e21ec..6995b4a02e23 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -482,27 +482,9 @@ static inline pmd_t pmd_mkhuge(pmd_t pmd)
return pmd;
}
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
- return !!(pmd_val(pmd) & _PAGE_SPLITTING);
-}
-
-static inline pmd_t pmd_mksplitting(pmd_t pmd)
-{
- pmd_val(pmd) |= _PAGE_SPLITTING;
-
- return pmd;
-}
-
extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd);
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-/* Extern to avoid header file madness */
-extern void pmdp_splitting_flush(struct vm_area_struct *vma,
- unsigned long address,
- pmd_t *pmdp);
-
#define __HAVE_ARCH_PMD_WRITE
static inline int pmd_write(pmd_t pmd)
{
diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c
index 5d3a25e1cfae..caac3d747a90 100644
--- a/arch/mips/mm/c-r4k.c
+++ b/arch/mips/mm/c-r4k.c
@@ -587,7 +587,8 @@ static inline void local_r4k_flush_cache_page(void *args)
* another ASID than the current one.
*/
map_coherent = (cpu_has_dc_aliases &&
- page_mapped(page) && !Page_dcache_dirty(page));
+ page_mapcount(page) &&
+ !Page_dcache_dirty(page));
if (map_coherent)
vaddr = kmap_coherent(page, addr);
else
diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c
index aab218c36e0d..3f159caf6dbc 100644
--- a/arch/mips/mm/cache.c
+++ b/arch/mips/mm/cache.c
@@ -106,7 +106,7 @@ void __flush_anon_page(struct page *page, unsigned long vmaddr)
unsigned long addr = (unsigned long) page_address(page);
if (pages_do_alias(addr, vmaddr)) {
- if (page_mapped(page) && !Page_dcache_dirty(page)) {
+ if (page_mapcount(page) && !Page_dcache_dirty(page)) {
void *kaddr;
kaddr = kmap_coherent(page, vmaddr);
diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c
index 349995d19c7f..1afd87c999b0 100644
--- a/arch/mips/mm/gup.c
+++ b/arch/mips/mm/gup.c
@@ -87,8 +87,6 @@ static int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end,
do {
VM_BUG_ON(compound_head(page) != head);
pages[*nr] = page;
- if (PageTail(page))
- get_huge_page_tail(page);
(*nr)++;
page++;
refs++;
@@ -109,18 +107,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
pmd_t pmd = *pmdp;
next = pmd_addr_end(addr, end);
- /*
- * The pmd_trans_splitting() check below explains why
- * pmdp_splitting_flush has to flush the tlb, to stop
- * this gup-fast code from running while we set the
- * splitting bit in the pmd. Returning zero will take
- * the slow path that will call wait_split_huge_page()
- * if the pmd is still in splitting state. gup-fast
- * can't because it has irq disabled and
- * wait_split_huge_page() would never return as the
- * tlb flush IPI wouldn't run.
- */
- if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+ if (pmd_none(pmd))
return 0;
if (unlikely(pmd_huge(pmd))) {
if (!gup_huge_pmd(pmd, addr, next, write, pages,nr))
@@ -153,8 +140,6 @@ static int gup_huge_pud(pud_t pud, unsigned long addr, unsigned long end,
do {
VM_BUG_ON(compound_head(page) != head);
pages[*nr] = page;
- if (PageTail(page))
- get_huge_page_tail(page);
(*nr)++;
page++;
refs++;
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index 8770e619185e..7e5fa0938c21 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -165,7 +165,7 @@ void copy_user_highpage(struct page *to, struct page *from,
vto = kmap_atomic(to);
if (cpu_has_dc_aliases &&
- page_mapped(from) && !Page_dcache_dirty(from)) {
+ page_mapcount(from) && !Page_dcache_dirty(from)) {
vfrom = kmap_coherent(from, vaddr);
copy_page(vto, vfrom);
kunmap_coherent();
@@ -187,7 +187,7 @@ void copy_to_user_page(struct vm_area_struct *vma,
unsigned long len)
{
if (cpu_has_dc_aliases &&
- page_mapped(page) && !Page_dcache_dirty(page)) {
+ page_mapcount(page) && !Page_dcache_dirty(page)) {
void *vto = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
memcpy(vto, src, len);
kunmap_coherent();
@@ -205,7 +205,7 @@ void copy_from_user_page(struct vm_area_struct *vma,
unsigned long len)
{
if (cpu_has_dc_aliases &&
- page_mapped(page) && !Page_dcache_dirty(page)) {
+ page_mapcount(page) && !Page_dcache_dirty(page)) {
void *vfrom = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
memcpy(dst, vfrom, len);
kunmap_coherent();
diff --git a/arch/mips/mm/pgtable-64.c b/arch/mips/mm/pgtable-64.c
index e8adc0069d66..ce4473e7c0d2 100644
--- a/arch/mips/mm/pgtable-64.c
+++ b/arch/mips/mm/pgtable-64.c
@@ -62,20 +62,6 @@ void pmd_init(unsigned long addr, unsigned long pagetable)
}
#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-
-void pmdp_splitting_flush(struct vm_area_struct *vma,
- unsigned long address,
- pmd_t *pmdp)
-{
- if (!pmd_trans_splitting(*pmdp)) {
- pmd_t pmd = pmd_mksplitting(*pmdp);
- set_pmd_at(vma->vm_mm, address, pmdp, pmd);
- }
-}
-
-#endif
-
pmd_t mk_pmd(struct page *page, pgprot_t prot)
{
pmd_t pmd;
diff --git a/arch/mips/mm/tlbex.c b/arch/mips/mm/tlbex.c
index 32e0be27673f..482192cc8f2b 100644
--- a/arch/mips/mm/tlbex.c
+++ b/arch/mips/mm/tlbex.c
@@ -240,7 +240,6 @@ static void output_pgtable_bits_defines(void)
pr_define("_PAGE_MODIFIED_SHIFT %d\n", _PAGE_MODIFIED_SHIFT);
#ifdef CONFIG_MIPS_HUGE_TLB_SUPPORT
pr_define("_PAGE_HUGE_SHIFT %d\n", _PAGE_HUGE_SHIFT);
- pr_define("_PAGE_SPLITTING_SHIFT %d\n", _PAGE_SPLITTING_SHIFT);
#endif
#ifdef CONFIG_CPU_MIPSR2
if (cpu_has_rixi) {
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index 3245f2d96d4f..0db2a3f8e554 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -374,11 +374,6 @@ void pgtable_cache_init(void);
#endif /* __ASSEMBLY__ */
/*
- * THP pages can't be special. So use the _PAGE_SPECIAL
- */
-#define _PAGE_SPLITTING _PAGE_SPECIAL
-
-/*
* We need to differentiate between explicit huge page and THP huge
* page, since THP huge page also need to track real subpage details
*/
@@ -387,9 +382,8 @@ void pgtable_cache_init(void);
/*
* set of bits not changed in pmd_modify.
*/
-#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | \
- _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPLITTING | \
- _PAGE_THP_HUGE)
+#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \
+ _PAGE_ACCESSED | _PAGE_THP_HUGE)
#ifndef __ASSEMBLY__
/*
@@ -471,13 +465,6 @@ static inline int pmd_trans_huge(pmd_t pmd)
return (pmd_val(pmd) & 0x3) && (pmd_val(pmd) & _PAGE_THP_HUGE);
}
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
- if (pmd_trans_huge(pmd))
- return pmd_val(pmd) & _PAGE_SPLITTING;
- return 0;
-}
-
extern int has_transparent_hugepage(void);
#else
static inline void hpte_do_hugepage_flush(struct mm_struct *mm,
@@ -536,12 +523,6 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd)
return pmd;
}
-static inline pmd_t pmd_mksplitting(pmd_t pmd)
-{
- pmd_val(pmd) |= _PAGE_SPLITTING;
- return pmd;
-}
-
#define __HAVE_ARCH_PMD_SAME
static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
{
@@ -592,10 +573,6 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
pmd_hugepage_update(mm, addr, pmdp, _PAGE_RW, 0);
}
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-extern void pmdp_splitting_flush(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmdp);
-
extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp);
#define pmdp_collapse_flush pmdp_collapse_flush
diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c
index 4d87122cf6a7..6ffade530dab 100644
--- a/arch/powerpc/mm/hugepage-hash64.c
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -39,9 +39,6 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
/* If PMD busy, retry the access */
if (unlikely(old_pmd & _PAGE_BUSY))
return 0;
- /* If PMD is trans splitting retry the access */
- if (unlikely(old_pmd & _PAGE_SPLITTING))
- return 0;
/* If PMD permissions don't match, take page fault */
if (unlikely(access & ~old_pmd))
return 1;
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 9833fee493ec..cd2d82efe1cd 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -1030,10 +1030,6 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
/*
* A hugepage collapse is captured by pmd_none, because
* it mark the pmd none and do a hpte invalidate.
- *
- * We don't worry about pmd_trans_splitting here, The
- * caller if it needs to handle the splitting case
- * should check for that.
*/
if (pmd_none(pmd))
return NULL;
@@ -1071,7 +1067,7 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
{
unsigned long mask;
unsigned long pte_end;
- struct page *head, *page, *tail;
+ struct page *head, *page;
pte_t pte;
int refs;
@@ -1094,7 +1090,6 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
head = pte_page(pte);
page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
- tail = page;
do {
VM_BUG_ON(compound_head(page) != head);
pages[*nr] = page;
@@ -1116,15 +1111,5 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
return 0;
}
- /*
- * Any tail page need their mapcount reference taken before we
- * return.
- */
- while (refs--) {
- if (PageTail(tail))
- get_huge_page_tail(tail);
- tail++;
- }
-
return 1;
}
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index e92cb2146b18..422c59a24561 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -604,55 +604,6 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
}
/*
- * We mark the pmd splitting and invalidate all the hpte
- * entries for this hugepage.
- */
-void pmdp_splitting_flush(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmdp)
-{
- unsigned long old, tmp;
-
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-
-#ifdef CONFIG_DEBUG_VM
- WARN_ON(!pmd_trans_huge(*pmdp));
- assert_spin_locked(&vma->vm_mm->page_table_lock);
-#endif
-
-#ifdef PTE_ATOMIC_UPDATES
-
- __asm__ __volatile__(
- "1: ldarx %0,0,%3\n\
- andi. %1,%0,%6\n\
- bne- 1b \n\
- ori %1,%0,%4 \n\
- stdcx. %1,0,%3 \n\
- bne- 1b"
- : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
- : "r" (pmdp), "i" (_PAGE_SPLITTING), "m" (*pmdp), "i" (_PAGE_BUSY)
- : "cc" );
-#else
- old = pmd_val(*pmdp);
- *pmdp = __pmd(old | _PAGE_SPLITTING);
-#endif
- /*
- * If we didn't had the splitting flag set, go and flush the
- * HPTE entries.
- */
- trace_hugepage_splitting(address, old);
- if (!(old & _PAGE_SPLITTING)) {
- /* We need to flush the hpte */
- if (old & _PAGE_HASHPTE)
- hpte_do_hugepage_flush(vma->vm_mm, address, pmdp, old);
- }
- /*
- * This ensures that generic code that rely on IRQ disabling
- * to prevent a parallel THP split work as expected.
- */
- kick_all_cpus_sync();
-}
-
-/*
* We want to put the pgtable in pmd and use pgtable for tracking
* the base page size hptes
*/
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index fa9fb5b4c66c..d5543514c1df 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -135,7 +135,7 @@ static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->vma;
- split_huge_page_pmd(vma, addr, pmd);
+ split_huge_pmd(vma, pmd, addr);
return 0;
}
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 024f85f947ae..64ead8091248 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -286,7 +286,6 @@ static inline int is_module_addr(void *addr)
#define _SEGMENT_ENTRY_DIRTY 0x2000 /* SW segment dirty bit */
#define _SEGMENT_ENTRY_YOUNG 0x1000 /* SW segment young bit */
-#define _SEGMENT_ENTRY_SPLIT 0x0800 /* THP splitting bit */
#define _SEGMENT_ENTRY_LARGE 0x0400 /* STE-format control, large page */
#define _SEGMENT_ENTRY_READ 0x0002 /* SW segment read bit */
#define _SEGMENT_ENTRY_WRITE 0x0001 /* SW segment write bit */
@@ -318,8 +317,6 @@ static inline int is_module_addr(void *addr)
* SW-bits: y young, d dirty, r read, w write
*/
-#define _SEGMENT_ENTRY_SPLIT_BIT 11 /* THP splitting bit number */
-
/* Page status table bits for virtualization */
#define PGSTE_ACC_BITS 0xf000000000000000UL
#define PGSTE_FP_BIT 0x0800000000000000UL
@@ -523,10 +520,6 @@ static inline int pmd_bad(pmd_t pmd)
return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0;
}
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-extern void pmdp_splitting_flush(struct vm_area_struct *vma,
- unsigned long addr, pmd_t *pmdp);
-
#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
extern int pmdp_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp,
@@ -1424,8 +1417,7 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
if (pmd_large(pmd)) {
pmd_val(pmd) &= _SEGMENT_ENTRY_ORIGIN_LARGE |
_SEGMENT_ENTRY_DIRTY | _SEGMENT_ENTRY_YOUNG |
- _SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_SPLIT |
- _SEGMENT_ENTRY_SOFT_DIRTY;
+ _SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_SOFT_DIRTY;
pmd_val(pmd) |= massage_pgprot_pmd(newprot);
if (!(pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY))
pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT;
@@ -1533,12 +1525,6 @@ extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
#define __HAVE_ARCH_PGTABLE_WITHDRAW
extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
- return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) &&
- (pmd_val(pmd) & _SEGMENT_ENTRY_SPLIT);
-}
-
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t entry)
{
diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c
index 12bbf0e8478f..79f09170943c 100644
--- a/arch/s390/mm/gup.c
+++ b/arch/s390/mm/gup.c
@@ -55,7 +55,7 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
unsigned long mask, result;
- struct page *head, *page, *tail;
+ struct page *head, *page;
int refs;
result = write ? 0 : _SEGMENT_ENTRY_PROTECT;
@@ -67,7 +67,6 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
refs = 0;
head = pmd_page(pmd);
page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- tail = page;
do {
VM_BUG_ON(compound_head(page) != head);
pages[*nr] = page;
@@ -88,16 +87,6 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
return 0;
}
- /*
- * Any tail page need their mapcount reference taken before we
- * return.
- */
- while (refs--) {
- if (PageTail(tail))
- get_huge_page_tail(tail);
- tail++;
- }
-
return 1;
}
@@ -116,16 +105,7 @@ static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
pmd = *pmdp;
barrier();
next = pmd_addr_end(addr, end);
- /*
- * The pmd_trans_splitting() check below explains why
- * pmdp_splitting_flush() has to serialize with
- * smp_call_function() against our disabled IRQs, to stop
- * this gup-fast code from running while we set the
- * splitting bit in the pmd. Returning zero will take
- * the slow path that will call wait_split_huge_page()
- * if the pmd is still in splitting state.
- */
- if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+ if (pmd_none(pmd))
return 0;
if (unlikely(pmd_large(pmd))) {
/*
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 54ef3bc01b43..34f3790fe459 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -1308,22 +1308,6 @@ int pmdp_set_access_flags(struct vm_area_struct *vma,
return 1;
}
-static void pmdp_splitting_flush_sync(void *arg)
-{
- /* Simply deliver the interrupt */
-}
-
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp)
-{
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
- (unsigned long *) pmdp)) {
- /* need to serialize against gup-fast (IRQ disabled) */
- smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
- }
-}
-
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable)
{
diff --git a/arch/sh/mm/cache-sh4.c b/arch/sh/mm/cache-sh4.c
index 51d8f7f31d1d..58aaa4f33b81 100644
--- a/arch/sh/mm/cache-sh4.c
+++ b/arch/sh/mm/cache-sh4.c
@@ -241,7 +241,7 @@ static void sh4_flush_cache_page(void *args)
*/
map_coherent = (current_cpu_data.dcache.n_aliases &&
test_bit(PG_dcache_clean, &page->flags) &&
- page_mapped(page));
+ page_mapcount(page));
if (map_coherent)
vaddr = kmap_coherent(page, address);
else
diff --git a/arch/sh/mm/cache.c b/arch/sh/mm/cache.c
index f770e3992620..e58cfbf45150 100644
--- a/arch/sh/mm/cache.c
+++ b/arch/sh/mm/cache.c
@@ -59,7 +59,7 @@ void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
unsigned long vaddr, void *dst, const void *src,
unsigned long len)
{
- if (boot_cpu_data.dcache.n_aliases && page_mapped(page) &&
+ if (boot_cpu_data.dcache.n_aliases && page_mapcount(page) &&
test_bit(PG_dcache_clean, &page->flags)) {
void *vto = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
memcpy(vto, src, len);
@@ -78,7 +78,7 @@ void copy_from_user_page(struct vm_area_struct *vma, struct page *page,
unsigned long vaddr, void *dst, const void *src,
unsigned long len)
{
- if (boot_cpu_data.dcache.n_aliases && page_mapped(page) &&
+ if (boot_cpu_data.dcache.n_aliases && page_mapcount(page) &&
test_bit(PG_dcache_clean, &page->flags)) {
void *vfrom = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
memcpy(dst, vfrom, len);
@@ -97,7 +97,7 @@ void copy_user_highpage(struct page *to, struct page *from,
vto = kmap_atomic(to);
- if (boot_cpu_data.dcache.n_aliases && page_mapped(from) &&
+ if (boot_cpu_data.dcache.n_aliases && page_mapcount(from) &&
test_bit(PG_dcache_clean, &from->flags)) {
vfrom = kmap_coherent(from, vaddr);
copy_page(vto, vfrom);
@@ -153,7 +153,7 @@ void __flush_anon_page(struct page *page, unsigned long vmaddr)
unsigned long addr = (unsigned long) page_address(page);
if (pages_do_alias(addr, vmaddr)) {
- if (boot_cpu_data.dcache.n_aliases && page_mapped(page) &&
+ if (boot_cpu_data.dcache.n_aliases && page_mapcount(page) &&
test_bit(PG_dcache_clean, &page->flags)) {
void *kaddr;
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 131d36fcd07a..f5bfcd66aeb5 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -681,13 +681,6 @@ static inline unsigned long pmd_trans_huge(pmd_t pmd)
return pte_val(pte) & _PAGE_PMD_HUGE;
}
-static inline unsigned long pmd_trans_splitting(pmd_t pmd)
-{
- pte_t pte = __pte(pmd_val(pmd));
-
- return pmd_trans_huge(pmd) && pte_special(pte);
-}
-
#define has_transparent_hugepage() 1
static inline pmd_t pmd_mkold(pmd_t pmd)
@@ -735,15 +728,6 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd)
return __pmd(pte_val(pte));
}
-static inline pmd_t pmd_mksplitting(pmd_t pmd)
-{
- pte_t pte = __pte(pmd_val(pmd));
-
- pte = pte_mkspecial(pte);
-
- return __pmd(pte_val(pte));
-}
-
static inline pgprot_t pmd_pgprot(pmd_t entry)
{
unsigned long val = pmd_val(entry);
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index dbabe5713a15..cb841a33da59 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -113,9 +113,6 @@ static unsigned int get_user_insn(unsigned long tpc)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (pmd_trans_huge(*pmdp)) {
- if (pmd_trans_splitting(*pmdp))
- goto out_irq_enable;
-
pa = pmd_pfn(*pmdp) << PAGE_SHIFT;
pa += tpc & ~HPAGE_MASK;
diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c
index 2e5c4fc2daa9..eb3d8e8ebc6b 100644
--- a/arch/sparc/mm/gup.c
+++ b/arch/sparc/mm/gup.c
@@ -56,8 +56,6 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
put_page(head);
return 0;
}
- if (head != page)
- get_huge_page_tail(page);
pages[*nr] = page;
(*nr)++;
@@ -70,7 +68,7 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
unsigned long end, int write, struct page **pages,
int *nr)
{
- struct page *head, *page, *tail;
+ struct page *head, *page;
int refs;
if (!(pmd_val(pmd) & _PAGE_VALID))
@@ -82,7 +80,6 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
refs = 0;
head = pmd_page(pmd);
page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- tail = page;
do {
VM_BUG_ON(compound_head(page) != head);
pages[*nr] = page;
@@ -103,15 +100,6 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
return 0;
}
- /* Any tail page need their mapcount reference taken before we
- * return.
- */
- while (refs--) {
- if (PageTail(tail))
- get_huge_page_tail(tail);
- tail++;
- }
-
return 1;
}
@@ -126,7 +114,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
pmd_t pmd = *pmdp;
next = pmd_addr_end(addr, end);
- if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+ if (pmd_none(pmd))
return 0;
if (unlikely(pmd_large(pmd))) {
if (!gup_huge_pmd(pmdp, pmd, addr, next,
diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h
index 2b05ccbebed9..96cecf55522e 100644
--- a/arch/tile/include/asm/pgtable.h
+++ b/arch/tile/include/asm/pgtable.h
@@ -489,16 +489,6 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define has_transparent_hugepage() 1
#define pmd_trans_huge pmd_huge_page
-
-static inline pmd_t pmd_mksplitting(pmd_t pmd)
-{
- return pte_pmd(hv_pte_set_client2(pmd_pte(pmd)));
-}
-
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
- return hv_pte_get_client2(pmd_pte(pmd));
-}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 6ec0c8b2e9df..a8d1aa3a43b0 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -165,11 +165,6 @@ static inline int pmd_large(pmd_t pte)
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
- return pmd_val(pmd) & _PAGE_SPLITTING;
-}
-
static inline int pmd_trans_huge(pmd_t pmd)
{
return pmd_val(pmd) & _PAGE_PSE;
@@ -816,10 +811,6 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp);
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-extern void pmdp_splitting_flush(struct vm_area_struct *vma,
- unsigned long addr, pmd_t *pmdp);
-
#define __HAVE_ARCH_PMD_WRITE
static inline int pmd_write(pmd_t pmd)
{
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index dd5b0aa9dd2f..116fc4ee586f 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -22,7 +22,6 @@
#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
#define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1
#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
-#define _PAGE_BIT_SPLITTING _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */
#define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
@@ -46,7 +45,6 @@
#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
-#define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING)
#define __HAVE_ARCH_PTE_SPECIAL
#ifdef CONFIG_KMEMCHECK
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 819ab3f9c9c7..22db575a2fec 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -337,6 +337,7 @@ void arch_crash_save_vmcoreinfo(void)
#endif
vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
kaslr_offset());
+ VMCOREINFO_PHYS_BASE(phys_base);
}
/* arch-dependent functionality related to kexec file-based syscall */
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 524619351961..3a5330213aca 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -175,7 +175,11 @@ static void mark_screen_rdonly(struct mm_struct *mm)
if (pud_none_or_clear_bad(pud))
goto out;
pmd = pmd_offset(pud, 0xA0000);
- split_huge_page_pmd_mm(mm, 0xA0000, pmd);
+
+ if (pmd_trans_huge(*pmd)) {
+ struct vm_area_struct *vma = find_vma(mm, 0xA0000);
+ split_huge_pmd(vma, pmd, 0xA0000);
+ }
if (pmd_none_or_clear_bad(pmd))
goto out;
pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index ae9a37bf1371..f8cb3e8ac250 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -136,8 +136,6 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
- if (PageTail(page))
- get_huge_page_tail(page);
(*nr)++;
page++;
refs++;
@@ -158,18 +156,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
pmd_t pmd = *pmdp;
next = pmd_addr_end(addr, end);
- /*
- * The pmd_trans_splitting() check below explains why
- * pmdp_splitting_flush has to flush the tlb, to stop
- * this gup-fast code from running while we set the
- * splitting bit in the pmd. Returning zero will take
- * the slow path that will call wait_split_huge_page()
- * if the pmd is still in splitting state. gup-fast
- * can't because it has irq disabled and
- * wait_split_huge_page() would never return as the
- * tlb flush IPI wouldn't run.
- */
- if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+ if (pmd_none(pmd))
return 0;
if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) {
/*
@@ -212,8 +199,6 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
- if (PageTail(page))
- get_huge_page_tail(page);
(*nr)++;
page++;
refs++;
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index fb0a9dd1d6e4..f52caf9c519b 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -509,20 +509,6 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
return young;
}
-
-void pmdp_splitting_flush(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmdp)
-{
- int set;
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
- (unsigned long *)pmdp);
- if (set) {
- pmd_update(vma->vm_mm, address, pmdp);
- /* need tlb flush only to serialize against gup-fast */
- flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
- }
-}
#endif
/**
diff --git a/arch/xtensa/mm/tlb.c b/arch/xtensa/mm/tlb.c
index 5ece856c5725..35c822286bbe 100644
--- a/arch/xtensa/mm/tlb.c
+++ b/arch/xtensa/mm/tlb.c
@@ -245,7 +245,7 @@ static int check_tlb_entry(unsigned w, unsigned e, bool dtlb)
page_mapcount(p));
if (!page_count(p))
rc |= TLB_INSANE;
- else if (page_mapped(p))
+ else if (page_mapcount(p))
rc |= TLB_SUSPICIOUS;
} else {
rc |= TLB_INSANE;
diff --git a/block/genhd.c b/block/genhd.c
index e5cafa51567c..ebb41feea357 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -852,7 +852,7 @@ static int show_partition(struct seq_file *seqf, void *v)
char buf[BDEVNAME_SIZE];
/* Don't show non-partitionable removeable devices or empty devices */
- if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
+ if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) &&
(sgp->flags & GENHD_FL_REMOVABLE)))
return 0;
if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
diff --git a/drivers/iio/Kconfig b/drivers/iio/Kconfig
index 66792e707d74..5dfb4f1c348c 100644
--- a/drivers/iio/Kconfig
+++ b/drivers/iio/Kconfig
@@ -22,6 +22,14 @@ if IIO_BUFFER
source "drivers/iio/buffer/Kconfig"
endif # IIO_BUFFER
+config IIO_CONFIGFS
+ tristate "Enable IIO configuration via configfs"
+ select CONFIGFS_FS
+ help
+ This allows configuring various IIO bits through configfs
+ (e.g. software triggers). For more info see
+ Documentation/iio/iio_configfs.txt.
+
config IIO_TRIGGER
bool "Enable triggered sampling support"
help
@@ -38,6 +46,14 @@ config IIO_CONSUMERS_PER_TRIGGER
This value controls the maximum number of consumers that a
given trigger may handle. Default is 2.
+config IIO_SW_TRIGGER
+ tristate "Enable software triggers support"
+ select IIO_CONFIGFS
+ help
+ Provides IIO core support for software triggers. A software
+ trigger can be created via configfs or directly by a driver
+ using the API provided.
+
config IIO_TRIGGERED_EVENT
tristate
select IIO_TRIGGER
diff --git a/drivers/iio/Makefile b/drivers/iio/Makefile
index aeca7269fe44..dc73c1f1583f 100644
--- a/drivers/iio/Makefile
+++ b/drivers/iio/Makefile
@@ -7,6 +7,8 @@ industrialio-y := industrialio-core.o industrialio-event.o inkern.o
industrialio-$(CONFIG_IIO_BUFFER) += industrialio-buffer.o
industrialio-$(CONFIG_IIO_TRIGGER) += industrialio-trigger.o
+obj-$(CONFIG_IIO_CONFIGFS) += industrialio-configfs.o
+obj-$(CONFIG_IIO_SW_TRIGGER) += industrialio-sw-trigger.o
obj-$(CONFIG_IIO_TRIGGERED_EVENT) += industrialio-triggered-event.o
obj-y += accel/
diff --git a/drivers/iio/industrialio-configfs.c b/drivers/iio/industrialio-configfs.c
new file mode 100644
index 000000000000..83563dd7fcf4
--- /dev/null
+++ b/drivers/iio/industrialio-configfs.c
@@ -0,0 +1,50 @@
+/*
+ * Industrial I/O configfs bits
+ *
+ * Copyright (c) 2015 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/configfs.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kmod.h>
+#include <linux/slab.h>
+
+#include <linux/iio/iio.h>
+
+static struct config_item_type iio_root_group_type = {
+ .ct_owner = THIS_MODULE,
+};
+
+struct configfs_subsystem iio_configfs_subsys = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "iio",
+ .ci_type = &iio_root_group_type,
+ },
+ },
+ .su_mutex = __MUTEX_INITIALIZER(iio_configfs_subsys.su_mutex),
+};
+EXPORT_SYMBOL(iio_configfs_subsys);
+
+static int __init iio_configfs_init(void)
+{
+ config_group_init(&iio_configfs_subsys.su_group);
+
+ return configfs_register_subsystem(&iio_configfs_subsys);
+}
+module_init(iio_configfs_init);
+
+static void __exit iio_configfs_exit(void)
+{
+ configfs_unregister_subsystem(&iio_configfs_subsys);
+}
+module_exit(iio_configfs_exit);
+
+MODULE_AUTHOR("Daniel Baluta <daniel.baluta@intel.com>");
+MODULE_DESCRIPTION("Industrial I/O configfs support");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/iio/industrialio-sw-trigger.c b/drivers/iio/industrialio-sw-trigger.c
new file mode 100644
index 000000000000..22262f68db6e
--- /dev/null
+++ b/drivers/iio/industrialio-sw-trigger.c
@@ -0,0 +1,181 @@
+/*
+ * The Industrial I/O core, software trigger functions
+ *
+ * Copyright (c) 2015 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kmod.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+
+#include <linux/iio/sw_trigger.h>
+#include <linux/configfs.h>
+
+static struct config_group *iio_triggers_group;
+static struct config_item_type iio_trigger_type_group_type;
+
+static struct config_item_type iio_triggers_group_type = {
+ .ct_owner = THIS_MODULE,
+};
+
+static LIST_HEAD(iio_trigger_types_list);
+static DEFINE_MUTEX(iio_trigger_types_lock);
+
+static
+struct iio_sw_trigger_type *__iio_find_sw_trigger_type(const char *name,
+ unsigned len)
+{
+ struct iio_sw_trigger_type *t = NULL, *iter;
+
+ list_for_each_entry(iter, &iio_trigger_types_list, list)
+ if (!strcmp(iter->name, name)) {
+ t = iter;
+ break;
+ }
+
+ return t;
+}
+
+int iio_register_sw_trigger_type(struct iio_sw_trigger_type *t)
+{
+ struct iio_sw_trigger_type *iter;
+ int ret = 0;
+
+ mutex_lock(&iio_trigger_types_lock);
+ iter = __iio_find_sw_trigger_type(t->name, strlen(t->name));
+ if (iter)
+ ret = -EBUSY;
+ else
+ list_add_tail(&t->list, &iio_trigger_types_list);
+ mutex_unlock(&iio_trigger_types_lock);
+
+ if (ret)
+ return ret;
+
+ t->group = configfs_register_default_group(iio_triggers_group, t->name,
+ &iio_trigger_type_group_type);
+ if (IS_ERR(t->group))
+ ret = PTR_ERR(t->group);
+
+ return ret;
+}
+EXPORT_SYMBOL(iio_register_sw_trigger_type);
+
+void iio_unregister_sw_trigger_type(struct iio_sw_trigger_type *t)
+{
+ struct iio_sw_trigger_type *iter;
+
+ mutex_lock(&iio_trigger_types_lock);
+ iter = __iio_find_sw_trigger_type(t->name, strlen(t->name));
+ if (iter)
+ list_del(&t->list);
+ mutex_unlock(&iio_trigger_types_lock);
+
+ configfs_unregister_default_group(t->group);
+}
+EXPORT_SYMBOL(iio_unregister_sw_trigger_type);
+
+static
+struct iio_sw_trigger_type *iio_get_sw_trigger_type(const char *name)
+{
+ struct iio_sw_trigger_type *t;
+
+ mutex_lock(&iio_trigger_types_lock);
+ t = __iio_find_sw_trigger_type(name, strlen(name));
+ if (t && !try_module_get(t->owner))
+ t = NULL;
+ mutex_unlock(&iio_trigger_types_lock);
+
+ return t;
+}
+
+struct iio_sw_trigger *iio_sw_trigger_create(const char *type, const char *name)
+{
+ struct iio_sw_trigger *t;
+ struct iio_sw_trigger_type *tt;
+
+ tt = iio_get_sw_trigger_type(type);
+ if (!tt) {
+ pr_err("Invalid trigger type: %s\n", type);
+ return ERR_PTR(-EINVAL);
+ }
+ t = tt->ops->probe(name);
+ if (IS_ERR(t))
+ goto out_module_put;
+
+ t->trigger_type = tt;
+
+ return t;
+out_module_put:
+ module_put(tt->owner);
+ return t;
+}
+EXPORT_SYMBOL(iio_sw_trigger_create);
+
+void iio_sw_trigger_destroy(struct iio_sw_trigger *t)
+{
+ struct iio_sw_trigger_type *tt = t->trigger_type;
+
+ tt->ops->remove(t);
+ module_put(tt->owner);
+}
+EXPORT_SYMBOL(iio_sw_trigger_destroy);
+
+static struct config_group *trigger_make_group(struct config_group *group,
+ const char *name)
+{
+ struct iio_sw_trigger *t;
+
+ t = iio_sw_trigger_create(group->cg_item.ci_name, name);
+ if (IS_ERR(t))
+ return ERR_CAST(t);
+
+ config_item_set_name(&t->group.cg_item, "%s", name);
+
+ return &t->group;
+}
+
+static void trigger_drop_group(struct config_group *group,
+ struct config_item *item)
+{
+ struct iio_sw_trigger *t = to_iio_sw_trigger(item);
+
+ iio_sw_trigger_destroy(t);
+ config_item_put(item);
+}
+
+static struct configfs_group_operations trigger_ops = {
+ .make_group = &trigger_make_group,
+ .drop_item = &trigger_drop_group,
+};
+
+static struct config_item_type iio_trigger_type_group_type = {
+ .ct_group_ops = &trigger_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+static int __init iio_sw_trigger_init(void)
+{
+ iio_triggers_group =
+ configfs_register_default_group(&iio_configfs_subsys.su_group,
+ "triggers",
+ &iio_triggers_group_type);
+ return PTR_ERR_OR_ZERO(iio_triggers_group);
+}
+module_init(iio_sw_trigger_init);
+
+static void __exit iio_sw_trigger_exit(void)
+{
+ configfs_unregister_default_group(iio_triggers_group);
+}
+module_exit(iio_sw_trigger_exit);
+
+MODULE_AUTHOR("Daniel Baluta <daniel.baluta@intel.com>");
+MODULE_DESCRIPTION("Industrial I/O software triggers support");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/iio/trigger/Kconfig b/drivers/iio/trigger/Kconfig
index 79996123a71b..519e6772f6f5 100644
--- a/drivers/iio/trigger/Kconfig
+++ b/drivers/iio/trigger/Kconfig
@@ -5,6 +5,16 @@
menu "Triggers - standalone"
+config IIO_HRTIMER_TRIGGER
+ tristate "High resolution timer trigger"
+ depends on IIO_SW_TRIGGER
+ help
+ Provides a frequency based IIO trigger using high resolution
+ timers as interrupt source.
+
+ To compile this driver as a module, choose M here: the
+ module will be called iio-trig-hrtimer.
+
config IIO_INTERRUPT_TRIGGER
tristate "Generic interrupt trigger"
help
diff --git a/drivers/iio/trigger/Makefile b/drivers/iio/trigger/Makefile
index 0694daecaf22..fe06eb564367 100644
--- a/drivers/iio/trigger/Makefile
+++ b/drivers/iio/trigger/Makefile
@@ -3,5 +3,7 @@
#
# When adding new entries keep the list in alphabetical order
+
+obj-$(CONFIG_IIO_HRTIMER_TRIGGER) += iio-trig-hrtimer.o
obj-$(CONFIG_IIO_INTERRUPT_TRIGGER) += iio-trig-interrupt.o
obj-$(CONFIG_IIO_SYSFS_TRIGGER) += iio-trig-sysfs.o
diff --git a/drivers/iio/trigger/iio-trig-hrtimer.c b/drivers/iio/trigger/iio-trig-hrtimer.c
new file mode 100644
index 000000000000..5e6d451febeb
--- /dev/null
+++ b/drivers/iio/trigger/iio-trig-hrtimer.c
@@ -0,0 +1,193 @@
+/**
+ * The industrial I/O periodic hrtimer trigger driver
+ *
+ * Copyright (C) Intuitive Aerial AB
+ * Written by Marten Svanfeldt, marten@intuitiveaerial.com
+ * Copyright (C) 2012, Analog Device Inc.
+ * Author: Lars-Peter Clausen <lars@metafoo.de>
+ * Copyright (C) 2015, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/hrtimer.h>
+
+#include <linux/iio/iio.h>
+#include <linux/iio/trigger.h>
+#include <linux/iio/sw_trigger.h>
+
+/* default sampling frequency - 100Hz */
+#define HRTIMER_DEFAULT_SAMPLING_FREQUENCY 100
+
+struct iio_hrtimer_info {
+ struct iio_sw_trigger swt;
+ struct hrtimer timer;
+ unsigned long sampling_frequency;
+ ktime_t period;
+};
+
+static struct config_item_type iio_hrtimer_type = {
+ .ct_owner = THIS_MODULE,
+};
+
+static
+ssize_t iio_hrtimer_show_sampling_frequency(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct iio_trigger *trig = to_iio_trigger(dev);
+ struct iio_hrtimer_info *info = iio_trigger_get_drvdata(trig);
+
+ return snprintf(buf, PAGE_SIZE, "%lu\n", info->sampling_frequency);
+}
+
+static
+ssize_t iio_hrtimer_store_sampling_frequency(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct iio_trigger *trig = to_iio_trigger(dev);
+ struct iio_hrtimer_info *info = iio_trigger_get_drvdata(trig);
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul(buf, 10, &val);
+ if (ret)
+ return ret;
+
+ if (!val || val > NSEC_PER_SEC)
+ return -EINVAL;
+
+ info->sampling_frequency = val;
+ info->period = ktime_set(0, NSEC_PER_SEC / val);
+
+ return len;
+}
+
+static DEVICE_ATTR(sampling_frequency, S_IRUGO | S_IWUSR,
+ iio_hrtimer_show_sampling_frequency,
+ iio_hrtimer_store_sampling_frequency);
+
+static struct attribute *iio_hrtimer_attrs[] = {
+ &dev_attr_sampling_frequency.attr,
+ NULL
+};
+
+static const struct attribute_group iio_hrtimer_attr_group = {
+ .attrs = iio_hrtimer_attrs,
+};
+
+static const struct attribute_group *iio_hrtimer_attr_groups[] = {
+ &iio_hrtimer_attr_group,
+ NULL
+};
+
+static enum hrtimer_restart iio_hrtimer_trig_handler(struct hrtimer *timer)
+{
+ struct iio_hrtimer_info *info;
+
+ info = container_of(timer, struct iio_hrtimer_info, timer);
+
+ hrtimer_forward_now(timer, info->period);
+ iio_trigger_poll(info->swt.trigger);
+
+ return HRTIMER_RESTART;
+}
+
+static int iio_trig_hrtimer_set_state(struct iio_trigger *trig, bool state)
+{
+ struct iio_hrtimer_info *trig_info;
+
+ trig_info = iio_trigger_get_drvdata(trig);
+
+ if (state)
+ hrtimer_start(&trig_info->timer, trig_info->period,
+ HRTIMER_MODE_REL);
+ else
+ hrtimer_cancel(&trig_info->timer);
+
+ return 0;
+}
+
+static const struct iio_trigger_ops iio_hrtimer_trigger_ops = {
+ .owner = THIS_MODULE,
+ .set_trigger_state = iio_trig_hrtimer_set_state,
+};
+
+static struct iio_sw_trigger *iio_trig_hrtimer_probe(const char *name)
+{
+ struct iio_hrtimer_info *trig_info;
+ int ret;
+
+ trig_info = kzalloc(sizeof(*trig_info), GFP_KERNEL);
+ if (!trig_info)
+ return ERR_PTR(-ENOMEM);
+
+ trig_info->swt.trigger = iio_trigger_alloc("%s", name);
+ if (!trig_info->swt.trigger) {
+ ret = -ENOMEM;
+ goto err_free_trig_info;
+ }
+
+ iio_trigger_set_drvdata(trig_info->swt.trigger, trig_info);
+ trig_info->swt.trigger->ops = &iio_hrtimer_trigger_ops;
+ trig_info->swt.trigger->dev.groups = iio_hrtimer_attr_groups;
+
+ hrtimer_init(&trig_info->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ trig_info->timer.function = iio_hrtimer_trig_handler;
+
+ trig_info->sampling_frequency = HRTIMER_DEFAULT_SAMPLING_FREQUENCY;
+ trig_info->period = ktime_set(0, NSEC_PER_SEC /
+ trig_info->sampling_frequency);
+
+ ret = iio_trigger_register(trig_info->swt.trigger);
+ if (ret)
+ goto err_free_trigger;
+
+ iio_swt_group_init_type_name(&trig_info->swt, name, &iio_hrtimer_type);
+ return &trig_info->swt;
+err_free_trigger:
+ iio_trigger_free(trig_info->swt.trigger);
+err_free_trig_info:
+ kfree(trig_info);
+
+ return ERR_PTR(ret);
+}
+
+static int iio_trig_hrtimer_remove(struct iio_sw_trigger *swt)
+{
+ struct iio_hrtimer_info *trig_info;
+
+ trig_info = iio_trigger_get_drvdata(swt->trigger);
+
+ iio_trigger_unregister(swt->trigger);
+
+ /* cancel the timer after unreg to make sure no one rearms it */
+ hrtimer_cancel(&trig_info->timer);
+ iio_trigger_free(swt->trigger);
+ kfree(trig_info);
+
+ return 0;
+}
+
+static const struct iio_sw_trigger_ops iio_trig_hrtimer_ops = {
+ .probe = iio_trig_hrtimer_probe,
+ .remove = iio_trig_hrtimer_remove,
+};
+
+static struct iio_sw_trigger_type iio_trig_hrtimer = {
+ .name = "hrtimer",
+ .owner = THIS_MODULE,
+ .ops = &iio_trig_hrtimer_ops,
+};
+
+module_iio_sw_trigger_driver(iio_trig_hrtimer);
+
+MODULE_AUTHOR("Marten Svanfeldt <marten@intuitiveaerial.com>");
+MODULE_AUTHOR("Daniel Baluta <daniel.baluta@intel.com>");
+MODULE_DESCRIPTION("Periodic hrtimer trigger for the IIO subsystem");
+MODULE_LICENSE("GPL v2");
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 0068e82217c3..0a2752b79e72 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3391,13 +3391,13 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
* should have access to this page, we're safe to simply set
* PG_locked without checking it first.
*/
- __set_page_locked(page);
+ __SetPageLocked(page);
rc = add_to_page_cache_locked(page, mapping,
page->index, gfp);
/* give up if we can't stick it in the cache */
if (rc) {
- __clear_page_locked(page);
+ __ClearPageLocked(page);
return rc;
}
@@ -3418,9 +3418,9 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
if (*bytes + PAGE_CACHE_SIZE > rsize)
break;
- __set_page_locked(page);
+ __SetPageLocked(page);
if (add_to_page_cache_locked(page, mapping, page->index, gfp)) {
- __clear_page_locked(page);
+ __ClearPageLocked(page);
break;
}
list_move_tail(&page->lru, tmplist);
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index c81ce7f200a6..a7a1b218f308 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1636,6 +1636,116 @@ const struct file_operations configfs_dir_operations = {
.iterate = configfs_readdir,
};
+/**
+ * configfs_register_group - creates a parent-child relation between two groups
+ * @parent_group: parent group
+ * @group: child group
+ *
+ * link groups, creates dentry for the child and attaches it to the
+ * parent dentry.
+ *
+ * Return: 0 on success, negative errno code on error
+ */
+int configfs_register_group(struct config_group *parent_group,
+ struct config_group *group)
+{
+ struct configfs_subsystem *subsys = parent_group->cg_subsys;
+ struct dentry *parent;
+ int ret;
+
+ mutex_lock(&subsys->su_mutex);
+ link_group(parent_group, group);
+ mutex_unlock(&subsys->su_mutex);
+
+ parent = parent_group->cg_item.ci_dentry;
+
+ mutex_lock_nested(&d_inode(parent)->i_mutex, I_MUTEX_PARENT);
+ ret = create_default_group(parent_group, group);
+ if (!ret) {
+ spin_lock(&configfs_dirent_lock);
+ configfs_dir_set_ready(group->cg_item.ci_dentry->d_fsdata);
+ spin_unlock(&configfs_dirent_lock);
+ }
+ mutex_unlock(&d_inode(parent)->i_mutex);
+ return ret;
+}
+EXPORT_SYMBOL(configfs_register_group);
+
+/**
+ * configfs_unregister_group() - unregisters a child group from its parent
+ * @group: parent group to be unregistered
+ *
+ * Undoes configfs_register_group()
+ */
+void configfs_unregister_group(struct config_group *group)
+{
+ struct configfs_subsystem *subsys = group->cg_subsys;
+ struct dentry *dentry = group->cg_item.ci_dentry;
+ struct dentry *parent = group->cg_item.ci_parent->ci_dentry;
+
+ mutex_lock_nested(&d_inode(parent)->i_mutex, I_MUTEX_PARENT);
+ spin_lock(&configfs_dirent_lock);
+ configfs_detach_prep(dentry, NULL);
+ spin_unlock(&configfs_dirent_lock);
+
+ configfs_detach_group(&group->cg_item);
+ d_inode(dentry)->i_flags |= S_DEAD;
+ dont_mount(dentry);
+ d_delete(dentry);
+ mutex_unlock(&d_inode(parent)->i_mutex);
+
+ dput(dentry);
+
+ mutex_lock(&subsys->su_mutex);
+ unlink_group(group);
+ mutex_unlock(&subsys->su_mutex);
+}
+EXPORT_SYMBOL(configfs_unregister_group);
+
+/**
+ * configfs_register_default_group() - allocates and registers a child group
+ * @parent_group: parent group
+ * @name: child group name
+ * @item_type: child item type description
+ *
+ * boilerplate to allocate and register a child group with its parent. We need
+ * kzalloc'ed memory because child's default_group is initially empty.
+ *
+ * Return: allocated config group or ERR_PTR() on error
+ */
+struct config_group *
+configfs_register_default_group(struct config_group *parent_group,
+ const char *name,
+ struct config_item_type *item_type)
+{
+ int ret;
+ struct config_group *group;
+
+ group = kzalloc(sizeof(*group), GFP_KERNEL);
+ if (!group)
+ return ERR_PTR(-ENOMEM);
+ config_group_init_type_name(group, name, item_type);
+
+ ret = configfs_register_group(parent_group, group);
+ if (ret) {
+ kfree(group);
+ return ERR_PTR(ret);
+ }
+ return group;
+}
+EXPORT_SYMBOL(configfs_register_default_group);
+
+/**
+ * configfs_unregister_default_group() - unregisters and frees a child group
+ * @group: the group to act on
+ */
+void configfs_unregister_default_group(struct config_group *group)
+{
+ configfs_unregister_group(group);
+ kfree(group);
+}
+EXPORT_SYMBOL(configfs_unregister_default_group);
+
int configfs_register_subsystem(struct configfs_subsystem *subsys)
{
int err;
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 73c64daa0f55..60f03b78914e 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -592,10 +592,7 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
}
unlock_page(page);
}
- if (PageDirty(page) || PageWriteback(page))
- *uptodate = true;
- else
- *uptodate = PageUptodate(page);
+ *uptodate = PageUptodate(page);
EXOFS_DBGMSG2("index=0x%lx uptodate=%d\n", index, *uptodate);
return page;
} else {
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 8850254136ae..7002467bfbac 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -106,7 +106,10 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
}
if (!journal) {
- ret = generic_file_fsync(file, start, end, datasync);
+ if (test_opt(inode->i_sb, BARRIER))
+ ret = generic_file_fsync(file, start, end, datasync);
+ else
+ ret = __generic_file_fsync(file, start, end, datasync);
if (!ret && !hlist_empty(&inode->i_dentry))
ret = ext4_sync_parent(inode);
goto out;
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 93fc62232ec2..5d384921524d 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -301,15 +301,59 @@ static int fat_bmap_cluster(struct inode *inode, int cluster)
return dclus;
}
-int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
- unsigned long *mapped_blocks, int create)
+int fat_get_mapped_cluster(struct inode *inode, sector_t sector,
+ sector_t last_block,
+ unsigned long *mapped_blocks, sector_t *bmap)
{
struct super_block *sb = inode->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ int cluster, offset;
+
+ cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits);
+ offset = sector & (sbi->sec_per_clus - 1);
+ cluster = fat_bmap_cluster(inode, cluster);
+ if (cluster < 0)
+ return cluster;
+ else if (cluster) {
+ *bmap = fat_clus_to_blknr(sbi, cluster) + offset;
+ *mapped_blocks = sbi->sec_per_clus - offset;
+ if (*mapped_blocks > last_block - sector)
+ *mapped_blocks = last_block - sector;
+ }
+
+ return 0;
+}
+
+static int is_exceed_eof(struct inode *inode, sector_t sector,
+ sector_t *last_block, int create)
+{
+ struct super_block *sb = inode->i_sb;
const unsigned long blocksize = sb->s_blocksize;
const unsigned char blocksize_bits = sb->s_blocksize_bits;
+
+ *last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits;
+ if (sector >= *last_block) {
+ if (!create)
+ return 1;
+
+ /*
+ * ->mmu_private can access on only allocation path.
+ * (caller must hold ->i_mutex)
+ */
+ *last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1))
+ >> blocksize_bits;
+ if (sector >= *last_block)
+ return 1;
+ }
+
+ return 0;
+}
+
+int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
+ unsigned long *mapped_blocks, int create, bool from_bmap)
+{
+ struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
sector_t last_block;
- int cluster, offset;
*phys = 0;
*mapped_blocks = 0;
@@ -321,31 +365,16 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
return 0;
}
- last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits;
- if (sector >= last_block) {
- if (!create)
+ if (!from_bmap) {
+ if (is_exceed_eof(inode, sector, &last_block, create))
return 0;
-
- /*
- * ->mmu_private can access on only allocation path.
- * (caller must hold ->i_mutex)
- */
- last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1))
- >> blocksize_bits;
+ } else {
+ last_block = inode->i_blocks >>
+ (inode->i_sb->s_blocksize_bits - 9);
if (sector >= last_block)
return 0;
}
- cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits);
- offset = sector & (sbi->sec_per_clus - 1);
- cluster = fat_bmap_cluster(inode, cluster);
- if (cluster < 0)
- return cluster;
- else if (cluster) {
- *phys = fat_clus_to_blknr(sbi, cluster) + offset;
- *mapped_blocks = sbi->sec_per_clus - offset;
- if (*mapped_blocks > last_block - sector)
- *mapped_blocks = last_block - sector;
- }
- return 0;
+ return fat_get_mapped_cluster(inode, sector, last_block, mapped_blocks,
+ phys);
}
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 4afc4d9d2e41..4c71c8c76426 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -91,7 +91,7 @@ next:
*bh = NULL;
iblock = *pos >> sb->s_blocksize_bits;
- err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0);
+ err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0, false);
if (err || !phys)
return -1; /* beyond EOF or error */
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index be5e15323bab..4307cd4f8da0 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -285,8 +285,11 @@ static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len)
extern void fat_cache_inval_inode(struct inode *inode);
extern int fat_get_cluster(struct inode *inode, int cluster,
int *fclus, int *dclus);
+extern int fat_get_mapped_cluster(struct inode *inode, sector_t sector,
+ sector_t last_block,
+ unsigned long *mapped_blocks, sector_t *bmap);
extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
- unsigned long *mapped_blocks, int create);
+ unsigned long *mapped_blocks, int create, bool from_bmap);
/* fat/dir.c */
extern const struct file_operations fat_dir_operations;
@@ -384,6 +387,7 @@ static inline unsigned long fat_dir_hash(int logstart)
{
return hash_32(logstart, FAT_HASH_BITS);
}
+extern int fat_add_cluster(struct inode *inode);
/* fat/misc.c */
extern __printf(3, 4) __cold
diff --git a/fs/fat/file.c b/fs/fat/file.c
index a08f1039909a..43d3475da83a 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -14,8 +14,12 @@
#include <linux/backing-dev.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
+#include <linux/falloc.h>
#include "fat.h"
+static long fat_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t len);
+
static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)
{
u32 attr;
@@ -177,6 +181,7 @@ const struct file_operations fat_file_operations = {
#endif
.fsync = fat_file_fsync,
.splice_read = generic_file_splice_read,
+ .fallocate = fat_fallocate,
};
static int fat_cont_expand(struct inode *inode, loff_t size)
@@ -215,6 +220,62 @@ out:
return err;
}
+/*
+ * Preallocate space for a file. This implements fat's fallocate file
+ * operation, which gets called from sys_fallocate system call. User
+ * space requests len bytes at offset. If FALLOC_FL_KEEP_SIZE is set
+ * we just allocate clusters without zeroing them out. Otherwise we
+ * allocate and zero out clusters via an expanding truncate.
+ */
+static long fat_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t len)
+{
+ int nr_cluster; /* Number of clusters to be allocated */
+ loff_t mm_bytes; /* Number of bytes to be allocated for file */
+ loff_t ondisksize; /* block aligned on-disk size in bytes*/
+ struct inode *inode = file->f_mapping->host;
+ struct super_block *sb = inode->i_sb;
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ int err = 0;
+
+ /* No support for hole punch or other fallocate flags. */
+ if (mode & ~FALLOC_FL_KEEP_SIZE)
+ return -EOPNOTSUPP;
+
+ /* No support for dir */
+ if (!S_ISREG(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ mutex_lock(&inode->i_mutex);
+ if (mode & FALLOC_FL_KEEP_SIZE) {
+ ondisksize = inode->i_blocks << 9;
+ if ((offset + len) <= ondisksize)
+ goto error;
+
+ /* First compute the number of clusters to be allocated */
+ mm_bytes = offset + len - ondisksize;
+ nr_cluster = (mm_bytes + (sbi->cluster_size - 1)) >>
+ sbi->cluster_bits;
+
+ /* Start the allocation.We are not zeroing out the clusters */
+ while (nr_cluster-- > 0) {
+ err = fat_add_cluster(inode);
+ if (err)
+ goto error;
+ }
+ } else {
+ if ((offset + len) <= i_size_read(inode))
+ goto error;
+
+ /* This is just an expanding truncate */
+ err = fat_cont_expand(inode, (offset + len));
+ }
+
+error:
+ mutex_unlock(&inode->i_mutex);
+ return err;
+}
+
/* Free all clusters after the skip'th cluster. */
static int fat_free(struct inode *inode, int skip)
{
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 509411dd3698..d04c87da4255 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -93,7 +93,7 @@ static struct fat_floppy_defaults {
},
};
-static int fat_add_cluster(struct inode *inode)
+int fat_add_cluster(struct inode *inode)
{
int err, cluster;
@@ -115,10 +115,10 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
struct super_block *sb = inode->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
unsigned long mapped_blocks;
- sector_t phys;
+ sector_t phys, last_block;
int err, offset;
- err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
+ err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false);
if (err)
return err;
if (phys) {
@@ -135,8 +135,14 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
return -EIO;
}
+ last_block = inode->i_blocks >> (sb->s_blocksize_bits - 9);
offset = (unsigned long)iblock & (sbi->sec_per_clus - 1);
- if (!offset) {
+ /*
+ * allocate a cluster according to the following.
+ * 1) no more available blocks
+ * 2) not part of fallocate region
+ */
+ if (!offset && !(iblock < last_block)) {
/* TODO: multiple cluster allocation would be desirable. */
err = fat_add_cluster(inode);
if (err)
@@ -148,7 +154,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
*max_blocks = min(mapped_blocks, *max_blocks);
MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits;
- err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
+ err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false);
if (err)
return err;
@@ -273,13 +279,38 @@ static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
return ret;
}
+static int fat_get_block_bmap(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ struct super_block *sb = inode->i_sb;
+ unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
+ int err;
+ sector_t bmap;
+ unsigned long mapped_blocks;
+
+ BUG_ON(create != 0);
+
+ err = fat_bmap(inode, iblock, &bmap, &mapped_blocks, create, true);
+ if (err)
+ return err;
+
+ if (bmap) {
+ map_bh(bh_result, sb, bmap);
+ max_blocks = min(mapped_blocks, max_blocks);
+ }
+
+ bh_result->b_size = max_blocks << sb->s_blocksize_bits;
+
+ return 0;
+}
+
static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
{
sector_t blocknr;
/* fat_get_cluster() assumes the requested blocknr isn't truncated. */
down_read(&MSDOS_I(mapping->host)->truncate_lock);
- blocknr = generic_block_bmap(mapping, block, fat_get_block);
+ blocknr = generic_block_bmap(mapping, block, fat_get_block_bmap);
up_read(&MSDOS_I(mapping->host)->truncate_lock);
return blocknr;
@@ -553,13 +584,43 @@ out:
EXPORT_SYMBOL_GPL(fat_build_inode);
+static int __fat_write_inode(struct inode *inode, int wait);
+
+static void fat_free_eofblocks(struct inode *inode)
+{
+ /* Release unwritten fallocated blocks on inode eviction. */
+ if ((inode->i_blocks << 9) >
+ round_up(MSDOS_I(inode)->mmu_private,
+ MSDOS_SB(inode->i_sb)->cluster_size)) {
+ int err;
+
+ fat_truncate_blocks(inode, MSDOS_I(inode)->mmu_private);
+ /* Fallocate results in updating the i_start/iogstart
+ * for the zero byte file. So, make it return to
+ * original state during evict and commit it to avoid
+ * any corruption on the next access to the cluster
+ * chain for the file.
+ */
+ err = __fat_write_inode(inode, inode_needs_sync(inode));
+ if (err) {
+ fat_msg(inode->i_sb, KERN_WARNING, "Failed to "
+ "update on disk inode for unused "
+ "fallocated blocks, inode could be "
+ "corrupted. Please run fsck");
+ }
+
+ }
+}
+
static void fat_evict_inode(struct inode *inode)
{
truncate_inode_pages_final(&inode->i_data);
if (!inode->i_nlink) {
inode->i_size = 0;
fat_truncate_blocks(inode, 0);
- }
+ } else
+ fat_free_eofblocks(inode);
+
invalidate_inode_buffers(inode);
clear_inode(inode);
fat_cache_inval_inode(inode);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 316adb968b65..30cf53435087 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -368,10 +368,25 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
lookup_nr = end - next;
/*
- * This pagevec_lookup() may return pages past 'end',
- * so we must check for page->index > end.
+ * When no more pages are found, take different action for
+ * hole punch and truncate.
+ *
+ * For hole punch, this indicates we have removed each page
+ * within the range and are done. Note that pages may have
+ * been faulted in after being removed in the hole punch case.
+ * This is OK as long as each page in the range was removed
+ * once.
+ *
+ * For truncate, we need to make sure all pages within the
+ * range are removed when exiting this routine. We could
+ * have raced with a fault that brought in a page after it
+ * was first removed. Check the range again until no pages
+ * are found.
*/
if (!pagevec_lookup(&pvec, mapping, next, lookup_nr)) {
+ if (!truncate_op)
+ break;
+
if (next == start)
break;
next = start;
@@ -382,19 +397,23 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
struct page *page = pvec.pages[i];
u32 hash;
+ /*
+ * The page (index) could be beyond end. This is
+ * only possible in the punch hole case as end is
+ * LLONG_MAX for truncate.
+ */
+ if (page->index >= end) {
+ next = end; /* we are done */
+ break;
+ }
+ next = page->index;
+
hash = hugetlb_fault_mutex_hash(h, current->mm,
&pseudo_vma,
mapping, next, 0);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
lock_page(page);
- if (page->index >= end) {
- unlock_page(page);
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- next = end; /* we are done */
- break;
- }
-
/*
* If page is mapped, it was faulted in after being
* unmapped. Do nothing in this race case. In the
@@ -423,15 +442,13 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
}
}
- if (page->index > next)
- next = page->index;
-
++next;
unlock_page(page);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
}
huge_pagevec_release(&pvec);
+ cond_resched();
}
if (truncate_op)
@@ -647,9 +664,6 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
i_size_write(inode, offset + len);
inode->i_ctime = CURRENT_TIME;
- spin_lock(&inode->i_lock);
- inode->i_private = NULL;
- spin_unlock(&inode->i_lock);
out:
mutex_unlock(&inode->i_mutex);
return error;
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 5c0c6b58157f..9aebffb40505 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -476,10 +476,7 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
}
unlock_page(page);
}
- if (PageDirty(page) || PageWriteback(page))
- *uptodate = true;
- else
- *uptodate = PageUptodate(page);
+ *uptodate = PageUptodate(page);
dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate);
return page;
}
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 86181d6526dc..c2cb51d4f6e9 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2516,21 +2516,6 @@ static int ocfs2_update_edge_lengths(handle_t *handle,
struct ocfs2_extent_block *eb;
u32 range;
- /*
- * In normal tree rotation process, we will never touch the
- * tree branch above subtree_index and ocfs2_extend_rotate_transaction
- * doesn't reserve the credits for them either.
- *
- * But we do have a special case here which will update the rightmost
- * records for all the bh in the path.
- * So we have to allocate extra credits and access them.
- */
- ret = ocfs2_extend_trans(handle, subtree_index);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
ret = ocfs2_journal_access_path(et->et_ci, handle, path);
if (ret) {
mlog_errno(ret);
@@ -2956,7 +2941,7 @@ static int __ocfs2_rotate_tree_left(handle_t *handle,
right_path->p_node[subtree_root].bh->b_blocknr,
right_path->p_tree_depth);
- ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
orig_credits, left_path);
if (ret) {
mlog_errno(ret);
@@ -3029,21 +3014,9 @@ static int ocfs2_remove_rightmost_path(handle_t *handle,
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
-
ret = ocfs2_et_sanity_check(et);
if (ret)
goto out;
- /*
- * There's two ways we handle this depending on
- * whether path is the only existing one.
- */
- ret = ocfs2_extend_rotate_transaction(handle, 0,
- handle->h_buffer_credits,
- path);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
ret = ocfs2_journal_access_path(et->et_ci, handle, path);
if (ret) {
@@ -3641,6 +3614,14 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
*/
if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
le16_to_cpu(el->l_next_free_rec) == 1) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ right_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
ret = ocfs2_remove_rightmost_path(handle, et,
right_path,
@@ -3679,6 +3660,14 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
/*
* The merge code will need to create an empty
* extent to take the place of the newly
@@ -3727,6 +3716,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
*/
BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
/* The merge left us with an empty extent, remove it. */
ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
if (ret) {
@@ -3748,6 +3746,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
goto out;
}
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
/*
* Error from this last rotate is not critical, so
@@ -3783,6 +3790,16 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
}
if (ctxt->c_split_covers_rec) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ ret = 0;
+ goto out;
+ }
+
/*
* The merge may have left an empty extent in
* our leaf. Try to rotate it away.
@@ -5342,6 +5359,15 @@ static int ocfs2_truncate_rec(handle_t *handle,
struct ocfs2_extent_block *eb;
if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
if (ret) {
mlog_errno(ret);
@@ -5928,16 +5954,6 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
ocfs2_journal_dirty(handle, tl_bh);
- /* TODO: Perhaps we can calculate the bulk of the
- * credits up front rather than extending like
- * this. */
- status = ocfs2_extend_trans(handle,
- OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
rec = tl->tl_recs[i];
start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
le32_to_cpu(rec.t_start));
@@ -5958,6 +5974,13 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
goto bail;
}
}
+
+ status = ocfs2_extend_trans(handle,
+ OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
i--;
}
@@ -6016,7 +6039,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
goto out_mutex;
}
- handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
+ handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
@@ -6079,7 +6102,7 @@ void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
if (cancel)
cancel_delayed_work(&osb->osb_truncate_log_wq);
- queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
+ queue_delayed_work(osb->ocfs2_wq, &osb->osb_truncate_log_wq,
OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
}
}
@@ -6254,7 +6277,7 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
if (tl_inode) {
cancel_delayed_work(&osb->osb_truncate_log_wq);
- flush_workqueue(ocfs2_wq);
+ flush_workqueue(osb->ocfs2_wq);
status = ocfs2_flush_truncate_log(osb);
if (status < 0)
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 7f604727f487..4bb992145385 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -499,153 +499,6 @@ bail:
return status;
}
-/*
- * TODO: Make this into a generic get_blocks function.
- *
- * From do_direct_io in direct-io.c:
- * "So what we do is to permit the ->get_blocks function to populate
- * bh.b_size with the size of IO which is permitted at this offset and
- * this i_blkbits."
- *
- * This function is called directly from get_more_blocks in direct-io.c.
- *
- * called like this: dio->get_blocks(dio->inode, fs_startblk,
- * fs_count, map_bh, dio->rw == WRITE);
- */
-static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
-{
- int ret;
- u32 cpos = 0;
- int alloc_locked = 0;
- u64 p_blkno, inode_blocks, contig_blocks;
- unsigned int ext_flags;
- unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
- unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
- unsigned long len = bh_result->b_size;
- unsigned int clusters_to_alloc = 0, contig_clusters = 0;
-
- cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock);
-
- /* This function won't even be called if the request isn't all
- * nicely aligned and of the right size, so there's no need
- * for us to check any of that. */
-
- inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
-
- down_read(&OCFS2_I(inode)->ip_alloc_sem);
-
- /* This figures out the size of the next contiguous block, and
- * our logical offset */
- ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
- &contig_blocks, &ext_flags);
- up_read(&OCFS2_I(inode)->ip_alloc_sem);
-
- if (ret) {
- mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
- (unsigned long long)iblock);
- ret = -EIO;
- goto bail;
- }
-
- /* We should already CoW the refcounted extent in case of create. */
- BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
-
- /* allocate blocks if no p_blkno is found, and create == 1 */
- if (!p_blkno && create) {
- ret = ocfs2_inode_lock(inode, NULL, 1);
- if (ret < 0) {
- mlog_errno(ret);
- goto bail;
- }
-
- alloc_locked = 1;
-
- down_write(&OCFS2_I(inode)->ip_alloc_sem);
-
- /* fill hole, allocate blocks can't be larger than the size
- * of the hole */
- clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
- contig_clusters = ocfs2_clusters_for_blocks(inode->i_sb,
- contig_blocks);
- if (clusters_to_alloc > contig_clusters)
- clusters_to_alloc = contig_clusters;
-
- /* allocate extent and insert them into the extent tree */
- ret = ocfs2_extend_allocation(inode, cpos,
- clusters_to_alloc, 0);
- if (ret < 0) {
- up_write(&OCFS2_I(inode)->ip_alloc_sem);
- mlog_errno(ret);
- goto bail;
- }
-
- ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
- &contig_blocks, &ext_flags);
- if (ret < 0) {
- up_write(&OCFS2_I(inode)->ip_alloc_sem);
- mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
- (unsigned long long)iblock);
- ret = -EIO;
- goto bail;
- }
- set_buffer_new(bh_result);
- up_write(&OCFS2_I(inode)->ip_alloc_sem);
- }
-
- /*
- * get_more_blocks() expects us to describe a hole by clearing
- * the mapped bit on bh_result().
- *
- * Consider an unwritten extent as a hole.
- */
- if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
- map_bh(bh_result, inode->i_sb, p_blkno);
- else
- clear_buffer_mapped(bh_result);
-
- /* make sure we don't map more than max_blocks blocks here as
- that's all the kernel will handle at this point. */
- if (max_blocks < contig_blocks)
- contig_blocks = max_blocks;
- bh_result->b_size = contig_blocks << blocksize_bits;
-bail:
- if (alloc_locked)
- ocfs2_inode_unlock(inode, 1);
- return ret;
-}
-
-/*
- * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
- * particularly interested in the aio/dio case. We use the rw_lock DLM lock
- * to protect io on one node from truncation on another.
- */
-static void ocfs2_dio_end_io(struct kiocb *iocb,
- loff_t offset,
- ssize_t bytes,
- void *private)
-{
- struct inode *inode = file_inode(iocb->ki_filp);
- int level;
-
- /* this io's submitter should not have unlocked this before we could */
- BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
-
- if (ocfs2_iocb_is_unaligned_aio(iocb)) {
- ocfs2_iocb_clear_unaligned_aio(iocb);
-
- mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
- }
-
- /* Let rw unlock to be done later to protect append direct io write */
- if (offset + bytes <= i_size_read(inode)) {
- ocfs2_iocb_clear_rw_locked(iocb);
-
- level = ocfs2_iocb_rw_locked_level(iocb);
- ocfs2_rw_unlock(inode, level);
- }
-}
-
static int ocfs2_releasepage(struct page *page, gfp_t wait)
{
if (!page_has_buffers(page))
@@ -653,362 +506,6 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait)
return try_to_free_buffers(page);
}
-static int ocfs2_is_overwrite(struct ocfs2_super *osb,
- struct inode *inode, loff_t offset)
-{
- int ret = 0;
- u32 v_cpos = 0;
- u32 p_cpos = 0;
- unsigned int num_clusters = 0;
- unsigned int ext_flags = 0;
-
- v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
- ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
- &num_clusters, &ext_flags);
- if (ret < 0) {
- mlog_errno(ret);
- return ret;
- }
-
- if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN))
- return 1;
-
- return 0;
-}
-
-static int ocfs2_direct_IO_zero_extend(struct ocfs2_super *osb,
- struct inode *inode, loff_t offset,
- u64 zero_len, int cluster_align)
-{
- u32 p_cpos = 0;
- u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
- unsigned int num_clusters = 0;
- unsigned int ext_flags = 0;
- int ret = 0;
-
- if (offset <= i_size_read(inode) || cluster_align)
- return 0;
-
- ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
- &ext_flags);
- if (ret < 0) {
- mlog_errno(ret);
- return ret;
- }
-
- if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
- u64 s = i_size_read(inode);
- sector_t sector = ((u64)p_cpos << (osb->s_clustersize_bits - 9)) +
- (do_div(s, osb->s_clustersize) >> 9);
-
- ret = blkdev_issue_zeroout(osb->sb->s_bdev, sector,
- zero_len >> 9, GFP_NOFS, false);
- if (ret < 0)
- mlog_errno(ret);
- }
-
- return ret;
-}
-
-static int ocfs2_direct_IO_extend_no_holes(struct ocfs2_super *osb,
- struct inode *inode, loff_t offset)
-{
- u64 zero_start, zero_len, total_zero_len;
- u32 p_cpos = 0, clusters_to_add;
- u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
- unsigned int num_clusters = 0;
- unsigned int ext_flags = 0;
- u32 size_div, offset_div;
- int ret = 0;
-
- {
- u64 o = offset;
- u64 s = i_size_read(inode);
-
- offset_div = do_div(o, osb->s_clustersize);
- size_div = do_div(s, osb->s_clustersize);
- }
-
- if (offset <= i_size_read(inode))
- return 0;
-
- clusters_to_add = ocfs2_bytes_to_clusters(inode->i_sb, offset) -
- ocfs2_bytes_to_clusters(inode->i_sb, i_size_read(inode));
- total_zero_len = offset - i_size_read(inode);
- if (clusters_to_add)
- total_zero_len -= offset_div;
-
- /* Allocate clusters to fill out holes, and this is only needed
- * when we add more than one clusters. Otherwise the cluster will
- * be allocated during direct IO */
- if (clusters_to_add > 1) {
- ret = ocfs2_extend_allocation(inode,
- OCFS2_I(inode)->ip_clusters,
- clusters_to_add - 1, 0);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
- }
-
- while (total_zero_len) {
- ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
- &ext_flags);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
-
- zero_start = ocfs2_clusters_to_bytes(osb->sb, p_cpos) +
- size_div;
- zero_len = ocfs2_clusters_to_bytes(osb->sb, num_clusters) -
- size_div;
- zero_len = min(total_zero_len, zero_len);
-
- if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
- ret = blkdev_issue_zeroout(osb->sb->s_bdev,
- zero_start >> 9, zero_len >> 9,
- GFP_NOFS, false);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
- }
-
- total_zero_len -= zero_len;
- v_cpos += ocfs2_bytes_to_clusters(osb->sb, zero_len + size_div);
-
- /* Only at first iteration can be cluster not aligned.
- * So set size_div to 0 for the rest */
- size_div = 0;
- }
-
-out:
- return ret;
-}
-
-static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
- struct iov_iter *iter,
- loff_t offset)
-{
- ssize_t ret = 0;
- ssize_t written = 0;
- bool orphaned = false;
- int is_overwrite = 0;
- struct file *file = iocb->ki_filp;
- struct inode *inode = file_inode(file)->i_mapping->host;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct buffer_head *di_bh = NULL;
- size_t count = iter->count;
- journal_t *journal = osb->journal->j_journal;
- u64 zero_len_head, zero_len_tail;
- int cluster_align_head, cluster_align_tail;
- loff_t final_size = offset + count;
- int append_write = offset >= i_size_read(inode) ? 1 : 0;
- unsigned int num_clusters = 0;
- unsigned int ext_flags = 0;
-
- {
- u64 o = offset;
- u64 s = i_size_read(inode);
-
- zero_len_head = do_div(o, 1 << osb->s_clustersize_bits);
- cluster_align_head = !zero_len_head;
-
- zero_len_tail = osb->s_clustersize -
- do_div(s, osb->s_clustersize);
- if ((offset - i_size_read(inode)) < zero_len_tail)
- zero_len_tail = offset - i_size_read(inode);
- cluster_align_tail = !zero_len_tail;
- }
-
- /*
- * when final_size > inode->i_size, inode->i_size will be
- * updated after direct write, so add the inode to orphan
- * dir first.
- */
- if (final_size > i_size_read(inode)) {
- ret = ocfs2_add_inode_to_orphan(osb, inode);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
- orphaned = true;
- }
-
- if (append_write) {
- ret = ocfs2_inode_lock(inode, NULL, 1);
- if (ret < 0) {
- mlog_errno(ret);
- goto clean_orphan;
- }
-
- /* zeroing out the previously allocated cluster tail
- * that but not zeroed */
- if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
- down_read(&OCFS2_I(inode)->ip_alloc_sem);
- ret = ocfs2_direct_IO_zero_extend(osb, inode, offset,
- zero_len_tail, cluster_align_tail);
- up_read(&OCFS2_I(inode)->ip_alloc_sem);
- } else {
- down_write(&OCFS2_I(inode)->ip_alloc_sem);
- ret = ocfs2_direct_IO_extend_no_holes(osb, inode,
- offset);
- up_write(&OCFS2_I(inode)->ip_alloc_sem);
- }
- if (ret < 0) {
- mlog_errno(ret);
- ocfs2_inode_unlock(inode, 1);
- goto clean_orphan;
- }
-
- is_overwrite = ocfs2_is_overwrite(osb, inode, offset);
- if (is_overwrite < 0) {
- mlog_errno(is_overwrite);
- ret = is_overwrite;
- ocfs2_inode_unlock(inode, 1);
- goto clean_orphan;
- }
-
- ocfs2_inode_unlock(inode, 1);
- }
-
- written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
- offset, ocfs2_direct_IO_get_blocks,
- ocfs2_dio_end_io, NULL, 0);
- /* overwrite aio may return -EIOCBQUEUED, and it is not an error */
- if ((written < 0) && (written != -EIOCBQUEUED)) {
- loff_t i_size = i_size_read(inode);
-
- if (offset + count > i_size) {
- ret = ocfs2_inode_lock(inode, &di_bh, 1);
- if (ret < 0) {
- mlog_errno(ret);
- goto clean_orphan;
- }
-
- if (i_size == i_size_read(inode)) {
- ret = ocfs2_truncate_file(inode, di_bh,
- i_size);
- if (ret < 0) {
- if (ret != -ENOSPC)
- mlog_errno(ret);
-
- ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
- di_bh = NULL;
- goto clean_orphan;
- }
- }
-
- ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
- di_bh = NULL;
-
- ret = jbd2_journal_force_commit(journal);
- if (ret < 0)
- mlog_errno(ret);
- }
- } else if (written > 0 && append_write && !is_overwrite &&
- !cluster_align_head) {
- /* zeroing out the allocated cluster head */
- u32 p_cpos = 0;
- u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
-
- ret = ocfs2_inode_lock(inode, NULL, 0);
- if (ret < 0) {
- mlog_errno(ret);
- goto clean_orphan;
- }
-
- ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
- &num_clusters, &ext_flags);
- if (ret < 0) {
- mlog_errno(ret);
- ocfs2_inode_unlock(inode, 0);
- goto clean_orphan;
- }
-
- BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN));
-
- ret = blkdev_issue_zeroout(osb->sb->s_bdev,
- (u64)p_cpos << (osb->s_clustersize_bits - 9),
- zero_len_head >> 9, GFP_NOFS, false);
- if (ret < 0)
- mlog_errno(ret);
-
- ocfs2_inode_unlock(inode, 0);
- }
-
-clean_orphan:
- if (orphaned) {
- int tmp_ret;
- int update_isize = written > 0 ? 1 : 0;
- loff_t end = update_isize ? offset + written : 0;
-
- tmp_ret = ocfs2_inode_lock(inode, &di_bh, 1);
- if (tmp_ret < 0) {
- ret = tmp_ret;
- mlog_errno(ret);
- goto out;
- }
-
- tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
- update_isize, end);
- if (tmp_ret < 0) {
- ret = tmp_ret;
- mlog_errno(ret);
- brelse(di_bh);
- goto out;
- }
-
- ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
-
- tmp_ret = jbd2_journal_force_commit(journal);
- if (tmp_ret < 0) {
- ret = tmp_ret;
- mlog_errno(tmp_ret);
- }
- }
-
-out:
- if (ret >= 0)
- ret = written;
- return ret;
-}
-
-static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- loff_t offset)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file_inode(file)->i_mapping->host;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- int full_coherency = !(osb->s_mount_opt &
- OCFS2_MOUNT_COHERENCY_BUFFERED);
-
- /*
- * Fallback to buffered I/O if we see an inode without
- * extents.
- */
- if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
- return 0;
-
- /* Fallback to buffered I/O if we are appending and
- * concurrent O_DIRECT writes are allowed.
- */
- if (i_size_read(inode) <= offset && !full_coherency)
- return 0;
-
- if (iov_iter_rw(iter) == READ)
- return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
- iter, offset,
- ocfs2_direct_IO_get_blocks,
- ocfs2_dio_end_io, NULL, 0);
- else
- return ocfs2_direct_IO_write(iocb, iter, offset);
-}
-
static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
u32 cpos,
unsigned int *start,
@@ -1195,6 +692,13 @@ next_bh:
#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
+struct ocfs2_unwritten_extent {
+ struct list_head ue_node;
+ struct list_head ue_ip_node;
+ u32 ue_cpos;
+ u32 ue_phys;
+};
+
/*
* Describe the state of a single cluster to be written to.
*/
@@ -1206,7 +710,7 @@ struct ocfs2_write_cluster_desc {
* filled.
*/
unsigned c_new;
- unsigned c_unwritten;
+ unsigned c_clear_unwritten;
unsigned c_needs_zero;
};
@@ -1218,6 +722,9 @@ struct ocfs2_write_ctxt {
/* First cluster allocated in a nonsparse extend */
u32 w_first_new_cpos;
+ /* Type of caller. Must be one of buffer, mmap, direct. */
+ ocfs2_write_type_t w_type;
+
struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
/*
@@ -1266,6 +773,8 @@ struct ocfs2_write_ctxt {
struct buffer_head *w_di_bh;
struct ocfs2_cached_dealloc_ctxt w_dealloc;
+
+ struct list_head w_unwritten_list;
};
void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
@@ -1304,8 +813,25 @@ static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc)
ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
}
-static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
+static void ocfs2_free_unwritten_list(struct inode *inode,
+ struct list_head *head)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_unwritten_extent *ue = NULL, *tmp = NULL;
+
+ list_for_each_entry_safe(ue, tmp, head, ue_node) {
+ list_del(&ue->ue_node);
+ spin_lock(&oi->ip_lock);
+ list_del(&ue->ue_ip_node);
+ spin_unlock(&oi->ip_lock);
+ kfree(ue);
+ }
+}
+
+static void ocfs2_free_write_ctxt(struct inode *inode,
+ struct ocfs2_write_ctxt *wc)
{
+ ocfs2_free_unwritten_list(inode, &wc->w_unwritten_list);
ocfs2_unlock_pages(wc);
brelse(wc->w_di_bh);
kfree(wc);
@@ -1313,7 +839,8 @@ static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
struct ocfs2_super *osb, loff_t pos,
- unsigned len, struct buffer_head *di_bh)
+ unsigned len, ocfs2_write_type_t type,
+ struct buffer_head *di_bh)
{
u32 cend;
struct ocfs2_write_ctxt *wc;
@@ -1328,6 +855,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
wc->w_clen = cend - wc->w_cpos + 1;
get_bh(di_bh);
wc->w_di_bh = di_bh;
+ wc->w_type = type;
if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
wc->w_large_pages = 1;
@@ -1335,6 +863,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
wc->w_large_pages = 0;
ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
+ INIT_LIST_HEAD(&wc->w_unwritten_list);
*wcp = wc;
@@ -1395,12 +924,13 @@ static void ocfs2_write_failure(struct inode *inode,
to = user_pos + user_len;
struct page *tmppage;
- ocfs2_zero_new_buffers(wc->w_target_page, from, to);
+ if (wc->w_target_page)
+ ocfs2_zero_new_buffers(wc->w_target_page, from, to);
for(i = 0; i < wc->w_num_pages; i++) {
tmppage = wc->w_pages[i];
- if (page_has_buffers(tmppage)) {
+ if (tmppage && page_has_buffers(tmppage)) {
if (ocfs2_should_order_data(inode))
ocfs2_jbd2_file_inode(wc->w_handle, inode);
@@ -1530,11 +1060,13 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
wc->w_num_pages = 1;
start = target_index;
}
+ end_index = (user_pos + user_len - 1) >> PAGE_CACHE_SHIFT;
for(i = 0; i < wc->w_num_pages; i++) {
index = start + i;
- if (index == target_index && mmap_page) {
+ if (index >= target_index && index <= end_index &&
+ wc->w_type == OCFS2_WRITE_MMAP) {
/*
* ocfs2_pagemkwrite() is a little different
* and wants us to directly use the page
@@ -1553,6 +1085,11 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
page_cache_get(mmap_page);
wc->w_pages[i] = mmap_page;
wc->w_target_locked = true;
+ } else if (index >= target_index && index <= end_index &&
+ wc->w_type == OCFS2_WRITE_DIRECT) {
+ /* Direct write has no mapping page. */
+ wc->w_pages[i] = NULL;
+ continue;
} else {
wc->w_pages[i] = find_or_create_page(mapping, index,
GFP_NOFS);
@@ -1577,19 +1114,20 @@ out:
* Prepare a single cluster for write one cluster into the file.
*/
static int ocfs2_write_cluster(struct address_space *mapping,
- u32 phys, unsigned int unwritten,
+ u32 *phys, unsigned int new,
+ unsigned int clear_unwritten,
unsigned int should_zero,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_write_ctxt *wc, u32 cpos,
loff_t user_pos, unsigned user_len)
{
- int ret, i, new;
- u64 v_blkno, p_blkno;
+ int ret, i;
+ u64 p_blkno;
struct inode *inode = mapping->host;
struct ocfs2_extent_tree et;
+ int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
- new = phys == 0 ? 1 : 0;
if (new) {
u32 tmp_pos;
@@ -1599,9 +1137,9 @@ static int ocfs2_write_cluster(struct address_space *mapping,
*/
tmp_pos = cpos;
ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode,
- &tmp_pos, 1, 0, wc->w_di_bh,
- wc->w_handle, data_ac,
- meta_ac, NULL);
+ &tmp_pos, 1, !clear_unwritten,
+ wc->w_di_bh, wc->w_handle,
+ data_ac, meta_ac, NULL);
/*
* This shouldn't happen because we must have already
* calculated the correct meta data allocation required. The
@@ -1618,11 +1156,11 @@ static int ocfs2_write_cluster(struct address_space *mapping,
mlog_errno(ret);
goto out;
}
- } else if (unwritten) {
+ } else if (clear_unwritten) {
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
wc->w_di_bh);
ret = ocfs2_mark_extent_written(inode, &et,
- wc->w_handle, cpos, 1, phys,
+ wc->w_handle, cpos, 1, *phys,
meta_ac, &wc->w_dealloc);
if (ret < 0) {
mlog_errno(ret);
@@ -1630,30 +1168,33 @@ static int ocfs2_write_cluster(struct address_space *mapping,
}
}
- if (should_zero)
- v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
- else
- v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
-
/*
* The only reason this should fail is due to an inability to
* find the extent added.
*/
- ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
- NULL);
+ ret = ocfs2_get_clusters(inode, cpos, phys, NULL, NULL);
if (ret < 0) {
mlog(ML_ERROR, "Get physical blkno failed for inode %llu, "
- "at logical block %llu",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (unsigned long long)v_blkno);
+ "at logical cluster %u",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
goto out;
}
- BUG_ON(p_blkno == 0);
+ BUG_ON(*phys == 0);
+
+ p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *phys);
+ if (!should_zero)
+ p_blkno += (user_pos >> inode->i_sb->s_blocksize_bits) & (u64)(bpc - 1);
for(i = 0; i < wc->w_num_pages; i++) {
int tmpret;
+ /* This is the direct io target page. */
+ if (wc->w_pages[i] == NULL) {
+ p_blkno++;
+ continue;
+ }
+
tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
wc->w_pages[i], cpos,
user_pos, user_len,
@@ -1700,8 +1241,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
if ((cluster_off + local_len) > osb->s_clustersize)
local_len = osb->s_clustersize - cluster_off;
- ret = ocfs2_write_cluster(mapping, desc->c_phys,
- desc->c_unwritten,
+ ret = ocfs2_write_cluster(mapping, &desc->c_phys,
+ desc->c_new,
+ desc->c_clear_unwritten,
desc->c_needs_zero,
data_ac, meta_ac,
wc, desc->c_cpos, pos, local_len);
@@ -1772,6 +1314,66 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
}
/*
+ * Check if this extent is marked UNWRITTEN by direct io. If so, we need not to
+ * do the zero work. And should not to clear UNWRITTEN since it will be cleared
+ * by the direct io procedure.
+ * If this is a new extent that allocated by direct io, we should mark it in
+ * the ip_unwritten_list.
+ */
+static int ocfs2_unwritten_check(struct inode *inode,
+ struct ocfs2_write_ctxt *wc,
+ struct ocfs2_write_cluster_desc *desc)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_unwritten_extent *ue = NULL, *new = NULL;
+ int ret = 0;
+
+ if (!desc->c_needs_zero)
+ return 0;
+
+retry:
+ spin_lock(&oi->ip_lock);
+ /* Needs not to zero no metter buffer or direct. The one who is zero
+ * the cluster is doing zero. And he will clear unwritten after all
+ * cluster io finished. */
+ list_for_each_entry(ue, &oi->ip_unwritten_list, ue_ip_node) {
+ if (desc->c_cpos == ue->ue_cpos) {
+ BUG_ON(desc->c_new);
+ desc->c_needs_zero = 0;
+ desc->c_clear_unwritten = 0;
+ goto unlock;
+ }
+ }
+
+ if (wc->w_type != OCFS2_WRITE_DIRECT)
+ goto unlock;
+
+ if (new == NULL) {
+ spin_unlock(&oi->ip_lock);
+ new = kmalloc(sizeof(struct ocfs2_unwritten_extent),
+ GFP_NOFS);
+ if (new == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ goto retry;
+ }
+ /* This direct write will doing zero. */
+ new->ue_cpos = desc->c_cpos;
+ new->ue_phys = desc->c_phys;
+ desc->c_clear_unwritten = 0;
+ list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
+ list_add_tail(&new->ue_node, &wc->w_unwritten_list);
+ new = NULL;
+unlock:
+ spin_unlock(&oi->ip_lock);
+out:
+ if (new)
+ kfree(new);
+ return ret;
+}
+
+/*
* Populate each single-cluster write descriptor in the write context
* with information about the i/o to be done.
*
@@ -1846,14 +1448,21 @@ static int ocfs2_populate_write_desc(struct inode *inode,
if (phys == 0) {
desc->c_new = 1;
desc->c_needs_zero = 1;
+ desc->c_clear_unwritten = 1;
*clusters_to_alloc = *clusters_to_alloc + 1;
}
if (ext_flags & OCFS2_EXT_UNWRITTEN) {
- desc->c_unwritten = 1;
+ desc->c_clear_unwritten = 1;
desc->c_needs_zero = 1;
}
+ ret = ocfs2_unwritten_check(inode, wc, desc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
num_clusters--;
}
@@ -2016,8 +1625,10 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode,
if (ret)
mlog_errno(ret);
- wc->w_first_new_cpos =
- ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
+ /* There is no wc if this is call from direct. */
+ if (wc)
+ wc->w_first_new_cpos =
+ ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
return ret;
}
@@ -2071,9 +1682,8 @@ out:
return ret;
}
-int ocfs2_write_begin_nolock(struct file *filp,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+int ocfs2_write_begin_nolock(struct address_space *mapping,
+ loff_t pos, unsigned len, ocfs2_write_type_t type,
struct page **pagep, void **fsdata,
struct buffer_head *di_bh, struct page *mmap_page)
{
@@ -2090,7 +1700,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
int try_free = 1, ret1;
try_again:
- ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
+ ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, type, di_bh);
if (ret) {
mlog_errno(ret);
return ret;
@@ -2109,14 +1719,17 @@ try_again:
}
}
- if (ocfs2_sparse_alloc(osb))
- ret = ocfs2_zero_tail(inode, di_bh, pos);
- else
- ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len,
- wc);
- if (ret) {
- mlog_errno(ret);
- goto out;
+ /* Direct io change i_size late, should not zero tail here. */
+ if (type != OCFS2_WRITE_DIRECT) {
+ if (ocfs2_sparse_alloc(osb))
+ ret = ocfs2_zero_tail(inode, di_bh, pos);
+ else
+ ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
+ len, wc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
}
ret = ocfs2_check_range_for_refcount(inode, pos, len);
@@ -2147,7 +1760,7 @@ try_again:
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(long long)i_size_read(inode),
le32_to_cpu(di->i_clusters),
- pos, len, flags, mmap_page,
+ pos, len, type, mmap_page,
clusters_to_alloc, extents_to_split);
/*
@@ -2177,17 +1790,17 @@ try_again:
credits = ocfs2_calc_extend_credits(inode->i_sb,
&di->id2.i_list);
-
- }
+ } else if (type == OCFS2_WRITE_DIRECT)
+ /* direct write needs not to start trans if no extents alloc. */
+ goto success;
/*
* We have to zero sparse allocated clusters, unwritten extent clusters,
* and non-sparse clusters we just extended. For non-sparse writes,
* we know zeros will only be needed in the first and/or last cluster.
*/
- if (clusters_to_alloc || extents_to_split ||
- (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
- wc->w_desc[wc->w_clen - 1].c_needs_zero)))
+ if (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
+ wc->w_desc[wc->w_clen - 1].c_needs_zero))
cluster_of_pages = 1;
else
cluster_of_pages = 0;
@@ -2254,7 +1867,8 @@ try_again:
ocfs2_free_alloc_context(meta_ac);
success:
- *pagep = wc->w_target_page;
+ if (pagep)
+ *pagep = wc->w_target_page;
*fsdata = wc;
return 0;
out_quota:
@@ -2265,7 +1879,7 @@ out_commit:
ocfs2_commit_trans(osb, handle);
out:
- ocfs2_free_write_ctxt(wc);
+ ocfs2_free_write_ctxt(inode, wc);
if (data_ac) {
ocfs2_free_alloc_context(data_ac);
@@ -2317,8 +1931,8 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
*/
down_write(&OCFS2_I(inode)->ip_alloc_sem);
- ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep,
- fsdata, di_bh, NULL);
+ ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_BUFFER,
+ pagep, fsdata, di_bh, NULL);
if (ret) {
mlog_errno(ret);
goto out_fail;
@@ -2375,12 +1989,16 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
handle_t *handle = wc->w_handle;
struct page *tmppage;
- ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret) {
- copied = ret;
- mlog_errno(ret);
- goto out;
+ BUG_ON(!list_empty(&wc->w_unwritten_list));
+
+ if (handle) {
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
+ wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ copied = ret;
+ mlog_errno(ret);
+ goto out;
+ }
}
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
@@ -2388,18 +2006,23 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
goto out_write_size;
}
- if (unlikely(copied < len)) {
+ if (unlikely(copied < len) && wc->w_target_page) {
if (!PageUptodate(wc->w_target_page))
copied = 0;
ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
start+len);
}
- flush_dcache_page(wc->w_target_page);
+ if (wc->w_target_page)
+ flush_dcache_page(wc->w_target_page);
for(i = 0; i < wc->w_num_pages; i++) {
tmppage = wc->w_pages[i];
+ /* This is the direct io target page. */
+ if (tmppage == NULL)
+ continue;
+
if (tmppage == wc->w_target_page) {
from = wc->w_target_from;
to = wc->w_target_to;
@@ -2418,25 +2041,29 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
}
if (page_has_buffers(tmppage)) {
- if (ocfs2_should_order_data(inode))
- ocfs2_jbd2_file_inode(wc->w_handle, inode);
+ if (handle && ocfs2_should_order_data(inode))
+ ocfs2_jbd2_file_inode(handle, inode);
block_commit_write(tmppage, from, to);
}
}
out_write_size:
- pos += copied;
- if (pos > i_size_read(inode)) {
- i_size_write(inode, pos);
- mark_inode_dirty(inode);
- }
- inode->i_blocks = ocfs2_inode_sector_count(inode);
- di->i_size = cpu_to_le64((u64)i_size_read(inode));
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
- di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
- ocfs2_update_inode_fsync_trans(handle, inode, 1);
- ocfs2_journal_dirty(handle, wc->w_di_bh);
+ /* Direct io do not update i_size here. */
+ if (wc->w_type != OCFS2_WRITE_DIRECT) {
+ pos += copied;
+ if (pos > i_size_read(inode)) {
+ i_size_write(inode, pos);
+ mark_inode_dirty(inode);
+ }
+ inode->i_blocks = ocfs2_inode_sector_count(inode);
+ di->i_size = cpu_to_le64((u64)i_size_read(inode));
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+ di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
+ }
+ if (handle)
+ ocfs2_journal_dirty(handle, wc->w_di_bh);
out:
/* unlock pages before dealloc since it needs acquiring j_trans_barrier
@@ -2446,7 +2073,8 @@ out:
*/
ocfs2_unlock_pages(wc);
- ocfs2_commit_trans(osb, handle);
+ if (handle)
+ ocfs2_commit_trans(osb, handle);
ocfs2_run_deallocs(osb, &wc->w_dealloc);
@@ -2471,6 +2099,340 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
return ret;
}
+struct ocfs2_dio_write_ctxt {
+ struct list_head dw_zero_list;
+ unsigned dw_zero_count;
+ int dw_orphaned;
+ pid_t dw_writer_pid;
+};
+
+static struct ocfs2_dio_write_ctxt *
+ocfs2_dio_alloc_write_ctx(struct buffer_head *bh, int *alloc)
+{
+ struct ocfs2_dio_write_ctxt *dwc = NULL;
+
+ if (bh->b_private)
+ return bh->b_private;
+
+ dwc = kmalloc(sizeof(struct ocfs2_dio_write_ctxt), GFP_NOFS);
+ if (dwc == NULL)
+ return NULL;
+ INIT_LIST_HEAD(&dwc->dw_zero_list);
+ dwc->dw_zero_count = 0;
+ dwc->dw_orphaned = 0;
+ dwc->dw_writer_pid = task_pid_nr(current);
+ bh->b_private = dwc;
+ *alloc = 1;
+
+ return dwc;
+}
+
+static void ocfs2_dio_free_write_ctx(struct inode *inode,
+ struct ocfs2_dio_write_ctxt *dwc)
+{
+ ocfs2_free_unwritten_list(inode, &dwc->dw_zero_list);
+ kfree(dwc);
+}
+
+/*
+ * TODO: Make this into a generic get_blocks function.
+ *
+ * From do_direct_io in direct-io.c:
+ * "So what we do is to permit the ->get_blocks function to populate
+ * bh.b_size with the size of IO which is permitted at this offset and
+ * this i_blkbits."
+ *
+ * This function is called directly from get_more_blocks in direct-io.c.
+ *
+ * called like this: dio->get_blocks(dio->inode, fs_startblk,
+ * fs_count, map_bh, dio->rw == WRITE);
+ */
+static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_write_ctxt *wc;
+ struct ocfs2_write_cluster_desc *desc = NULL;
+ struct ocfs2_dio_write_ctxt *dwc = NULL;
+ struct buffer_head *di_bh = NULL;
+ u64 p_blkno;
+ loff_t pos = iblock << inode->i_sb->s_blocksize_bits;
+ unsigned len, total_len = bh_result->b_size;
+ int ret = 0, first_get_block = 0;
+
+ len = osb->s_clustersize - (pos & (osb->s_clustersize - 1));
+ len = min(total_len, len);
+
+ mlog(0, "get block of %lu at %llu:%u req %u\n",
+ inode->i_ino, pos, len, total_len);
+
+ /* This is the fast path for re-write. */
+ ret = ocfs2_get_block(inode, iblock, bh_result, create);
+
+ if (buffer_mapped(bh_result) &&
+ !buffer_new(bh_result) &&
+ ret == 0)
+ goto out;
+
+ /* Clear state set by ocfs2_get_block. */
+ bh_result->b_state = 0;
+
+ dwc = ocfs2_dio_alloc_write_ctx(bh_result, &first_get_block);
+ if (unlikely(dwc == NULL)) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (ocfs2_clusters_for_bytes(inode->i_sb, pos + total_len) >
+ ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)) &&
+ !dwc->dw_orphaned) {
+ /*
+ * when we are going to alloc extents beyond file size, add the
+ * inode to orphan dir, so we can recall those spaces when
+ * system crashed during write.
+ */
+ ret = ocfs2_add_inode_to_orphan(osb, inode);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ dwc->dw_orphaned = 1;
+ }
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (first_get_block) {
+ if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+ ret = ocfs2_zero_tail(inode, di_bh, pos);
+ else
+ ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
+ total_len, NULL);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto unlock;
+ }
+ }
+
+ ret = ocfs2_write_begin_nolock(inode->i_mapping, pos, len,
+ OCFS2_WRITE_DIRECT, NULL,
+ (void **)&wc, di_bh, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto unlock;
+ }
+
+ desc = &wc->w_desc[0];
+
+ p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, desc->c_phys);
+ BUG_ON(p_blkno == 0);
+ p_blkno += iblock & (u64)(ocfs2_clusters_to_blocks(inode->i_sb, 1) - 1);
+
+ map_bh(bh_result, inode->i_sb, p_blkno);
+ bh_result->b_size = len;
+ if (desc->c_needs_zero)
+ set_buffer_new(bh_result);
+
+ /* May sleep in end_io. It should not happen in a irq context. So defer
+ * it to dio work queue. */
+ set_buffer_defer_completion(bh_result);
+
+ if (!list_empty(&wc->w_unwritten_list)) {
+ struct ocfs2_unwritten_extent *ue = NULL;
+
+ ue = list_first_entry(&wc->w_unwritten_list,
+ struct ocfs2_unwritten_extent,
+ ue_node);
+ BUG_ON(ue->ue_cpos != desc->c_cpos);
+ /* The physical address may be 0, fill it. */
+ ue->ue_phys = desc->c_phys;
+
+ list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list);
+ dwc->dw_zero_count++;
+ }
+
+ ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, NULL, wc);
+ BUG_ON(ret != len);
+ ret = 0;
+unlock:
+ ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+out:
+ if (ret < 0)
+ ret = -EIO;
+ return ret;
+}
+
+static void ocfs2_dio_end_io_write(struct inode *inode,
+ struct ocfs2_dio_write_ctxt *dwc,
+ loff_t offset,
+ ssize_t bytes)
+{
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+ struct ocfs2_extent_tree et;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_unwritten_extent *ue = NULL;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_dinode *di;
+ struct ocfs2_alloc_context *data_ac = NULL;
+ struct ocfs2_alloc_context *meta_ac = NULL;
+ handle_t *handle = NULL;
+ loff_t end = offset + bytes;
+ int ret = 0, credits = 0, locked = 0;
+
+ ocfs2_init_dealloc_ctxt(&dealloc);
+
+ /* We do clear unwritten, delete orphan, change i_size here. If neither
+ * of these happen, we can skip all this. */
+ if (list_empty(&dwc->dw_zero_list) &&
+ end <= i_size_read(inode) &&
+ !dwc->dw_orphaned)
+ goto out;
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* ocfs2_file_write_iter will get i_mutex, so we need not lock if we
+ * are in that context. */
+ if (dwc->dw_writer_pid != task_pid_nr(current)) {
+ mutex_lock(&inode->i_mutex);
+ locked = 1;
+ }
+
+ /* Delete orphan before acquire i_mutex. */
+ if (dwc->dw_orphaned) {
+ BUG_ON(dwc->dw_writer_pid != task_pid_nr(current));
+
+ end = end > i_size_read(inode) ? end : 0;
+
+ ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
+ !!end, end);
+ if (ret < 0)
+ mlog_errno(ret);
+ }
+
+ di = (struct ocfs2_dinode *)di_bh;
+
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
+
+ ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
+ &data_ac, &meta_ac);
+
+ credits = ocfs2_calc_extend_credits(inode->i_sb, &di->id2.i_list);
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto unlock;
+ }
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto commit;
+ }
+
+ list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) {
+ ret = ocfs2_mark_extent_written(inode, &et, handle,
+ ue->ue_cpos, 1,
+ ue->ue_phys,
+ meta_ac, &dealloc);
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+ }
+
+ if (end > i_size_read(inode)) {
+ ret = ocfs2_set_inode_size(handle, inode, di_bh, end);
+ if (ret < 0)
+ mlog_errno(ret);
+ }
+commit:
+ ocfs2_commit_trans(osb, handle);
+unlock:
+ ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+out:
+ ocfs2_run_deallocs(osb, &dealloc);
+ if (locked)
+ mutex_unlock(&inode->i_mutex);
+ ocfs2_dio_free_write_ctx(inode, dwc);
+ if (data_ac)
+ ocfs2_free_alloc_context(data_ac);
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+}
+
+/*
+ * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
+ * particularly interested in the aio/dio case. We use the rw_lock DLM lock
+ * to protect io on one node from truncation on another.
+ */
+static void ocfs2_dio_end_io(struct kiocb *iocb,
+ loff_t offset,
+ ssize_t bytes,
+ void *private)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ int level;
+
+ /* this io's submitter should not have unlocked this before we could */
+ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
+
+ if (ocfs2_iocb_is_unaligned_aio(iocb)) {
+ ocfs2_iocb_clear_unaligned_aio(iocb);
+
+ mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
+ }
+
+ if (private)
+ ocfs2_dio_end_io_write(inode, private, offset, bytes);
+
+ ocfs2_iocb_clear_rw_locked(iocb);
+
+ level = ocfs2_iocb_rw_locked_level(iocb);
+ ocfs2_rw_unlock(inode, level);
+}
+
+static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t offset)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file)->i_mapping->host;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ loff_t end = offset + iter->count;
+ get_block_t *get_block;
+
+ /*
+ * Fallback to buffered I/O if we see an inode without
+ * extents.
+ */
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ return 0;
+
+ /* Fallback to buffered I/O if we do not support append dio. */
+ if (end > i_size_read(inode) && !ocfs2_supports_append_dio(osb))
+ return 0;
+
+ if (iov_iter_rw(iter) == READ)
+ get_block = ocfs2_get_block;
+ else
+ get_block = ocfs2_dio_get_block;
+
+ return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
+ iter, offset, get_block,
+ ocfs2_dio_end_io, NULL, 0);
+}
+
const struct address_space_operations ocfs2_aops = {
.readpage = ocfs2_readpage,
.readpages = ocfs2_readpages,
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 24e496d6bdcd..d06b80f58f83 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -47,9 +47,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
-int ocfs2_write_begin_nolock(struct file *filp,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+typedef enum {
+ OCFS2_WRITE_BUFFER = 0,
+ OCFS2_WRITE_DIRECT,
+ OCFS2_WRITE_MMAP,
+} ocfs2_write_type_t;
+
+int ocfs2_write_begin_nolock(struct address_space *mapping,
+ loff_t pos, unsigned len, ocfs2_write_type_t type,
struct page **pagep, void **fsdata,
struct buffer_head *di_bh, struct page *mmap_page);
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index e36d63ff1783..f90931335c6b 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -262,6 +262,7 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
struct dlm_lock *lock, int flags, int type)
{
enum dlm_status status;
+ u8 old_owner = res->owner;
mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type,
lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS);
@@ -287,6 +288,19 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
status = DLM_DENIED;
goto bail;
}
+
+ if (lock->ml.type == type && lock->ml.convert_type == LKM_IVMODE) {
+ mlog(0, "last convert request returned DLM_RECOVERING, but "
+ "owner has already queued and sent ast to me. res %.*s, "
+ "(cookie=%u:%llu, type=%d, conv=%d)\n",
+ res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+ lock->ml.type, lock->ml.convert_type);
+ status = DLM_NORMAL;
+ goto bail;
+ }
+
res->state |= DLM_LOCK_RES_IN_PROGRESS;
/* move lock to local convert queue */
/* do not alter lock refcount. switching lists. */
@@ -316,11 +330,19 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
spin_lock(&res->spinlock);
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
lock->convert_pending = 0;
- /* if it failed, move it back to granted queue */
+ /* if it failed, move it back to granted queue.
+ * if master returns DLM_NORMAL and then down before sending ast,
+ * it may have already been moved to granted queue, reset to
+ * DLM_RECOVERING and retry convert */
if (status != DLM_NORMAL) {
if (status != DLM_NOTQUEUED)
dlm_error(status);
dlm_revert_pending_convert(res, lock);
+ } else if ((res->state & DLM_LOCK_RES_RECOVERING) ||
+ (old_owner != res->owner)) {
+ mlog(0, "res %.*s is in recovering or has been recovered.\n",
+ res->lockname.len, res->lockname.name);
+ status = DLM_RECOVERING;
}
bail:
spin_unlock(&res->spinlock);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 9e4f862d20fe..8caf88160942 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2064,7 +2064,6 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
dlm_lock_get(lock);
if (lock->convert_pending) {
/* move converting lock back to granted */
- BUG_ON(i != DLM_CONVERTING_LIST);
mlog(0, "node died with convert pending "
"on %.*s. move back to granted list.\n",
res->lockname.len, res->lockname.name);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 0e5b4515f92e..05346fb8d5fd 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1373,44 +1373,6 @@ out:
return ret;
}
-/*
- * Will look for holes and unwritten extents in the range starting at
- * pos for count bytes (inclusive).
- */
-static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
- size_t count)
-{
- int ret = 0;
- unsigned int extent_flags;
- u32 cpos, clusters, extent_len, phys_cpos;
- struct super_block *sb = inode->i_sb;
-
- cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
- clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
-
- while (clusters) {
- ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
- &extent_flags);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
-
- if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
- ret = 1;
- break;
- }
-
- if (extent_len > clusters)
- extent_len = clusters;
-
- clusters -= extent_len;
- cpos += extent_len;
- }
-out:
- return ret;
-}
-
static int ocfs2_write_remove_suid(struct inode *inode)
{
int ret;
@@ -2121,18 +2083,12 @@ out:
static int ocfs2_prepare_inode_for_write(struct file *file,
loff_t pos,
- size_t count,
- int appending,
- int *direct_io,
- int *has_refcount)
+ size_t count)
{
int ret = 0, meta_level = 0;
struct dentry *dentry = file->f_path.dentry;
struct inode *inode = d_inode(dentry);
loff_t end;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- int full_coherency = !(osb->s_mount_opt &
- OCFS2_MOUNT_COHERENCY_BUFFERED);
/*
* We start with a read level meta lock and only jump to an ex
@@ -2181,10 +2137,6 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
pos,
count,
&meta_level);
- if (has_refcount)
- *has_refcount = 1;
- if (direct_io)
- *direct_io = 0;
}
if (ret < 0) {
@@ -2192,67 +2144,12 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
goto out_unlock;
}
- /*
- * Skip the O_DIRECT checks if we don't need
- * them.
- */
- if (!direct_io || !(*direct_io))
- break;
-
- /*
- * There's no sane way to do direct writes to an inode
- * with inline data.
- */
- if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
- *direct_io = 0;
- break;
- }
-
- /*
- * Allowing concurrent direct writes means
- * i_size changes wouldn't be synchronized, so
- * one node could wind up truncating another
- * nodes writes.
- */
- if (end > i_size_read(inode) && !full_coherency) {
- *direct_io = 0;
- break;
- }
-
- /*
- * Fallback to old way if the feature bit is not set.
- */
- if (end > i_size_read(inode) &&
- !ocfs2_supports_append_dio(osb)) {
- *direct_io = 0;
- break;
- }
-
- /*
- * We don't fill holes during direct io, so
- * check for them here. If any are found, the
- * caller will have to retake some cluster
- * locks and initiate the io as buffered.
- */
- ret = ocfs2_check_range_for_holes(inode, pos, count);
- if (ret == 1) {
- /*
- * Fallback to old way if the feature bit is not set.
- * Otherwise try dio first and then complete the rest
- * request through buffer io.
- */
- if (!ocfs2_supports_append_dio(osb))
- *direct_io = 0;
- ret = 0;
- } else if (ret < 0)
- mlog_errno(ret);
break;
}
out_unlock:
trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
- pos, appending, count,
- direct_io, has_refcount);
+ pos, count);
if (meta_level >= 0)
ocfs2_inode_unlock(inode, meta_level);
@@ -2264,18 +2161,16 @@ out:
static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
struct iov_iter *from)
{
- int direct_io, appending, rw_level;
- int can_do_direct, has_refcount = 0;
+ int direct_io, rw_level;
ssize_t written = 0;
ssize_t ret;
- size_t count = iov_iter_count(from), orig_count;
+ size_t count = iov_iter_count(from);
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
int full_coherency = !(osb->s_mount_opt &
OCFS2_MOUNT_COHERENCY_BUFFERED);
int unaligned_dio = 0;
- int dropped_dio = 0;
int append_write = ((iocb->ki_pos + count) >=
i_size_read(inode) ? 1 : 0);
@@ -2288,12 +2183,10 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
if (count == 0)
return 0;
- appending = iocb->ki_flags & IOCB_APPEND ? 1 : 0;
direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
mutex_lock(&inode->i_mutex);
-relock:
/*
* Concurrent O_DIRECT writes are allowed with
* mount_option "coherency=buffered".
@@ -2326,7 +2219,6 @@ relock:
ocfs2_inode_unlock(inode, 1);
}
- orig_count = iov_iter_count(from);
ret = generic_write_checks(iocb, from);
if (ret <= 0) {
if (ret)
@@ -2335,9 +2227,7 @@ relock:
}
count = ret;
- can_do_direct = direct_io;
- ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, appending,
- &can_do_direct, &has_refcount);
+ ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -2346,22 +2236,6 @@ relock:
if (direct_io && !is_sync_kiocb(iocb))
unaligned_dio = ocfs2_is_io_unaligned(inode, count, iocb->ki_pos);
- /*
- * We can't complete the direct I/O as requested, fall back to
- * buffered I/O.
- */
- if (direct_io && !can_do_direct) {
- ocfs2_rw_unlock(inode, rw_level);
-
- rw_level = -1;
-
- direct_io = 0;
- iocb->ki_flags &= ~IOCB_DIRECT;
- iov_iter_reexpand(from, orig_count);
- dropped_dio = 1;
- goto relock;
- }
-
if (unaligned_dio) {
/*
* Wait on previous unaligned aio to complete before
@@ -2397,7 +2271,7 @@ relock:
goto no_sync;
if (((file->f_flags & O_DSYNC) && !direct_io) ||
- IS_SYNC(inode) || dropped_dio) {
+ IS_SYNC(inode)) {
ret = filemap_fdatawrite_range(file->f_mapping,
iocb->ki_pos - written,
iocb->ki_pos - 1);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 8f87e05ee25d..0fd9ebdd3ed8 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1125,6 +1125,9 @@ static void ocfs2_clear_inode(struct inode *inode)
mlog_bug_on_msg(!list_empty(&oi->ip_io_markers),
"Clear inode of %llu, inode has io markers\n",
(unsigned long long)oi->ip_blkno);
+ mlog_bug_on_msg(!list_empty(&oi->ip_unwritten_list),
+ "Clear inode of %llu, inode has unwritten extents\n",
+ (unsigned long long)oi->ip_blkno);
ocfs2_extent_map_trunc(inode, 0);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index aac8b86f312e..0c22ddd4b0dd 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -57,6 +57,9 @@ struct ocfs2_inode_info
u32 ip_flags; /* see below */
u32 ip_attr; /* inode attributes */
+ /* Record unwritten extents during direct io. */
+ struct list_head ip_unwritten_list;
+
/* protected by recovery_lock. */
struct inode *ip_next_orphan;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 13534f4fe5b5..af0c6b36cc66 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -231,7 +231,7 @@ void ocfs2_recovery_exit(struct ocfs2_super *osb)
/* At this point, we know that no more recovery threads can be
* launched, so wait for any recovery completion work to
* complete. */
- flush_workqueue(ocfs2_wq);
+ flush_workqueue(osb->ocfs2_wq);
/*
* Now that recovery is shut down, and the osb is about to be
@@ -1327,7 +1327,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
spin_lock(&journal->j_lock);
list_add_tail(&item->lri_list, &journal->j_la_cleanups);
- queue_work(ocfs2_wq, &journal->j_recovery_work);
+ queue_work(journal->j_osb->ocfs2_wq, &journal->j_recovery_work);
spin_unlock(&journal->j_lock);
}
@@ -1972,7 +1972,7 @@ static void ocfs2_orphan_scan_work(struct work_struct *work)
mutex_lock(&os->os_lock);
ocfs2_queue_orphan_scan(osb);
if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE)
- queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
+ queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work,
ocfs2_orphan_scan_timeout());
mutex_unlock(&os->os_lock);
}
@@ -2012,7 +2012,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
else {
atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE);
- queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
+ queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work,
ocfs2_orphan_scan_timeout());
}
}
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 0a4457fb0711..3e1931273268 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -387,7 +387,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
struct ocfs2_dinode *alloc = NULL;
cancel_delayed_work(&osb->la_enable_wq);
- flush_workqueue(ocfs2_wq);
+ flush_workqueue(osb->ocfs2_wq);
if (osb->local_alloc_state == OCFS2_LA_UNUSED)
goto out;
@@ -1087,7 +1087,7 @@ static int ocfs2_recalc_la_window(struct ocfs2_super *osb,
} else {
osb->local_alloc_state = OCFS2_LA_DISABLED;
}
- queue_delayed_work(ocfs2_wq, &osb->la_enable_wq,
+ queue_delayed_work(osb->ocfs2_wq, &osb->la_enable_wq,
OCFS2_LA_ENABLE_INTERVAL);
goto out_unlock;
}
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 9581d190f6e1..a88707a0f4da 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -104,8 +104,8 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
if (page->index == last_index)
len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
- ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page,
- &fsdata, di_bh, page);
+ ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP,
+ &locked_page, &fsdata, di_bh, page);
if (ret) {
if (ret != -ENOSPC)
mlog_errno(ret);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 7a0126267847..6cf6538a0651 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -464,6 +464,14 @@ struct ocfs2_super
struct ocfs2_refcount_tree *osb_ref_tree_lru;
struct mutex system_file_mutex;
+
+ /*
+ * OCFS2 needs to schedule several different types of work which
+ * require cluster locking, disk I/O, recovery waits, etc. Since these
+ * types of work tend to be heavy we avoid using the kernel events
+ * workqueue and schedule on our own.
+ */
+ struct workqueue_struct *ocfs2_wq;
};
#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 6cb019b7c6a8..09d0c89a9daf 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -1450,28 +1450,20 @@ DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range);
TRACE_EVENT(ocfs2_prepare_inode_for_write,
TP_PROTO(unsigned long long ino, unsigned long long saved_pos,
- int appending, unsigned long count,
- int *direct_io, int *has_refcount),
- TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount),
+ unsigned long count),
+ TP_ARGS(ino, saved_pos, count),
TP_STRUCT__entry(
__field(unsigned long long, ino)
__field(unsigned long long, saved_pos)
- __field(int, appending)
__field(unsigned long, count)
- __field(int, direct_io)
- __field(int, has_refcount)
),
TP_fast_assign(
__entry->ino = ino;
__entry->saved_pos = saved_pos;
- __entry->appending = appending;
__entry->count = count;
- __entry->direct_io = direct_io ? *direct_io : -1;
- __entry->has_refcount = has_refcount ? *has_refcount : -1;
),
- TP_printk("%llu %llu %d %lu %d %d", __entry->ino,
- __entry->saved_pos, __entry->appending, __entry->count,
- __entry->direct_io, __entry->has_refcount)
+ TP_printk("%llu %llu %lu", __entry->ino,
+ __entry->saved_pos, __entry->count)
);
DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret);
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index c93d67220887..44df24b62fef 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -726,7 +726,7 @@ static int ocfs2_release_dquot(struct dquot *dquot)
dqgrab(dquot);
/* First entry on list -> queue work */
if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list))
- queue_work(ocfs2_wq, &osb->dquot_drop_work);
+ queue_work(osb->ocfs2_wq, &osb->dquot_drop_work);
goto out;
}
status = ocfs2_lock_global_qf(oinfo, 1);
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index d5da6f624142..10ad87ba02e0 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -187,7 +187,7 @@ static int update_backups(struct inode * inode, u32 clusters, char *data)
for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
- if (cluster > clusters)
+ if (cluster >= clusters)
break;
ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 2de4c8a9340c..427248de9da5 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -79,12 +79,6 @@ static struct kmem_cache *ocfs2_inode_cachep;
struct kmem_cache *ocfs2_dquot_cachep;
struct kmem_cache *ocfs2_qf_chunk_cachep;
-/* OCFS2 needs to schedule several different types of work which
- * require cluster locking, disk I/O, recovery waits, etc. Since these
- * types of work tend to be heavy we avoid using the kernel events
- * workqueue and schedule on our own. */
-struct workqueue_struct *ocfs2_wq = NULL;
-
static struct dentry *ocfs2_debugfs_root;
MODULE_AUTHOR("Oracle");
@@ -1612,33 +1606,25 @@ static int __init ocfs2_init(void)
if (status < 0)
goto out2;
- ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
- if (!ocfs2_wq) {
- status = -ENOMEM;
- goto out3;
- }
-
ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
if (!ocfs2_debugfs_root) {
status = -ENOMEM;
mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
- goto out4;
+ goto out3;
}
ocfs2_set_locking_protocol();
status = register_quota_format(&ocfs2_quota_format);
if (status < 0)
- goto out4;
+ goto out3;
status = register_filesystem(&ocfs2_fs_type);
if (!status)
return 0;
unregister_quota_format(&ocfs2_quota_format);
-out4:
- destroy_workqueue(ocfs2_wq);
- debugfs_remove(ocfs2_debugfs_root);
out3:
+ debugfs_remove(ocfs2_debugfs_root);
ocfs2_free_mem_caches();
out2:
exit_ocfs2_uptodate_cache();
@@ -1649,11 +1635,6 @@ out1:
static void __exit ocfs2_exit(void)
{
- if (ocfs2_wq) {
- flush_workqueue(ocfs2_wq);
- destroy_workqueue(ocfs2_wq);
- }
-
unregister_quota_format(&ocfs2_quota_format);
debugfs_remove(ocfs2_debugfs_root);
@@ -1744,6 +1725,7 @@ static void ocfs2_inode_init_once(void *data)
spin_lock_init(&oi->ip_lock);
ocfs2_extent_map_init(&oi->vfs_inode);
INIT_LIST_HEAD(&oi->ip_io_markers);
+ INIT_LIST_HEAD(&oi->ip_unwritten_list);
oi->ip_dir_start_lookup = 0;
mutex_init(&oi->ip_unaligned_aio);
init_rwsem(&oi->ip_alloc_sem);
@@ -2348,6 +2330,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
}
cleancache_init_shared_fs(sb);
+ osb->ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
+ if (!osb->ocfs2_wq) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ }
+
bail:
return status;
}
@@ -2535,6 +2523,12 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
{
/* This function assumes that the caller has the main osb resource */
+ /* ocfs2_initializer_super have already created this workqueue */
+ if (osb->ocfs2_wq) {
+ flush_workqueue(osb->ocfs2_wq);
+ destroy_workqueue(osb->ocfs2_wq);
+ }
+
ocfs2_free_slot_info(osb);
kfree(osb->osb_orphan_wipes);
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index b477d0b1c7b6..b023e4f3d740 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -26,8 +26,6 @@
#ifndef OCFS2_SUPER_H
#define OCFS2_SUPER_H
-extern struct workqueue_struct *ocfs2_wq;
-
int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
int node_num);
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 93484034a03d..b2855eea5405 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -103,9 +103,9 @@ u64 stable_page_flags(struct page *page)
* pseudo flags for the well known (anonymous) memory mapped pages
*
* Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the
- * simple test in page_mapped() is not enough.
+ * simple test in page_mapcount() is not enough.
*/
- if (!PageSlab(page) && page_mapped(page))
+ if (!PageSlab(page) && page_mapcount(page))
u |= 1 << KPF_MMAP;
if (PageAnon(page))
u |= 1 << KPF_ANON;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 187b3b5f242e..9e0938bdf85d 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -454,9 +454,10 @@ struct mem_size_stats {
};
static void smaps_account(struct mem_size_stats *mss, struct page *page,
- unsigned long size, bool young, bool dirty)
+ bool compound, bool young, bool dirty)
{
- int mapcount;
+ int i, nr = compound ? HPAGE_PMD_NR : 1;
+ unsigned long size = nr * PAGE_SIZE;
if (PageAnon(page))
mss->anonymous += size;
@@ -465,23 +466,37 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
/* Accumulate the size in pages that have been accessed. */
if (young || page_is_young(page) || PageReferenced(page))
mss->referenced += size;
- mapcount = page_mapcount(page);
- if (mapcount >= 2) {
- u64 pss_delta;
- if (dirty || PageDirty(page))
- mss->shared_dirty += size;
- else
- mss->shared_clean += size;
- pss_delta = (u64)size << PSS_SHIFT;
- do_div(pss_delta, mapcount);
- mss->pss += pss_delta;
- } else {
+ /*
+ * page_count(page) == 1 guarantees the page is mapped exactly once.
+ * If any subpage of the compound page mapped with PTE it would elevate
+ * page_count().
+ */
+ if (page_count(page) == 1) {
if (dirty || PageDirty(page))
mss->private_dirty += size;
else
mss->private_clean += size;
mss->pss += (u64)size << PSS_SHIFT;
+ return;
+ }
+
+ for (i = 0; i < nr; i++, page++) {
+ int mapcount = page_mapcount(page);
+
+ if (mapcount >= 2) {
+ if (dirty || PageDirty(page))
+ mss->shared_dirty += PAGE_SIZE;
+ else
+ mss->shared_clean += PAGE_SIZE;
+ mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
+ } else {
+ if (dirty || PageDirty(page))
+ mss->private_dirty += PAGE_SIZE;
+ else
+ mss->private_clean += PAGE_SIZE;
+ mss->pss += PAGE_SIZE << PSS_SHIFT;
+ }
}
}
@@ -516,7 +531,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
if (!page)
return;
- smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte));
+
+ smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte));
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -532,8 +548,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
if (IS_ERR_OR_NULL(page))
return;
mss->anonymous_thp += HPAGE_PMD_SIZE;
- smaps_account(mss, page, HPAGE_PMD_SIZE,
- pmd_young(*pmd), pmd_dirty(*pmd));
+ smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd));
}
#else
static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -549,7 +564,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t *pte;
spinlock_t *ptl;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
smaps_pmd_entry(pmd, addr, walk);
spin_unlock(ptl);
return 0;
@@ -838,7 +853,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
spinlock_t *ptl;
struct page *page;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
clear_soft_dirty_pmd(vma, addr, pmd);
goto out;
@@ -1112,7 +1127,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
int err = 0;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmdp, vma, &ptl)) {
u64 flags = 0, frame = 0;
pmd_t pmd = *pmdp;
@@ -1444,7 +1459,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
pte_t *orig_pte;
pte_t *pte;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
pte_t huge_pte = *(pte_t *)pmd;
struct page *page;
diff --git a/include/asm-generic/dma-mapping-common.h b/include/asm-generic/dma-mapping-common.h
index b1bc954eccf3..0f3e16b1ea64 100644
--- a/include/asm-generic/dma-mapping-common.h
+++ b/include/asm-generic/dma-mapping-common.h
@@ -260,7 +260,7 @@ static inline void *dma_alloc_attrs(struct device *dev, size_t size,
return NULL;
cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
- debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
+ debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr, flag);
return cpu_addr;
}
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 14b0ff32fb9f..63abda1ac06d 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -207,11 +207,6 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
-#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-extern void pmdp_splitting_flush(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmdp);
-#endif
-
#ifndef pmdp_collapse_flush
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
@@ -619,10 +614,6 @@ static inline int pmd_trans_huge(pmd_t pmd)
{
return 0;
}
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
- return 0;
-}
#ifndef __HAVE_ARCH_PMD_WRITE
static inline int pmd_write(pmd_t pmd)
{
diff --git a/include/linux/configfs.h b/include/linux/configfs.h
index a8a335b7fce0..758a029011b1 100644
--- a/include/linux/configfs.h
+++ b/include/linux/configfs.h
@@ -197,6 +197,16 @@ static inline struct configfs_subsystem *to_configfs_subsystem(struct config_gro
int configfs_register_subsystem(struct configfs_subsystem *subsys);
void configfs_unregister_subsystem(struct configfs_subsystem *subsys);
+int configfs_register_group(struct config_group *parent_group,
+ struct config_group *group);
+void configfs_unregister_group(struct config_group *group);
+
+struct config_group *
+configfs_register_default_group(struct config_group *parent_group,
+ const char *name,
+ struct config_item_type *item_type);
+void configfs_unregister_default_group(struct config_group *group);
+
/* These functions can sleep and can alloc with GFP_KERNEL */
/* WARNING: These cannot be called underneath configfs callbacks!! */
int configfs_depend_item(struct configfs_subsystem *subsys, struct config_item *target);
diff --git a/include/linux/crc64_ecma.h b/include/linux/crc64_ecma.h
new file mode 100644
index 000000000000..bba7a4d692b3
--- /dev/null
+++ b/include/linux/crc64_ecma.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __CRC64_ECMA_H_
+#define __CRC64_ECMA_H_
+
+#include <linux/types.h>
+
+
+#define CRC64_DEFAULT_INITVAL 0xFFFFFFFFFFFFFFFFULL
+
+
+/*
+ * crc64_ecma_seed - Initializes the CRC64 ECMA seed.
+ */
+u64 crc64_ecma_seed(void);
+
+/*
+ * crc64_ecma - Computes the 64 bit ECMA CRC.
+ *
+ * @pdata: pointer to the data to compute checksum for.
+ * @nbytes: number of bytes in data buffer.
+ * @seed: CRC seed.
+ */
+u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed);
+
+#endif /* __CRC64_ECMA_H_ */
diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h
index fe8cb610deac..e5f539dd56bf 100644
--- a/include/linux/dma-debug.h
+++ b/include/linux/dma-debug.h
@@ -51,7 +51,8 @@ extern void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
int nelems, int dir);
extern void debug_dma_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t dma_addr, void *virt);
+ dma_addr_t dma_addr, void *virt,
+ gfp_t flags);
extern void debug_dma_free_coherent(struct device *dev, size_t size,
void *virt, dma_addr_t addr);
@@ -132,7 +133,8 @@ static inline void debug_dma_unmap_sg(struct device *dev,
}
static inline void debug_dma_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t dma_addr, void *virt)
+ dma_addr_t dma_addr, void *virt,
+ gfp_t flags)
{
}
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index ecb080d6ff42..72cd942edb22 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -25,7 +25,7 @@ extern int zap_huge_pmd(struct mmu_gather *tlb,
extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
unsigned char *vec);
-extern int move_huge_pmd(struct vm_area_struct *vma,
+extern bool move_huge_pmd(struct vm_area_struct *vma,
struct vm_area_struct *new_vma,
unsigned long old_addr,
unsigned long new_addr, unsigned long old_end,
@@ -48,17 +48,6 @@ enum transparent_hugepage_flag {
#endif
};
-enum page_check_address_pmd_flag {
- PAGE_CHECK_ADDRESS_PMD_FLAG,
- PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG,
- PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG,
-};
-extern pmd_t *page_check_address_pmd(struct page *page,
- struct mm_struct *mm,
- unsigned long address,
- enum page_check_address_pmd_flag flag,
- spinlock_t **ptl);
-
#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
@@ -95,30 +84,27 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
#endif /* CONFIG_DEBUG_VM */
extern unsigned long transparent_hugepage_flags;
-extern int split_huge_page_to_list(struct page *page, struct list_head *list);
+
+extern void prep_transhuge_page(struct page *page);
+extern void free_transhuge_page(struct page *page);
+
+int split_huge_page_to_list(struct page *page, struct list_head *list);
static inline int split_huge_page(struct page *page)
{
return split_huge_page_to_list(page, NULL);
}
-extern void __split_huge_page_pmd(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd);
-#define split_huge_page_pmd(__vma, __address, __pmd) \
+void deferred_split_huge_page(struct page *page);
+
+void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long address);
+
+#define split_huge_pmd(__vma, __pmd, __address) \
do { \
pmd_t *____pmd = (__pmd); \
- if (unlikely(pmd_trans_huge(*____pmd))) \
- __split_huge_page_pmd(__vma, __address, \
- ____pmd); \
+ if (pmd_trans_huge(*____pmd)) \
+ __split_huge_pmd(__vma, __pmd, __address); \
} while (0)
-#define wait_split_huge_page(__anon_vma, __pmd) \
- do { \
- pmd_t *____pmd = (__pmd); \
- anon_vma_lock_write(__anon_vma); \
- anon_vma_unlock_write(__anon_vma); \
- BUG_ON(pmd_trans_splitting(*____pmd) || \
- pmd_trans_huge(*____pmd)); \
- } while (0)
-extern void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
- pmd_t *pmd);
+
#if HPAGE_PMD_ORDER >= MAX_ORDER
#error "hugepages can't be allocated by the buddy allocator"
#endif
@@ -128,17 +114,17 @@ extern void vma_adjust_trans_huge(struct vm_area_struct *vma,
unsigned long start,
unsigned long end,
long adjust_next);
-extern int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+extern bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
spinlock_t **ptl);
/* mmap_sem must be held on entry */
-static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+static inline bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
spinlock_t **ptl)
{
VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma);
if (pmd_trans_huge(*pmd))
return __pmd_trans_huge_lock(pmd, vma, ptl);
else
- return 0;
+ return false;
}
static inline int hpage_nr_pages(struct page *page)
{
@@ -183,11 +169,8 @@ static inline int split_huge_page(struct page *page)
{
return 0;
}
-#define split_huge_page_pmd(__vma, __address, __pmd) \
- do { } while (0)
-#define wait_split_huge_page(__anon_vma, __pmd) \
- do { } while (0)
-#define split_huge_page_pmd_mm(__mm, __address, __pmd) \
+static inline void deferred_split_huge_page(struct page *page) {}
+#define split_huge_pmd(__vma, __pmd, __address) \
do { } while (0)
static inline int hugepage_madvise(struct vm_area_struct *vma,
unsigned long *vm_flags, int advice)
@@ -201,10 +184,10 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
long adjust_next)
{
}
-static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+static inline bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
spinlock_t **ptl)
{
- return 0;
+ return false;
}
static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
diff --git a/include/linux/iio/sw_trigger.h b/include/linux/iio/sw_trigger.h
new file mode 100644
index 000000000000..c2f33b2b35a5
--- /dev/null
+++ b/include/linux/iio/sw_trigger.h
@@ -0,0 +1,71 @@
+/*
+ * Industrial I/O software trigger interface
+ *
+ * Copyright (c) 2015 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#ifndef __IIO_SW_TRIGGER
+#define __IIO_SW_TRIGGER
+
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/iio/iio.h>
+#include <linux/configfs.h>
+
+#define module_iio_sw_trigger_driver(__iio_sw_trigger_type) \
+ module_driver(__iio_sw_trigger_type, iio_register_sw_trigger_type, \
+ iio_unregister_sw_trigger_type)
+
+extern struct configfs_subsystem iio_configfs_subsys;
+struct iio_sw_trigger_ops;
+
+struct iio_sw_trigger_type {
+ const char *name;
+ struct module *owner;
+ const struct iio_sw_trigger_ops *ops;
+ struct list_head list;
+ struct config_group *group;
+};
+
+struct iio_sw_trigger {
+ struct iio_trigger *trigger;
+ struct iio_sw_trigger_type *trigger_type;
+ struct config_group group;
+};
+
+struct iio_sw_trigger_ops {
+ struct iio_sw_trigger* (*probe)(const char *);
+ int (*remove)(struct iio_sw_trigger *);
+};
+
+static inline
+struct iio_sw_trigger *to_iio_sw_trigger(struct config_item *item)
+{
+ return container_of(to_config_group(item), struct iio_sw_trigger,
+ group);
+}
+
+int iio_register_sw_trigger_type(struct iio_sw_trigger_type *tt);
+void iio_unregister_sw_trigger_type(struct iio_sw_trigger_type *tt);
+
+struct iio_sw_trigger *iio_sw_trigger_create(const char *, const char *);
+void iio_sw_trigger_destroy(struct iio_sw_trigger *);
+
+int iio_sw_trigger_type_configfs_register(struct iio_sw_trigger_type *tt);
+void iio_sw_trigger_type_configfs_unregister(struct iio_sw_trigger_type *tt);
+
+static inline
+void iio_swt_group_init_type_name(struct iio_sw_trigger *t,
+ const char *name,
+ struct config_item_type *type)
+{
+#ifdef CONFIG_CONFIGFS_FS
+ config_group_init_type_name(&t->group, name, type);
+#endif
+}
+
+#endif /* __IIO_SW_TRIGGER */
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index d140b1e9faa7..2da38f093391 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -269,6 +269,8 @@ unsigned long paddr_vmcoreinfo_note(void);
vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name)
#define VMCOREINFO_CONFIG(name) \
vmcoreinfo_append_str("CONFIG_%s=y\n", #name)
+#define VMCOREINFO_PHYS_BASE(value) \
+ vmcoreinfo_append_str("PHYS_BASE=%lx\n", (unsigned long)value)
extern struct kimage *kexec_image;
extern struct kimage *kexec_crash_image;
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index cd0e2413c358..ffc5460ed9e5 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -294,10 +294,12 @@ static inline void mem_cgroup_events(struct mem_cgroup *memcg,
bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcgp);
+ gfp_t gfp_mask, struct mem_cgroup **memcgp,
+ bool compound);
void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
- bool lrucare);
-void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg);
+ bool lrucare, bool compound);
+void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
+ bool compound);
void mem_cgroup_uncharge(struct page *page);
void mem_cgroup_uncharge_list(struct list_head *page_list);
@@ -513,7 +515,8 @@ static inline bool mem_cgroup_low(struct mem_cgroup *root,
static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask,
- struct mem_cgroup **memcgp)
+ struct mem_cgroup **memcgp,
+ bool compound)
{
*memcgp = NULL;
return 0;
@@ -521,12 +524,13 @@ static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
static inline void mem_cgroup_commit_charge(struct page *page,
struct mem_cgroup *memcg,
- bool lrucare)
+ bool lrucare, bool compound)
{
}
static inline void mem_cgroup_cancel_charge(struct page *page,
- struct mem_cgroup *memcg)
+ struct mem_cgroup *memcg,
+ bool compound)
{
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 00bad7793788..fc9a3b8335bd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -395,39 +395,17 @@ static inline int is_vmalloc_or_module_addr(const void *x)
extern void kvfree(const void *addr);
-static inline void compound_lock(struct page *page)
+static inline atomic_t *compound_mapcount_ptr(struct page *page)
{
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- VM_BUG_ON_PAGE(PageSlab(page), page);
- bit_spin_lock(PG_compound_lock, &page->flags);
-#endif
-}
-
-static inline void compound_unlock(struct page *page)
-{
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- VM_BUG_ON_PAGE(PageSlab(page), page);
- bit_spin_unlock(PG_compound_lock, &page->flags);
-#endif
-}
-
-static inline unsigned long compound_lock_irqsave(struct page *page)
-{
- unsigned long uninitialized_var(flags);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- local_irq_save(flags);
- compound_lock(page);
-#endif
- return flags;
+ return &page[1].compound_mapcount;
}
-static inline void compound_unlock_irqrestore(struct page *page,
- unsigned long flags)
+static inline int compound_mapcount(struct page *page)
{
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- compound_unlock(page);
- local_irq_restore(flags);
-#endif
+ if (!PageCompound(page))
+ return 0;
+ page = compound_head(page);
+ return atomic_read(compound_mapcount_ptr(page)) + 1;
}
/*
@@ -440,55 +418,34 @@ static inline void page_mapcount_reset(struct page *page)
atomic_set(&(page)->_mapcount, -1);
}
+int __page_mapcount(struct page *page);
+
static inline int page_mapcount(struct page *page)
{
VM_BUG_ON_PAGE(PageSlab(page), page);
- return atomic_read(&page->_mapcount) + 1;
-}
-
-static inline int page_count(struct page *page)
-{
- return atomic_read(&compound_head(page)->_count);
-}
-static inline bool __compound_tail_refcounted(struct page *page)
-{
- return PageAnon(page) && !PageSlab(page) && !PageHeadHuge(page);
+ if (unlikely(PageCompound(page)))
+ return __page_mapcount(page);
+ return atomic_read(&page->_mapcount) + 1;
}
-/*
- * This takes a head page as parameter and tells if the
- * tail page reference counting can be skipped.
- *
- * For this to be safe, PageSlab and PageHeadHuge must remain true on
- * any given page where they return true here, until all tail pins
- * have been released.
- */
-static inline bool compound_tail_refcounted(struct page *page)
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int total_mapcount(struct page *page);
+#else
+static inline int total_mapcount(struct page *page)
{
- VM_BUG_ON_PAGE(!PageHead(page), page);
- return __compound_tail_refcounted(page);
+ return page_mapcount(page);
}
+#endif
-static inline void get_huge_page_tail(struct page *page)
+static inline int page_count(struct page *page)
{
- /*
- * __split_huge_page_refcount() cannot run from under us.
- */
- VM_BUG_ON_PAGE(!PageTail(page), page);
- VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
- VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
- if (compound_tail_refcounted(compound_head(page)))
- atomic_inc(&page->_mapcount);
+ return atomic_read(&compound_head(page)->_count);
}
-extern bool __get_page_tail(struct page *page);
-
static inline void get_page(struct page *page)
{
- if (unlikely(PageTail(page)))
- if (likely(__get_page_tail(page)))
- return;
+ page = compound_head(page);
/*
* Getting a normal page or the head of a compound page
* requires to already have an elevated page->_count.
@@ -513,7 +470,15 @@ static inline void init_page_count(struct page *page)
atomic_set(&page->_count, 1);
}
-void put_page(struct page *page);
+void __put_page(struct page *page);
+
+static inline void put_page(struct page *page)
+{
+ page = compound_head(page);
+ if (put_page_testzero(page))
+ __put_page(page);
+}
+
void put_pages_list(struct list_head *pages);
void split_page(struct page *page, unsigned int order);
@@ -533,6 +498,9 @@ enum compound_dtor_id {
#ifdef CONFIG_HUGETLB_PAGE
HUGETLB_PAGE_DTOR,
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ TRANSHUGE_PAGE_DTOR,
+#endif
NR_COMPOUND_DTORS,
};
extern compound_page_dtor * const compound_page_dtors[];
@@ -562,6 +530,8 @@ static inline void set_compound_order(struct page *page, unsigned int order)
page[1].compound_order = order;
}
+void free_compound_page(struct page *page);
+
#ifdef CONFIG_MMU
/*
* Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
@@ -978,10 +948,21 @@ static inline pgoff_t page_file_index(struct page *page)
/*
* Return true if this page is mapped into pagetables.
+ * For compound page it returns true if any subpage of compound page is mapped.
*/
-static inline int page_mapped(struct page *page)
-{
- return atomic_read(&(page)->_mapcount) >= 0;
+static inline bool page_mapped(struct page *page)
+{
+ int i;
+ if (likely(!PageCompound(page)))
+ return atomic_read(&page->_mapcount) >= 0;
+ page = compound_head(page);
+ if (atomic_read(compound_mapcount_ptr(page)) >= 0)
+ return true;
+ for (i = 0; i < hpage_nr_pages(page); i++) {
+ if (atomic_read(&page[i]._mapcount) >= 0)
+ return true;
+ }
+ return false;
}
/*
@@ -2222,7 +2203,7 @@ extern int memory_failure(unsigned long pfn, int trapno, int flags);
extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
extern int unpoison_memory(unsigned long pfn);
extern int get_hwpoison_page(struct page *page);
-extern void put_hwpoison_page(struct page *page);
+#define put_hwpoison_page(page) put_page(page)
extern int sysctl_memory_failure_early_kill;
extern int sysctl_memory_failure_recovery;
extern void shake_page(struct page *p, int access);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index f8d1492a114f..5e37e918c614 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -54,6 +54,8 @@ struct page {
* see PAGE_MAPPING_ANON below.
*/
void *s_mem; /* slab first object */
+ atomic_t compound_mapcount; /* first tail page */
+ /* page_deferred_list().next -- second tail page */
};
/* Second double word */
@@ -61,6 +63,7 @@ struct page {
union {
pgoff_t index; /* Our offset within mapping. */
void *freelist; /* sl[aou]b first free object */
+ /* page_deferred_list().prev -- second tail page */
};
union {
@@ -81,20 +84,9 @@ struct page {
union {
/*
- * Count of ptes mapped in
- * mms, to show when page is
- * mapped & limit reverse map
- * searches.
- *
- * Used also for tail pages
- * refcounting instead of
- * _count. Tail pages cannot
- * be mapped and keeping the
- * tail page _count zero at
- * all times guarantees
- * get_page_unless_zero() will
- * never succeed on tail
- * pages.
+ * Count of ptes mapped in mms, to show
+ * when page is mapped & limit reverse
+ * map searches.
*/
atomic_t _mapcount;
diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h
index 877ef226f90f..c447d8055e50 100644
--- a/include/linux/mmdebug.h
+++ b/include/linux/mmdebug.h
@@ -55,4 +55,10 @@ void dump_mm(const struct mm_struct *mm);
#define VIRTUAL_BUG_ON(cond) do { } while (0)
#endif
+#ifdef CONFIG_DEBUG_VM_PGFLAGS
+#define VM_BUG_ON_PGFLAGS(cond, page) VM_BUG_ON_PAGE(cond, page)
+#else
+#define VM_BUG_ON_PGFLAGS(cond, page) BUILD_BUG_ON_INVALID(cond)
+#endif
+
#endif
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index bb53c7b86315..dab6cff11e18 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -101,9 +101,6 @@ enum pageflags {
#ifdef CONFIG_MEMORY_FAILURE
PG_hwpoison, /* hardware poisoned page. Don't touch */
#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- PG_compound_lock,
-#endif
#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
PG_young,
PG_idle,
@@ -129,53 +126,108 @@ enum pageflags {
/* SLOB */
PG_slob_free = PG_private,
+
+ /* Compound pages. Stored in first tail page's flags */
+ PG_double_map = PG_private_2,
};
#ifndef __GENERATING_BOUNDS_H
+struct page; /* forward declaration */
+
+static inline struct page *compound_head(struct page *page)
+{
+ unsigned long head = READ_ONCE(page->compound_head);
+
+ if (unlikely(head & 1))
+ return (struct page *) (head - 1);
+ return page;
+}
+
+static inline int PageTail(struct page *page)
+{
+ return READ_ONCE(page->compound_head) & 1;
+}
+
+static inline int PageCompound(struct page *page)
+{
+ return test_bit(PG_head, &page->flags) || PageTail(page);
+}
+
+/*
+ * Page flags policies wrt compound pages
+ *
+ * PF_ANY:
+ * the page flag is relevant for small, head and tail pages.
+ *
+ * PF_HEAD:
+ * for compound page all operations related to the page flag applied to
+ * head page.
+ *
+ * PF_NO_TAIL:
+ * modifications of the page flag must be done on small or head pages,
+ * checks can be done on tail pages too.
+ *
+ * PF_NO_COMPOUND:
+ * the page flag is not relevant for compound pages.
+ */
+#define PF_ANY(page, enforce) page
+#define PF_HEAD(page, enforce) compound_head(page)
+#define PF_NO_TAIL(page, enforce) ({ \
+ VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page); \
+ compound_head(page);})
+#define PF_NO_COMPOUND(page, enforce) ({ \
+ VM_BUG_ON_PGFLAGS(enforce && PageCompound(page), page); \
+ page;})
+
/*
* Macros to create function definitions for page flags
*/
-#define TESTPAGEFLAG(uname, lname) \
-static inline int Page##uname(const struct page *page) \
- { return test_bit(PG_##lname, &page->flags); }
+#define TESTPAGEFLAG(uname, lname, policy) \
+static inline int Page##uname(struct page *page) \
+ { return test_bit(PG_##lname, &policy(page, 0)->flags); }
-#define SETPAGEFLAG(uname, lname) \
+#define SETPAGEFLAG(uname, lname, policy) \
static inline void SetPage##uname(struct page *page) \
- { set_bit(PG_##lname, &page->flags); }
+ { set_bit(PG_##lname, &policy(page, 1)->flags); }
-#define CLEARPAGEFLAG(uname, lname) \
+#define CLEARPAGEFLAG(uname, lname, policy) \
static inline void ClearPage##uname(struct page *page) \
- { clear_bit(PG_##lname, &page->flags); }
+ { clear_bit(PG_##lname, &policy(page, 1)->flags); }
-#define __SETPAGEFLAG(uname, lname) \
+#define __SETPAGEFLAG(uname, lname, policy) \
static inline void __SetPage##uname(struct page *page) \
- { __set_bit(PG_##lname, &page->flags); }
+ { __set_bit(PG_##lname, &policy(page, 1)->flags); }
-#define __CLEARPAGEFLAG(uname, lname) \
+#define __CLEARPAGEFLAG(uname, lname, policy) \
static inline void __ClearPage##uname(struct page *page) \
- { __clear_bit(PG_##lname, &page->flags); }
+ { __clear_bit(PG_##lname, &policy(page, 1)->flags); }
-#define TESTSETFLAG(uname, lname) \
+#define TESTSETFLAG(uname, lname, policy) \
static inline int TestSetPage##uname(struct page *page) \
- { return test_and_set_bit(PG_##lname, &page->flags); }
+ { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); }
-#define TESTCLEARFLAG(uname, lname) \
+#define TESTCLEARFLAG(uname, lname, policy) \
static inline int TestClearPage##uname(struct page *page) \
- { return test_and_clear_bit(PG_##lname, &page->flags); }
+ { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
-#define __TESTCLEARFLAG(uname, lname) \
+#define __TESTCLEARFLAG(uname, lname, policy) \
static inline int __TestClearPage##uname(struct page *page) \
- { return __test_and_clear_bit(PG_##lname, &page->flags); }
+ { return __test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
-#define PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \
- SETPAGEFLAG(uname, lname) CLEARPAGEFLAG(uname, lname)
+#define PAGEFLAG(uname, lname, policy) \
+ TESTPAGEFLAG(uname, lname, policy) \
+ SETPAGEFLAG(uname, lname, policy) \
+ CLEARPAGEFLAG(uname, lname, policy)
-#define __PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \
- __SETPAGEFLAG(uname, lname) __CLEARPAGEFLAG(uname, lname)
+#define __PAGEFLAG(uname, lname, policy) \
+ TESTPAGEFLAG(uname, lname, policy) \
+ __SETPAGEFLAG(uname, lname, policy) \
+ __CLEARPAGEFLAG(uname, lname, policy)
-#define TESTSCFLAG(uname, lname) \
- TESTSETFLAG(uname, lname) TESTCLEARFLAG(uname, lname)
+#define TESTSCFLAG(uname, lname, policy) \
+ TESTSETFLAG(uname, lname, policy) \
+ TESTCLEARFLAG(uname, lname, policy)
#define TESTPAGEFLAG_FALSE(uname) \
static inline int Page##uname(const struct page *page) { return 0; }
@@ -204,47 +256,56 @@ static inline int __TestClearPage##uname(struct page *page) { return 0; }
#define TESTSCFLAG_FALSE(uname) \
TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname)
-struct page; /* forward declaration */
-
-TESTPAGEFLAG(Locked, locked)
-PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error)
-PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced)
- __SETPAGEFLAG(Referenced, referenced)
-PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty)
-PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru)
-PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active)
- TESTCLEARFLAG(Active, active)
-__PAGEFLAG(Slab, slab)
-PAGEFLAG(Checked, checked) /* Used by some filesystems */
-PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */
-PAGEFLAG(SavePinned, savepinned); /* Xen */
-PAGEFLAG(Foreign, foreign); /* Xen */
-PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
-PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked)
- __SETPAGEFLAG(SwapBacked, swapbacked)
-
-__PAGEFLAG(SlobFree, slob_free)
+__PAGEFLAG(Locked, locked, PF_NO_TAIL)
+PAGEFLAG(Error, error, PF_NO_COMPOUND) TESTCLEARFLAG(Error, error, PF_NO_COMPOUND)
+PAGEFLAG(Referenced, referenced, PF_HEAD)
+ TESTCLEARFLAG(Referenced, referenced, PF_HEAD)
+ __SETPAGEFLAG(Referenced, referenced, PF_HEAD)
+PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)
+ __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD)
+PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)
+PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
+ TESTCLEARFLAG(Active, active, PF_HEAD)
+__PAGEFLAG(Slab, slab, PF_NO_TAIL)
+__PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL)
+PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */
+
+/* Xen */
+PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND)
+ TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND)
+PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND);
+PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND);
+
+PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
+ __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
+PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
+ __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
+ __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
/*
* Private page markings that may be used by the filesystem that owns the page
* for its own purposes.
* - PG_private and PG_private_2 cause releasepage() and co to be invoked
*/
-PAGEFLAG(Private, private) __SETPAGEFLAG(Private, private)
- __CLEARPAGEFLAG(Private, private)
-PAGEFLAG(Private2, private_2) TESTSCFLAG(Private2, private_2)
-PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1)
+PAGEFLAG(Private, private, PF_ANY) __SETPAGEFLAG(Private, private, PF_ANY)
+ __CLEARPAGEFLAG(Private, private, PF_ANY)
+PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY)
+PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
+ TESTCLEARFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
/*
* Only test-and-set exist for PG_writeback. The unconditional operators are
* risky: they bypass page accounting.
*/
-TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback)
-PAGEFLAG(MappedToDisk, mappedtodisk)
+TESTPAGEFLAG(Writeback, writeback, PF_NO_COMPOUND)
+ TESTSCFLAG(Writeback, writeback, PF_NO_COMPOUND)
+PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_COMPOUND)
/* PG_readahead is only used for reads; PG_reclaim is only for writes */
-PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim)
-PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim)
+PAGEFLAG(Reclaim, reclaim, PF_NO_COMPOUND)
+ TESTCLEARFLAG(Reclaim, reclaim, PF_NO_COMPOUND)
+PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND)
+ TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND)
#ifdef CONFIG_HIGHMEM
/*
@@ -257,31 +318,34 @@ PAGEFLAG_FALSE(HighMem)
#endif
#ifdef CONFIG_SWAP
-PAGEFLAG(SwapCache, swapcache)
+PAGEFLAG(SwapCache, swapcache, PF_NO_COMPOUND)
#else
PAGEFLAG_FALSE(SwapCache)
#endif
-PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable)
- TESTCLEARFLAG(Unevictable, unevictable)
+PAGEFLAG(Unevictable, unevictable, PF_HEAD)
+ __CLEARPAGEFLAG(Unevictable, unevictable, PF_HEAD)
+ TESTCLEARFLAG(Unevictable, unevictable, PF_HEAD)
#ifdef CONFIG_MMU
-PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked)
- TESTSCFLAG(Mlocked, mlocked) __TESTCLEARFLAG(Mlocked, mlocked)
+PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
+ __CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
+ TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL)
+ __TESTCLEARFLAG(Mlocked, mlocked, PF_NO_TAIL)
#else
PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked)
TESTSCFLAG_FALSE(Mlocked) __TESTCLEARFLAG_FALSE(Mlocked)
#endif
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
-PAGEFLAG(Uncached, uncached)
+PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND)
#else
PAGEFLAG_FALSE(Uncached)
#endif
#ifdef CONFIG_MEMORY_FAILURE
-PAGEFLAG(HWPoison, hwpoison)
-TESTSCFLAG(HWPoison, hwpoison)
+PAGEFLAG(HWPoison, hwpoison, PF_ANY)
+TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
#define __PG_HWPOISON (1UL << PG_hwpoison)
#else
PAGEFLAG_FALSE(HWPoison)
@@ -289,10 +353,10 @@ PAGEFLAG_FALSE(HWPoison)
#endif
#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
-TESTPAGEFLAG(Young, young)
-SETPAGEFLAG(Young, young)
-TESTCLEARFLAG(Young, young)
-PAGEFLAG(Idle, idle)
+TESTPAGEFLAG(Young, young, PF_ANY)
+SETPAGEFLAG(Young, young, PF_ANY)
+TESTCLEARFLAG(Young, young, PF_ANY)
+PAGEFLAG(Idle, idle, PF_ANY)
#endif
/*
@@ -317,6 +381,7 @@ PAGEFLAG(Idle, idle)
static inline int PageAnon(struct page *page)
{
+ page = compound_head(page);
return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
}
@@ -329,6 +394,7 @@ static inline int PageAnon(struct page *page)
*/
static inline int PageKsm(struct page *page)
{
+ page = compound_head(page);
return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
}
@@ -340,8 +406,9 @@ u64 stable_page_flags(struct page *page);
static inline int PageUptodate(struct page *page)
{
- int ret = test_bit(PG_uptodate, &(page)->flags);
-
+ int ret;
+ page = compound_head(page);
+ ret = test_bit(PG_uptodate, &(page)->flags);
/*
* Must ensure that the data we read out of the page is loaded
* _after_ we've loaded page->flags to check for PageUptodate.
@@ -358,22 +425,24 @@ static inline int PageUptodate(struct page *page)
static inline void __SetPageUptodate(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
smp_wmb();
- __set_bit(PG_uptodate, &(page)->flags);
+ __set_bit(PG_uptodate, &page->flags);
}
static inline void SetPageUptodate(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
/*
* Memory barrier must be issued before setting the PG_uptodate bit,
* so that all previous stores issued in order to bring the page
* uptodate are actually visible before PageUptodate becomes true.
*/
smp_wmb();
- set_bit(PG_uptodate, &(page)->flags);
+ set_bit(PG_uptodate, &page->flags);
}
-CLEARPAGEFLAG(Uptodate, uptodate)
+CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL)
int test_clear_page_writeback(struct page *page);
int __test_set_page_writeback(struct page *page, bool keep_write);
@@ -393,12 +462,7 @@ static inline void set_page_writeback_keepwrite(struct page *page)
test_set_page_writeback_keepwrite(page);
}
-__PAGEFLAG(Head, head) CLEARPAGEFLAG(Head, head)
-
-static inline int PageTail(struct page *page)
-{
- return READ_ONCE(page->compound_head) & 1;
-}
+__PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY)
static inline void set_compound_head(struct page *page, struct page *head)
{
@@ -410,20 +474,6 @@ static inline void clear_compound_head(struct page *page)
WRITE_ONCE(page->compound_head, 0);
}
-static inline struct page *compound_head(struct page *page)
-{
- unsigned long head = READ_ONCE(page->compound_head);
-
- if (unlikely(head & 1))
- return (struct page *) (head - 1);
- return page;
-}
-
-static inline int PageCompound(struct page *page)
-{
- return PageHead(page) || PageTail(page);
-
-}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline void ClearPageCompound(struct page *page)
{
@@ -484,22 +534,43 @@ static inline int PageTransTail(struct page *page)
return PageTail(page);
}
-#else
-
-static inline int PageTransHuge(struct page *page)
+/*
+ * PageDoubleMap indicates that the compound page is mapped with PTEs as well
+ * as PMDs.
+ *
+ * This is required for optimization of rmap operations for THP: we can postpone
+ * per small page mapcount accounting (and its overhead from atomic operations)
+ * until the first PMD split.
+ *
+ * For the page PageDoubleMap means ->_mapcount in all sub-pages is offset up
+ * by one. This reference will go away with last compound_mapcount.
+ *
+ * See also __split_huge_pmd_locked() and page_remove_anon_compound_rmap().
+ */
+static inline int PageDoubleMap(struct page *page)
{
- return 0;
+ return PageHead(page) && test_bit(PG_double_map, &page[1].flags);
}
-static inline int PageTransCompound(struct page *page)
+static inline int TestSetPageDoubleMap(struct page *page)
{
- return 0;
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+ return test_and_set_bit(PG_double_map, &page[1].flags);
}
-static inline int PageTransTail(struct page *page)
+static inline int TestClearPageDoubleMap(struct page *page)
{
- return 0;
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+ return test_and_clear_bit(PG_double_map, &page[1].flags);
}
+
+#else
+TESTPAGEFLAG_FALSE(TransHuge)
+TESTPAGEFLAG_FALSE(TransCompound)
+TESTPAGEFLAG_FALSE(TransTail)
+TESTPAGEFLAG_FALSE(DoubleMap)
+ TESTSETFLAG_FALSE(DoubleMap)
+ TESTCLEARFLAG_FALSE(DoubleMap)
#endif
/*
@@ -583,12 +654,6 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
#define __PG_MLOCKED 0
#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define __PG_COMPOUND_LOCK (1 << PG_compound_lock)
-#else
-#define __PG_COMPOUND_LOCK 0
-#endif
-
/*
* Flags checked when a page is freed. Pages being freed should not have
* these flags set. It they are, there is a problem.
@@ -598,8 +663,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
1 << PG_private | 1 << PG_private_2 | \
1 << PG_writeback | 1 << PG_reserved | \
1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \
- 1 << PG_unevictable | __PG_MLOCKED | \
- __PG_COMPOUND_LOCK)
+ 1 << PG_unevictable | __PG_MLOCKED)
/*
* Flags checked when a page is prepped for return by the page allocator.
@@ -626,6 +690,10 @@ static inline int page_has_private(struct page *page)
return !!(page->flags & PAGE_FLAGS_PRIVATE);
}
+#undef PF_ANY
+#undef PF_HEAD
+#undef PF_NO_TAIL
+#undef PF_NO_COMPOUND
#endif /* !__GENERATING_BOUNDS_H */
#endif /* PAGE_FLAGS_H */
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 26eabf5ec718..4d08b6c33557 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -394,10 +394,21 @@ static inline struct page *read_mapping_page(struct address_space *mapping,
*/
static inline pgoff_t page_to_pgoff(struct page *page)
{
+ pgoff_t pgoff;
+
if (unlikely(PageHeadHuge(page)))
return page->index << compound_order(page);
- else
+
+ if (likely(!PageTransTail(page)))
return page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+
+ /*
+ * We don't initialize ->index for tail pages: calculate based on
+ * head page
+ */
+ pgoff = compound_head(page)->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ pgoff += page - compound_head(page);
+ return pgoff;
}
/*
@@ -433,18 +444,9 @@ extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
unsigned int flags);
extern void unlock_page(struct page *page);
-static inline void __set_page_locked(struct page *page)
-{
- __set_bit(PG_locked, &page->flags);
-}
-
-static inline void __clear_page_locked(struct page *page)
-{
- __clear_bit(PG_locked, &page->flags);
-}
-
static inline int trylock_page(struct page *page)
{
+ page = compound_head(page);
return (likely(!test_and_set_bit_lock(PG_locked, &page->flags)));
}
@@ -497,9 +499,9 @@ extern int wait_on_page_bit_killable_timeout(struct page *page,
static inline int wait_on_page_locked_killable(struct page *page)
{
- if (PageLocked(page))
- return wait_on_page_bit_killable(page, PG_locked);
- return 0;
+ if (!PageLocked(page))
+ return 0;
+ return wait_on_page_bit_killable(compound_head(page), PG_locked);
}
extern wait_queue_head_t *page_waitqueue(struct page *page);
@@ -518,7 +520,7 @@ static inline void wake_up_page(struct page *page, int bit)
static inline void wait_on_page_locked(struct page *page)
{
if (PageLocked(page))
- wait_on_page_bit(page, PG_locked);
+ wait_on_page_bit(compound_head(page), PG_locked);
}
/*
@@ -664,17 +666,17 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
/*
* Like add_to_page_cache_locked, but used to add newly allocated pages:
- * the page is new, so we can just run __set_page_locked() against it.
+ * the page is new, so we can just run __SetPageLocked() against it.
*/
static inline int add_to_page_cache(struct page *page,
struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
{
int error;
- __set_page_locked(page);
+ __SetPageLocked(page);
error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
if (unlikely(error))
- __clear_page_locked(page);
+ __ClearPageLocked(page);
return error;
}
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 317e16de09e5..deabe23b426d 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -27,11 +27,15 @@
* Magic number "tsta" to indicate a static timer initializer
* for the object debugging code.
*/
-#define TIMER_ENTRY_STATIC ((void *) 0x74737461)
+#define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA)
/********** mm/debug-pagealloc.c **********/
#define PAGE_POISON 0xaa
+/********** mm/page_alloc.c ************/
+
+#define TAIL_MAPPING ((void *) 0x400 + POISON_POINTER_DELTA)
+
/********** mm/slab.c **********/
/*
* Magic nums for obj red zoning.
@@ -73,6 +77,9 @@
#define MUTEX_DEBUG_INIT 0x11
#define MUTEX_DEBUG_FREE 0x22
+/********** lib/dma_debug.c **********/
+#define DMA_ALLOC_POISON 0xee
+
/********** lib/flex_array.c **********/
#define FLEX_ARRAY_FREE 0x6c /* for use-after-free poisoning */
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 29446aeef36e..ebf3750e42b2 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -161,25 +161,31 @@ static inline void anon_vma_merge(struct vm_area_struct *vma,
struct anon_vma *page_get_anon_vma(struct page *page);
+/* bitflags for do_page_add_anon_rmap() */
+#define RMAP_EXCLUSIVE 0x01
+#define RMAP_COMPOUND 0x02
+
/*
* rmap interfaces called when adding or removing pte of page
*/
void page_move_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
-void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
+void page_add_anon_rmap(struct page *, struct vm_area_struct *,
+ unsigned long, bool);
void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long, int);
-void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
+void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
+ unsigned long, bool);
void page_add_file_rmap(struct page *);
-void page_remove_rmap(struct page *);
+void page_remove_rmap(struct page *, bool);
void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long);
void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long);
-static inline void page_dup_rmap(struct page *page)
+static inline void page_dup_rmap(struct page *page, bool compound)
{
- atomic_inc(&page->_mapcount);
+ atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount);
}
/*
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 7c82e3b307a3..96940772bb92 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -158,6 +158,24 @@ size_t ksize(const void *);
#endif
/*
+ * Setting ARCH_SLAB_MINALIGN in arch headers allows a different alignment.
+ * Intended for arches that get misalignment faults even for 64 bit integer
+ * aligned buffers.
+ */
+#ifndef ARCH_SLAB_MINALIGN
+#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
+#endif
+
+/*
+ * kmalloc and friends return ARCH_KMALLOC_MINALIGN aligned
+ * pointers. kmem_cache_alloc and friends return ARCH_SLAB_MINALIGN
+ * aligned pointers.
+ */
+#define __assume_kmalloc_alignment __assume_aligned(ARCH_KMALLOC_MINALIGN)
+#define __assume_slab_alignment __assume_aligned(ARCH_SLAB_MINALIGN)
+#define __assume_page_alignment __assume_aligned(PAGE_SIZE)
+
+/*
* Kmalloc array related definitions
*/
@@ -286,8 +304,8 @@ static __always_inline int kmalloc_index(size_t size)
}
#endif /* !CONFIG_SLOB */
-void *__kmalloc(size_t size, gfp_t flags);
-void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags);
+void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment;
+void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment;
void kmem_cache_free(struct kmem_cache *, void *);
/*
@@ -301,8 +319,8 @@ void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
bool kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
#ifdef CONFIG_NUMA
-void *__kmalloc_node(size_t size, gfp_t flags, int node);
-void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
+void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment;
+void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment;
#else
static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
@@ -316,12 +334,12 @@ static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t f
#endif
#ifdef CONFIG_TRACING
-extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t);
+extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment;
#ifdef CONFIG_NUMA
extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
gfp_t gfpflags,
- int node, size_t size);
+ int node, size_t size) __assume_slab_alignment;
#else
static __always_inline void *
kmem_cache_alloc_node_trace(struct kmem_cache *s,
@@ -354,10 +372,10 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s,
}
#endif /* CONFIG_TRACING */
-extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order);
+extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment;
#ifdef CONFIG_TRACING
-extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order);
+extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment;
#else
static __always_inline void *
kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
@@ -482,15 +500,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
return __kmalloc_node(size, flags, node);
}
-/*
- * Setting ARCH_SLAB_MINALIGN in arch headers allows a different alignment.
- * Intended for arches that get misalignment faults even for 64 bit integer
- * aligned buffers.
- */
-#ifndef ARCH_SLAB_MINALIGN
-#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
-#endif
-
struct memcg_cache_array {
struct rcu_head rcu;
struct kmem_cache *entries[0];
diff --git a/include/linux/string.h b/include/linux/string.h
index 9ef7795e65e4..cc77d6477fc2 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -121,6 +121,7 @@ extern void kfree_const(const void *x);
extern char *kstrdup(const char *s, gfp_t gfp);
extern const char *kstrdup_const(const char *s, gfp_t gfp);
extern char *kstrndup(const char *s, size_t len, gfp_t gfp);
+extern char *kstrimdup(const char *s, gfp_t gfp);
extern void *kmemdup(const void *src, size_t len, gfp_t gfp);
extern char **argv_split(gfp_t gfp, const char *str, int *argcp);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7ba7dccaf0e7..457181844b6e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -154,7 +154,7 @@ enum {
SWP_SCANNING = (1 << 10), /* refcount in scan_swap_map */
};
-#define SWAP_CLUSTER_MAX 32UL
+#define SWAP_CLUSTER_MAX 256UL
#define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
/*
@@ -539,7 +539,8 @@ static inline int swp_swapcount(swp_entry_t entry)
return 0;
}
-#define reuse_swap_page(page) (page_mapcount(page) == 1)
+#define reuse_swap_page(page) \
+ (!PageTransCompound(page) && page_mapcount(page) == 1)
static inline int try_to_free_swap(struct page *page)
{
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index e623d392db0c..e1f8c993e73b 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -68,7 +68,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
THP_FAULT_FALLBACK,
THP_COLLAPSE_ALLOC,
THP_COLLAPSE_ALLOC_FAILED,
- THP_SPLIT,
+ THP_SPLIT_PAGE,
+ THP_SPLIT_PAGE_FAILED,
+ THP_SPLIT_PMD,
THP_ZERO_PAGE_ALLOC,
THP_ZERO_PAGE_ALLOC_FAILED,
#endif
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
new file mode 100644
index 000000000000..11c59ca5e145
--- /dev/null
+++ b/include/trace/events/huge_memory.h
@@ -0,0 +1,166 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM huge_memory
+
+#if !defined(__HUGE_MEMORY_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HUGE_MEMORY_H
+
+#include <linux/tracepoint.h>
+
+#include <trace/events/gfpflags.h>
+
+#define SCAN_STATUS \
+ EM( SCAN_FAIL, "failed") \
+ EM( SCAN_SUCCEED, "succeeded") \
+ EM( SCAN_PMD_NULL, "pmd_null") \
+ EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \
+ EM( SCAN_PTE_NON_PRESENT, "pte_non_present") \
+ EM( SCAN_PAGE_RO, "no_writable_page") \
+ EM( SCAN_NO_REFERENCED_PAGE, "no_referenced_page") \
+ EM( SCAN_PAGE_NULL, "page_null") \
+ EM( SCAN_SCAN_ABORT, "scan_aborted") \
+ EM( SCAN_PAGE_COUNT, "not_suitable_page_count") \
+ EM( SCAN_PAGE_LRU, "page_not_in_lru") \
+ EM( SCAN_PAGE_LOCK, "page_locked") \
+ EM( SCAN_PAGE_ANON, "page_not_anon") \
+ EM( SCAN_PAGE_COMPOUND, "page_compound") \
+ EM( SCAN_ANY_PROCESS, "no_process_for_page") \
+ EM( SCAN_VMA_NULL, "vma_null") \
+ EM( SCAN_VMA_CHECK, "vma_check_failed") \
+ EM( SCAN_ADDRESS_RANGE, "not_suitable_address_range") \
+ EM( SCAN_SWAP_CACHE_PAGE, "page_swap_cache") \
+ EM( SCAN_DEL_PAGE_LRU, "could_not_delete_page_from_lru")\
+ EM( SCAN_ALLOC_HUGE_PAGE_FAIL, "alloc_huge_page_failed") \
+ EM( SCAN_CGROUP_CHARGE_FAIL, "ccgroup_charge_failed") \
+ EMe( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte")
+
+#undef EM
+#undef EMe
+#define EM(a, b) TRACE_DEFINE_ENUM(a);
+#define EMe(a, b) TRACE_DEFINE_ENUM(a);
+
+SCAN_STATUS
+
+#undef EM
+#undef EMe
+#define EM(a, b) {a, b},
+#define EMe(a, b) {a, b}
+
+TRACE_EVENT(mm_khugepaged_scan_pmd,
+
+ TP_PROTO(struct mm_struct *mm, unsigned long pfn, bool writable,
+ bool referenced, int none_or_zero, int status, int unmapped),
+
+ TP_ARGS(mm, pfn, writable, referenced, none_or_zero, status, unmapped),
+
+ TP_STRUCT__entry(
+ __field(struct mm_struct *, mm)
+ __field(unsigned long, pfn)
+ __field(bool, writable)
+ __field(bool, referenced)
+ __field(int, none_or_zero)
+ __field(int, status)
+ __field(int, unmapped)
+ ),
+
+ TP_fast_assign(
+ __entry->mm = mm;
+ __entry->pfn = pfn;
+ __entry->writable = writable;
+ __entry->referenced = referenced;
+ __entry->none_or_zero = none_or_zero;
+ __entry->status = status;
+ __entry->unmapped = unmapped;
+ ),
+
+ TP_printk("mm=%p, scan_pfn=0x%lx, writable=%d, referenced=%d, none_or_zero=%d, status=%s, unmapped=%d",
+ __entry->mm,
+ __entry->pfn,
+ __entry->writable,
+ __entry->referenced,
+ __entry->none_or_zero,
+ __print_symbolic(__entry->status, SCAN_STATUS),
+ __entry->unmapped)
+);
+
+TRACE_EVENT(mm_collapse_huge_page,
+
+ TP_PROTO(struct mm_struct *mm, int isolated, int status),
+
+ TP_ARGS(mm, isolated, status),
+
+ TP_STRUCT__entry(
+ __field(struct mm_struct *, mm)
+ __field(int, isolated)
+ __field(int, status)
+ ),
+
+ TP_fast_assign(
+ __entry->mm = mm;
+ __entry->isolated = isolated;
+ __entry->status = status;
+ ),
+
+ TP_printk("mm=%p, isolated=%d, status=%s",
+ __entry->mm,
+ __entry->isolated,
+ __print_symbolic(__entry->status, SCAN_STATUS))
+);
+
+TRACE_EVENT(mm_collapse_huge_page_isolate,
+
+ TP_PROTO(unsigned long pfn, int none_or_zero,
+ bool referenced, bool writable, int status),
+
+ TP_ARGS(pfn, none_or_zero, referenced, writable, status),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, pfn)
+ __field(int, none_or_zero)
+ __field(bool, referenced)
+ __field(bool, writable)
+ __field(int, status)
+ ),
+
+ TP_fast_assign(
+ __entry->pfn = pfn;
+ __entry->none_or_zero = none_or_zero;
+ __entry->referenced = referenced;
+ __entry->writable = writable;
+ __entry->status = status;
+ ),
+
+ TP_printk("scan_pfn=0x%lx, none_or_zero=%d, referenced=%d, writable=%d, status=%s",
+ __entry->pfn,
+ __entry->none_or_zero,
+ __entry->referenced,
+ __entry->writable,
+ __print_symbolic(__entry->status, SCAN_STATUS))
+);
+
+TRACE_EVENT(mm_collapse_huge_page_swapin,
+
+ TP_PROTO(struct mm_struct *mm, int swapped_in, int ret),
+
+ TP_ARGS(mm, swapped_in, ret),
+
+ TP_STRUCT__entry(
+ __field(struct mm_struct *, mm)
+ __field(int, swapped_in)
+ __field(int, ret)
+ ),
+
+ TP_fast_assign(
+ __entry->mm = mm;
+ __entry->swapped_in = swapped_in;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("mm=%p, swapped_in=%d, ret=%d",
+ __entry->mm,
+ __entry->swapped_in,
+ __entry->ret)
+);
+
+#endif /* __HUGE_MEMORY_H */
+#include <trace/define_trace.h>
+
diff --git a/ipc/msg.c b/ipc/msg.c
index 1471db9a7e61..59559a215401 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -37,6 +37,7 @@
#include <linux/rwsem.h>
#include <linux/nsproxy.h>
#include <linux/ipc_namespace.h>
+#include <linux/freezer.h>
#include <asm/current.h>
#include <linux/uaccess.h>
@@ -675,7 +676,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
ipc_unlock_object(&msq->q_perm);
rcu_read_unlock();
- schedule();
+ freezable_schedule();
rcu_read_lock();
ipc_lock_object(&msq->q_perm);
@@ -917,7 +918,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl
ipc_unlock_object(&msq->q_perm);
rcu_read_unlock();
- schedule();
+ freezable_schedule();
/* Lockless receive, part 1:
* Disable preemption. We don't hold a reference to the queue
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 4e5e9798aa0c..51373997f479 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -161,7 +161,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
const unsigned long mmun_end = addr + PAGE_SIZE;
struct mem_cgroup *memcg;
- err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg);
+ err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg,
+ false);
if (err)
return err;
@@ -175,8 +176,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
goto unlock;
get_page(kpage);
- page_add_new_anon_rmap(kpage, vma, addr);
- mem_cgroup_commit_charge(kpage, memcg, false);
+ page_add_new_anon_rmap(kpage, vma, addr, false);
+ mem_cgroup_commit_charge(kpage, memcg, false, false);
lru_cache_add_active_or_unevictable(kpage, vma);
if (!PageAnon(page)) {
@@ -188,7 +189,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
ptep_clear_flush_notify(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
- page_remove_rmap(page);
+ page_remove_rmap(page, false);
if (!page_mapped(page))
try_to_free_swap(page);
pte_unmap_unlock(ptep, ptl);
@@ -199,7 +200,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
err = 0;
unlock:
- mem_cgroup_cancel_charge(kpage, memcg);
+ mem_cgroup_cancel_charge(kpage, memcg, false);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
unlock_page(page);
return err;
diff --git a/kernel/futex.c b/kernel/futex.c
index 684d7549825a..b29add22c454 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -469,7 +469,8 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
- struct page *page, *page_head;
+ struct page *page;
+ struct address_space *mapping;
int err, ro = 0;
/*
@@ -519,46 +520,9 @@ again:
else
err = 0;
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- page_head = page;
- if (unlikely(PageTail(page))) {
- put_page(page);
- /* serialize against __split_huge_page_splitting() */
- local_irq_disable();
- if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
- page_head = compound_head(page);
- /*
- * page_head is valid pointer but we must pin
- * it before taking the PG_lock and/or
- * PG_compound_lock. The moment we re-enable
- * irqs __split_huge_page_splitting() can
- * return and the head page can be freed from
- * under us. We can't take the PG_lock and/or
- * PG_compound_lock on a page that could be
- * freed from under us.
- */
- if (page != page_head) {
- get_page(page_head);
- put_page(page);
- }
- local_irq_enable();
- } else {
- local_irq_enable();
- goto again;
- }
- }
-#else
- page_head = compound_head(page);
- if (page != page_head) {
- get_page(page_head);
- put_page(page);
- }
-#endif
-
- lock_page(page_head);
-
+ lock_page(page);
/*
- * If page_head->mapping is NULL, then it cannot be a PageAnon
+ * If page->mapping is NULL, then it cannot be a PageAnon
* page; but it might be the ZERO_PAGE or in the gate area or
* in a special mapping (all cases which we are happy to fail);
* or it may have been a good file page when get_user_pages_fast
@@ -570,12 +534,13 @@ again:
*
* The case we do have to guard against is when memory pressure made
* shmem_writepage move it from filecache to swapcache beneath us:
- * an unlikely race, but we do need to retry for page_head->mapping.
+ * an unlikely race, but we do need to retry for page->mapping.
*/
- if (!page_head->mapping) {
- int shmem_swizzled = PageSwapCache(page_head);
- unlock_page(page_head);
- put_page(page_head);
+ mapping = compound_head(page)->mapping;
+ if (!mapping) {
+ int shmem_swizzled = PageSwapCache(page);
+ unlock_page(page);
+ put_page(page);
if (shmem_swizzled)
goto again;
return -EFAULT;
@@ -588,7 +553,7 @@ again:
* it's a read-only handle, it's expected that futexes attach to
* the object not the particular process.
*/
- if (PageAnon(page_head)) {
+ if (PageAnon(page)) {
/*
* A RO anonymous page will never change and thus doesn't make
* sense for futex operations.
@@ -603,15 +568,15 @@ again:
key->private.address = address;
} else {
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
- key->shared.inode = page_head->mapping->host;
+ key->shared.inode = mapping->host;
key->shared.pgoff = basepage_index(page);
}
get_futex_key_refs(key); /* implies MB (B) */
out:
- unlock_page(page_head);
- put_page(page_head);
+ unlock_page(page);
+ put_page(page);
return err;
}
diff --git a/lib/Kconfig b/lib/Kconfig
index f0df318104e7..f7e64c7748cb 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -185,6 +185,13 @@ config CRC8
when they need to do cyclic redundancy check according CRC8
algorithm. Module will be called crc8.
+config CRC64_ECMA
+ tristate "CRC64 ECMA function"
+ help
+ This option provides CRC64 ECMA function. Drivers may select this
+ when they need to do cyclic redundancy check according to the CRC64
+ ECMA algorithm.
+
config AUDIT_GENERIC
bool
depends on AUDIT && !AUDIT_ARCH
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 749e886a94b6..526105c18566 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -580,6 +580,14 @@ config DEBUG_VM_RB
If unsure, say N.
+config DEBUG_VM_PGFLAGS
+ bool "Debug page-flags operations"
+ depends on DEBUG_VM
+ help
+ Enables extra validation on page flags operations.
+
+ If unsure, say N.
+
config DEBUG_VIRTUAL
bool "Debug VM translations"
depends on DEBUG_KERNEL && X86
@@ -1765,6 +1773,16 @@ config DMA_API_DEBUG
If unsure, say N.
+config DMA_API_DEBUG_POISON
+ bool "Poison coherent DMA buffers"
+ depends on DMA_API_DEBUG && EXPERT
+ help
+ Poison DMA buffers returned by dma_alloc_coherent unless __GFP_ZERO
+ is explicitly specified, to catch drivers depending on zeroed buffers
+ without passing the correct flags.
+
+ Only say Y if you're prepared for almost everything to break.
+
config TEST_LKM
tristate "Test module loading with 'hello world' module"
default n
diff --git a/lib/Makefile b/lib/Makefile
index d82cb5bac5bc..5f4a9e8c1779 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -84,6 +84,7 @@ obj-$(CONFIG_CRC32) += crc32.o
obj-$(CONFIG_CRC7) += crc7.o
obj-$(CONFIG_LIBCRC32C) += libcrc32c.o
obj-$(CONFIG_CRC8) += crc8.o
+obj-$(CONFIG_CRC64_ECMA) += crc64_ecma.o
obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
obj-$(CONFIG_842_COMPRESS) += 842/
diff --git a/lib/crc64_ecma.c b/lib/crc64_ecma.c
new file mode 100644
index 000000000000..41629ea5a60c
--- /dev/null
+++ b/lib/crc64_ecma.c
@@ -0,0 +1,341 @@
+/*
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <linux/crc64_ecma.h>
+
+
+#define CRC64_BYTE_MASK 0xFF
+#define CRC64_TABLE_SIZE 256
+
+
+struct crc64_table {
+ u64 seed;
+ u64 table[CRC64_TABLE_SIZE];
+};
+
+
+static struct crc64_table CRC64_ECMA_182 = {
+ CRC64_DEFAULT_INITVAL,
+ {
+ 0x0000000000000000ULL,
+ 0xb32e4cbe03a75f6fULL,
+ 0xf4843657a840a05bULL,
+ 0x47aa7ae9abe7ff34ULL,
+ 0x7bd0c384ff8f5e33ULL,
+ 0xc8fe8f3afc28015cULL,
+ 0x8f54f5d357cffe68ULL,
+ 0x3c7ab96d5468a107ULL,
+ 0xf7a18709ff1ebc66ULL,
+ 0x448fcbb7fcb9e309ULL,
+ 0x0325b15e575e1c3dULL,
+ 0xb00bfde054f94352ULL,
+ 0x8c71448d0091e255ULL,
+ 0x3f5f08330336bd3aULL,
+ 0x78f572daa8d1420eULL,
+ 0xcbdb3e64ab761d61ULL,
+ 0x7d9ba13851336649ULL,
+ 0xceb5ed8652943926ULL,
+ 0x891f976ff973c612ULL,
+ 0x3a31dbd1fad4997dULL,
+ 0x064b62bcaebc387aULL,
+ 0xb5652e02ad1b6715ULL,
+ 0xf2cf54eb06fc9821ULL,
+ 0x41e11855055bc74eULL,
+ 0x8a3a2631ae2dda2fULL,
+ 0x39146a8fad8a8540ULL,
+ 0x7ebe1066066d7a74ULL,
+ 0xcd905cd805ca251bULL,
+ 0xf1eae5b551a2841cULL,
+ 0x42c4a90b5205db73ULL,
+ 0x056ed3e2f9e22447ULL,
+ 0xb6409f5cfa457b28ULL,
+ 0xfb374270a266cc92ULL,
+ 0x48190ecea1c193fdULL,
+ 0x0fb374270a266cc9ULL,
+ 0xbc9d3899098133a6ULL,
+ 0x80e781f45de992a1ULL,
+ 0x33c9cd4a5e4ecdceULL,
+ 0x7463b7a3f5a932faULL,
+ 0xc74dfb1df60e6d95ULL,
+ 0x0c96c5795d7870f4ULL,
+ 0xbfb889c75edf2f9bULL,
+ 0xf812f32ef538d0afULL,
+ 0x4b3cbf90f69f8fc0ULL,
+ 0x774606fda2f72ec7ULL,
+ 0xc4684a43a15071a8ULL,
+ 0x83c230aa0ab78e9cULL,
+ 0x30ec7c140910d1f3ULL,
+ 0x86ace348f355aadbULL,
+ 0x3582aff6f0f2f5b4ULL,
+ 0x7228d51f5b150a80ULL,
+ 0xc10699a158b255efULL,
+ 0xfd7c20cc0cdaf4e8ULL,
+ 0x4e526c720f7dab87ULL,
+ 0x09f8169ba49a54b3ULL,
+ 0xbad65a25a73d0bdcULL,
+ 0x710d64410c4b16bdULL,
+ 0xc22328ff0fec49d2ULL,
+ 0x85895216a40bb6e6ULL,
+ 0x36a71ea8a7ace989ULL,
+ 0x0adda7c5f3c4488eULL,
+ 0xb9f3eb7bf06317e1ULL,
+ 0xfe5991925b84e8d5ULL,
+ 0x4d77dd2c5823b7baULL,
+ 0x64b62bcaebc387a1ULL,
+ 0xd7986774e864d8ceULL,
+ 0x90321d9d438327faULL,
+ 0x231c512340247895ULL,
+ 0x1f66e84e144cd992ULL,
+ 0xac48a4f017eb86fdULL,
+ 0xebe2de19bc0c79c9ULL,
+ 0x58cc92a7bfab26a6ULL,
+ 0x9317acc314dd3bc7ULL,
+ 0x2039e07d177a64a8ULL,
+ 0x67939a94bc9d9b9cULL,
+ 0xd4bdd62abf3ac4f3ULL,
+ 0xe8c76f47eb5265f4ULL,
+ 0x5be923f9e8f53a9bULL,
+ 0x1c4359104312c5afULL,
+ 0xaf6d15ae40b59ac0ULL,
+ 0x192d8af2baf0e1e8ULL,
+ 0xaa03c64cb957be87ULL,
+ 0xeda9bca512b041b3ULL,
+ 0x5e87f01b11171edcULL,
+ 0x62fd4976457fbfdbULL,
+ 0xd1d305c846d8e0b4ULL,
+ 0x96797f21ed3f1f80ULL,
+ 0x2557339fee9840efULL,
+ 0xee8c0dfb45ee5d8eULL,
+ 0x5da24145464902e1ULL,
+ 0x1a083bacedaefdd5ULL,
+ 0xa9267712ee09a2baULL,
+ 0x955cce7fba6103bdULL,
+ 0x267282c1b9c65cd2ULL,
+ 0x61d8f8281221a3e6ULL,
+ 0xd2f6b4961186fc89ULL,
+ 0x9f8169ba49a54b33ULL,
+ 0x2caf25044a02145cULL,
+ 0x6b055fede1e5eb68ULL,
+ 0xd82b1353e242b407ULL,
+ 0xe451aa3eb62a1500ULL,
+ 0x577fe680b58d4a6fULL,
+ 0x10d59c691e6ab55bULL,
+ 0xa3fbd0d71dcdea34ULL,
+ 0x6820eeb3b6bbf755ULL,
+ 0xdb0ea20db51ca83aULL,
+ 0x9ca4d8e41efb570eULL,
+ 0x2f8a945a1d5c0861ULL,
+ 0x13f02d374934a966ULL,
+ 0xa0de61894a93f609ULL,
+ 0xe7741b60e174093dULL,
+ 0x545a57dee2d35652ULL,
+ 0xe21ac88218962d7aULL,
+ 0x5134843c1b317215ULL,
+ 0x169efed5b0d68d21ULL,
+ 0xa5b0b26bb371d24eULL,
+ 0x99ca0b06e7197349ULL,
+ 0x2ae447b8e4be2c26ULL,
+ 0x6d4e3d514f59d312ULL,
+ 0xde6071ef4cfe8c7dULL,
+ 0x15bb4f8be788911cULL,
+ 0xa6950335e42fce73ULL,
+ 0xe13f79dc4fc83147ULL,
+ 0x521135624c6f6e28ULL,
+ 0x6e6b8c0f1807cf2fULL,
+ 0xdd45c0b11ba09040ULL,
+ 0x9aefba58b0476f74ULL,
+ 0x29c1f6e6b3e0301bULL,
+ 0xc96c5795d7870f42ULL,
+ 0x7a421b2bd420502dULL,
+ 0x3de861c27fc7af19ULL,
+ 0x8ec62d7c7c60f076ULL,
+ 0xb2bc941128085171ULL,
+ 0x0192d8af2baf0e1eULL,
+ 0x4638a2468048f12aULL,
+ 0xf516eef883efae45ULL,
+ 0x3ecdd09c2899b324ULL,
+ 0x8de39c222b3eec4bULL,
+ 0xca49e6cb80d9137fULL,
+ 0x7967aa75837e4c10ULL,
+ 0x451d1318d716ed17ULL,
+ 0xf6335fa6d4b1b278ULL,
+ 0xb199254f7f564d4cULL,
+ 0x02b769f17cf11223ULL,
+ 0xb4f7f6ad86b4690bULL,
+ 0x07d9ba1385133664ULL,
+ 0x4073c0fa2ef4c950ULL,
+ 0xf35d8c442d53963fULL,
+ 0xcf273529793b3738ULL,
+ 0x7c0979977a9c6857ULL,
+ 0x3ba3037ed17b9763ULL,
+ 0x888d4fc0d2dcc80cULL,
+ 0x435671a479aad56dULL,
+ 0xf0783d1a7a0d8a02ULL,
+ 0xb7d247f3d1ea7536ULL,
+ 0x04fc0b4dd24d2a59ULL,
+ 0x3886b22086258b5eULL,
+ 0x8ba8fe9e8582d431ULL,
+ 0xcc0284772e652b05ULL,
+ 0x7f2cc8c92dc2746aULL,
+ 0x325b15e575e1c3d0ULL,
+ 0x8175595b76469cbfULL,
+ 0xc6df23b2dda1638bULL,
+ 0x75f16f0cde063ce4ULL,
+ 0x498bd6618a6e9de3ULL,
+ 0xfaa59adf89c9c28cULL,
+ 0xbd0fe036222e3db8ULL,
+ 0x0e21ac88218962d7ULL,
+ 0xc5fa92ec8aff7fb6ULL,
+ 0x76d4de52895820d9ULL,
+ 0x317ea4bb22bfdfedULL,
+ 0x8250e80521188082ULL,
+ 0xbe2a516875702185ULL,
+ 0x0d041dd676d77eeaULL,
+ 0x4aae673fdd3081deULL,
+ 0xf9802b81de97deb1ULL,
+ 0x4fc0b4dd24d2a599ULL,
+ 0xfceef8632775faf6ULL,
+ 0xbb44828a8c9205c2ULL,
+ 0x086ace348f355aadULL,
+ 0x34107759db5dfbaaULL,
+ 0x873e3be7d8faa4c5ULL,
+ 0xc094410e731d5bf1ULL,
+ 0x73ba0db070ba049eULL,
+ 0xb86133d4dbcc19ffULL,
+ 0x0b4f7f6ad86b4690ULL,
+ 0x4ce50583738cb9a4ULL,
+ 0xffcb493d702be6cbULL,
+ 0xc3b1f050244347ccULL,
+ 0x709fbcee27e418a3ULL,
+ 0x3735c6078c03e797ULL,
+ 0x841b8ab98fa4b8f8ULL,
+ 0xadda7c5f3c4488e3ULL,
+ 0x1ef430e13fe3d78cULL,
+ 0x595e4a08940428b8ULL,
+ 0xea7006b697a377d7ULL,
+ 0xd60abfdbc3cbd6d0ULL,
+ 0x6524f365c06c89bfULL,
+ 0x228e898c6b8b768bULL,
+ 0x91a0c532682c29e4ULL,
+ 0x5a7bfb56c35a3485ULL,
+ 0xe955b7e8c0fd6beaULL,
+ 0xaeffcd016b1a94deULL,
+ 0x1dd181bf68bdcbb1ULL,
+ 0x21ab38d23cd56ab6ULL,
+ 0x9285746c3f7235d9ULL,
+ 0xd52f0e859495caedULL,
+ 0x6601423b97329582ULL,
+ 0xd041dd676d77eeaaULL,
+ 0x636f91d96ed0b1c5ULL,
+ 0x24c5eb30c5374ef1ULL,
+ 0x97eba78ec690119eULL,
+ 0xab911ee392f8b099ULL,
+ 0x18bf525d915feff6ULL,
+ 0x5f1528b43ab810c2ULL,
+ 0xec3b640a391f4fadULL,
+ 0x27e05a6e926952ccULL,
+ 0x94ce16d091ce0da3ULL,
+ 0xd3646c393a29f297ULL,
+ 0x604a2087398eadf8ULL,
+ 0x5c3099ea6de60cffULL,
+ 0xef1ed5546e415390ULL,
+ 0xa8b4afbdc5a6aca4ULL,
+ 0x1b9ae303c601f3cbULL,
+ 0x56ed3e2f9e224471ULL,
+ 0xe5c372919d851b1eULL,
+ 0xa26908783662e42aULL,
+ 0x114744c635c5bb45ULL,
+ 0x2d3dfdab61ad1a42ULL,
+ 0x9e13b115620a452dULL,
+ 0xd9b9cbfcc9edba19ULL,
+ 0x6a978742ca4ae576ULL,
+ 0xa14cb926613cf817ULL,
+ 0x1262f598629ba778ULL,
+ 0x55c88f71c97c584cULL,
+ 0xe6e6c3cfcadb0723ULL,
+ 0xda9c7aa29eb3a624ULL,
+ 0x69b2361c9d14f94bULL,
+ 0x2e184cf536f3067fULL,
+ 0x9d36004b35545910ULL,
+ 0x2b769f17cf112238ULL,
+ 0x9858d3a9ccb67d57ULL,
+ 0xdff2a94067518263ULL,
+ 0x6cdce5fe64f6dd0cULL,
+ 0x50a65c93309e7c0bULL,
+ 0xe388102d33392364ULL,
+ 0xa4226ac498dedc50ULL,
+ 0x170c267a9b79833fULL,
+ 0xdcd7181e300f9e5eULL,
+ 0x6ff954a033a8c131ULL,
+ 0x28532e49984f3e05ULL,
+ 0x9b7d62f79be8616aULL,
+ 0xa707db9acf80c06dULL,
+ 0x14299724cc279f02ULL,
+ 0x5383edcd67c06036ULL,
+ 0xe0ada17364673f59ULL
+ }
+};
+
+
+/*
+ * crc64_ecma_seed - Initializes the CRC64 ECMA seed.
+ */
+u64 crc64_ecma_seed(void)
+{
+ return CRC64_ECMA_182.seed;
+}
+EXPORT_SYMBOL(crc64_ecma_seed);
+
+/*
+ * crc64_ecma - Computes the 64 bit ECMA CRC.
+ *
+ * pdata: pointer to the data to compute checksum for.
+ * nbytes: number of bytes in data buffer.
+ * seed: CRC seed.
+ */
+u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed)
+{
+ unsigned int i;
+ u64 crc = seed;
+
+ for (i = 0; i < nbytes; i++)
+ crc = CRC64_ECMA_182.table[(crc ^ pdata[i]) & CRC64_BYTE_MASK] ^
+ (crc >> 8);
+
+ return crc;
+}
+EXPORT_SYMBOL(crc64_ecma);
+
+MODULE_DESCRIPTION("CRC64 ECMA function");
+MODULE_AUTHOR("Freescale Semiconductor Inc.");
+MODULE_LICENSE("GPL");
diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index 8855f019ebe8..af6262b4e02c 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -30,6 +30,7 @@
#include <linux/sched.h>
#include <linux/ctype.h>
#include <linux/list.h>
+#include <linux/poison.h>
#include <linux/slab.h>
#include <asm/sections.h>
@@ -1447,7 +1448,7 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
EXPORT_SYMBOL(debug_dma_unmap_sg);
void debug_dma_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t dma_addr, void *virt)
+ dma_addr_t dma_addr, void *virt, gfp_t flags)
{
struct dma_debug_entry *entry;
@@ -1457,6 +1458,9 @@ void debug_dma_alloc_coherent(struct device *dev, size_t size,
if (unlikely(virt == NULL))
return;
+ if (IS_ENABLED(CONFIG_DMA_API_DEBUG_POISON) && !(flags & __GFP_ZERO))
+ memset(virt, DMA_ALLOC_POISON, size);
+
entry = dma_entry_alloc();
if (!entry)
return;
diff --git a/lib/string_helpers.c b/lib/string_helpers.c
index 5939f63d90cd..5c88204b6f1f 100644
--- a/lib/string_helpers.c
+++ b/lib/string_helpers.c
@@ -43,50 +43,73 @@ void string_get_size(u64 size, u64 blk_size, const enum string_size_units units,
[STRING_UNITS_10] = 1000,
[STRING_UNITS_2] = 1024,
};
- int i, j;
- u32 remainder = 0, sf_cap, exp;
+ static const unsigned int rounding[] = { 500, 50, 5 };
+ int i = 0, j;
+ u32 remainder = 0, sf_cap;
char tmp[8];
const char *unit;
tmp[0] = '\0';
- i = 0;
- if (!size)
+
+ if (blk_size == 0)
+ size = 0;
+ if (size == 0)
goto out;
- while (blk_size >= divisor[units]) {
- remainder = do_div(blk_size, divisor[units]);
+ /* This is Napier's algorithm. Reduce the original block size to
+ *
+ * coefficient * divisor[units]^i
+ *
+ * we do the reduction so both coefficients are just under 32 bits so
+ * that multiplying them together won't overflow 64 bits and we keep
+ * as much precision as possible in the numbers.
+ *
+ * Note: it's safe to throw away the remainders here because all the
+ * precision is in the coefficients.
+ */
+ while (blk_size >> 32) {
+ do_div(blk_size, divisor[units]);
i++;
}
- exp = divisor[units] / (u32)blk_size;
- /*
- * size must be strictly greater than exp here to ensure that remainder
- * is greater than divisor[units] coming out of the if below.
- */
- if (size > exp) {
- remainder = do_div(size, divisor[units]);
- remainder *= blk_size;
+ while (size >> 32) {
+ do_div(size, divisor[units]);
i++;
- } else {
- remainder *= size;
}
+ /* now perform the actual multiplication keeping i as the sum of the
+ * two logarithms */
size *= blk_size;
- size += remainder / divisor[units];
- remainder %= divisor[units];
+ /* and logarithmically reduce it until it's just under the divisor */
while (size >= divisor[units]) {
remainder = do_div(size, divisor[units]);
i++;
}
+ /* work out in j how many digits of precision we need from the
+ * remainder */
sf_cap = size;
for (j = 0; sf_cap*10 < 1000; j++)
sf_cap *= 10;
- if (j) {
+ if (units == STRING_UNITS_2) {
+ /* express the remainder as a decimal. It's currently the
+ * numerator of a fraction whose denominator is
+ * divisor[units], which is 1 << 10 for STRING_UNITS_2 */
remainder *= 1000;
- remainder /= divisor[units];
+ remainder >>= 10;
+ }
+
+ /* add a 5 to the digit below what will be printed to ensure
+ * an arithmetical round up and carry it through to size */
+ remainder += rounding[j];
+ if (remainder >= 1000) {
+ remainder -= 1000;
+ size += 1;
+ }
+
+ if (j) {
snprintf(tmp, sizeof(tmp), ".%03u", remainder);
tmp[j+1] = '\0';
}
diff --git a/mm/debug.c b/mm/debug.c
index 668aa35191ca..836276586185 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -40,9 +40,6 @@ static const struct trace_print_flags pageflag_names[] = {
#ifdef CONFIG_MEMORY_FAILURE
{1UL << PG_hwpoison, "hwpoison" },
#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- {1UL << PG_compound_lock, "compound_lock" },
-#endif
#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
{1UL << PG_young, "young" },
{1UL << PG_idle, "idle" },
@@ -82,9 +79,12 @@ static void dump_flags(unsigned long flags,
void dump_page_badflags(struct page *page, const char *reason,
unsigned long badflags)
{
- pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
+ pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx",
page, atomic_read(&page->_count), page_mapcount(page),
page->mapping, page->index);
+ if (PageCompound(page))
+ pr_cont(" compound_mapcount: %d", compound_mapcount(page));
+ pr_cont("\n");
BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names));
if (reason)
diff --git a/mm/filemap.c b/mm/filemap.c
index 1bb007624b53..834cd1425307 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -204,7 +204,7 @@ void __delete_from_page_cache(struct page *page, void *shadow,
__dec_zone_page_state(page, NR_FILE_PAGES);
if (PageSwapBacked(page))
__dec_zone_page_state(page, NR_SHMEM);
- BUG_ON(page_mapped(page));
+ VM_BUG_ON_PAGE(page_mapped(page), page);
/*
* At this point page must be either written or cleaned by truncate.
@@ -618,7 +618,7 @@ static int __add_to_page_cache_locked(struct page *page,
if (!huge) {
error = mem_cgroup_try_charge(page, current->mm,
- gfp_mask, &memcg);
+ gfp_mask, &memcg, false);
if (error)
return error;
}
@@ -626,7 +626,7 @@ static int __add_to_page_cache_locked(struct page *page,
error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
if (error) {
if (!huge)
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
return error;
}
@@ -645,7 +645,7 @@ static int __add_to_page_cache_locked(struct page *page,
__inc_zone_page_state(page, NR_FILE_PAGES);
spin_unlock_irq(&mapping->tree_lock);
if (!huge)
- mem_cgroup_commit_charge(page, memcg, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
trace_mm_filemap_add_to_page_cache(page);
return 0;
err_insert:
@@ -653,7 +653,7 @@ err_insert:
/* Leave page->index set: truncation relies upon it */
spin_unlock_irq(&mapping->tree_lock);
if (!huge)
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
page_cache_release(page);
return error;
}
@@ -682,11 +682,11 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
void *shadow = NULL;
int ret;
- __set_page_locked(page);
+ __SetPageLocked(page);
ret = __add_to_page_cache_locked(page, mapping, offset,
gfp_mask, &shadow);
if (unlikely(ret))
- __clear_page_locked(page);
+ __ClearPageLocked(page);
else {
/*
* The page might have been evicted from cache only
@@ -809,6 +809,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue);
*/
void unlock_page(struct page *page)
{
+ page = compound_head(page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
clear_bit_unlock(PG_locked, &page->flags);
smp_mb__after_atomic();
@@ -873,18 +874,20 @@ EXPORT_SYMBOL_GPL(page_endio);
*/
void __lock_page(struct page *page)
{
- DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+ struct page *page_head = compound_head(page);
+ DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
- __wait_on_bit_lock(page_waitqueue(page), &wait, bit_wait_io,
+ __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io,
TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_page);
int __lock_page_killable(struct page *page)
{
- DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+ struct page *page_head = compound_head(page);
+ DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
- return __wait_on_bit_lock(page_waitqueue(page), &wait,
+ return __wait_on_bit_lock(page_waitqueue(page_head), &wait,
bit_wait_io, TASK_KILLABLE);
}
EXPORT_SYMBOL_GPL(__lock_page_killable);
diff --git a/mm/gup.c b/mm/gup.c
index deafa2c91b36..e95b0cb6ed81 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -116,8 +116,21 @@ retry:
}
}
+ if (flags & FOLL_SPLIT && PageTransCompound(page)) {
+ int ret;
+ get_page(page);
+ pte_unmap_unlock(ptep, ptl);
+ lock_page(page);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ if (ret)
+ return ERR_PTR(ret);
+ goto retry;
+ }
+
if (flags & FOLL_GET)
- get_page_foll(page);
+ get_page(page);
if (flags & FOLL_TOUCH) {
if ((flags & FOLL_WRITE) &&
!pte_dirty(pte) && !PageDirty(page))
@@ -130,6 +143,10 @@ retry:
mark_page_accessed(page);
}
if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+ /* Do not mlock pte-mapped THP */
+ if (PageTransCompound(page))
+ goto out;
+
/*
* The preliminary mapping check is mainly to avoid the
* pointless overhead of lock_page on the ZERO_PAGE
@@ -220,27 +237,38 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
}
if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
return no_page_table(vma, flags);
- if (pmd_trans_huge(*pmd)) {
- if (flags & FOLL_SPLIT) {
- split_huge_page_pmd(vma, address, pmd);
- return follow_page_pte(vma, address, pmd, flags);
- }
- ptl = pmd_lock(mm, pmd);
- if (likely(pmd_trans_huge(*pmd))) {
- if (unlikely(pmd_trans_splitting(*pmd))) {
- spin_unlock(ptl);
- wait_split_huge_page(vma->anon_vma, pmd);
- } else {
- page = follow_trans_huge_pmd(vma, address,
- pmd, flags);
- spin_unlock(ptl);
- *page_mask = HPAGE_PMD_NR - 1;
- return page;
- }
- } else
+ if (likely(!pmd_trans_huge(*pmd)))
+ return follow_page_pte(vma, address, pmd, flags);
+
+ ptl = pmd_lock(mm, pmd);
+ if (unlikely(!pmd_trans_huge(*pmd))) {
+ spin_unlock(ptl);
+ return follow_page_pte(vma, address, pmd, flags);
+ }
+ if (flags & FOLL_SPLIT) {
+ int ret;
+ page = pmd_page(*pmd);
+ if (is_huge_zero_page(page)) {
+ spin_unlock(ptl);
+ ret = 0;
+ split_huge_pmd(vma, pmd, address);
+ } else {
+ get_page(page);
spin_unlock(ptl);
+ lock_page(page);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ }
+
+ return ret ? ERR_PTR(ret) :
+ follow_page_pte(vma, address, pmd, flags);
}
- return follow_page_pte(vma, address, pmd, flags);
+
+ page = follow_trans_huge_pmd(vma, address, pmd, flags);
+ spin_unlock(ptl);
+ *page_mask = HPAGE_PMD_NR - 1;
+ return page;
}
static int get_gate_page(struct mm_struct *mm, unsigned long address,
@@ -896,7 +924,6 @@ long populate_vma_page_range(struct vm_area_struct *vma,
gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
if (vma->vm_flags & VM_LOCKONFAULT)
gup_flags &= ~FOLL_POPULATE;
-
/*
* We want to touch writable mappings with a write fault in order
* to break COW, except for shared mappings because these don't COW
@@ -1036,9 +1063,6 @@ struct page *get_dump_page(unsigned long addr)
* *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
* pages containing page tables.
*
- * *) THP splits will broadcast an IPI, this can be achieved by overriding
- * pmdp_splitting_flush.
- *
* *) ptes can be read atomically by the architecture.
*
* *) access_ok is sufficient to validate userspace address ranges.
@@ -1066,7 +1090,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
* for an example see gup_get_pte in arch/x86/mm/gup.c
*/
pte_t pte = READ_ONCE(*ptep);
- struct page *page;
+ struct page *head, *page;
/*
* Similar to the PMD case below, NUMA hinting must take slow
@@ -1078,15 +1102,17 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
+ head = compound_head(page);
- if (!page_cache_get_speculative(page))
+ if (!page_cache_get_speculative(head))
goto pte_unmap;
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
- put_page(page);
+ put_page(head);
goto pte_unmap;
}
+ VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
(*nr)++;
@@ -1119,7 +1145,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
- struct page *head, *page, *tail;
+ struct page *head, *page;
int refs;
if (write && !pmd_write(orig))
@@ -1128,7 +1154,6 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
refs = 0;
head = pmd_page(orig);
page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- tail = page;
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
@@ -1149,24 +1174,13 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
return 0;
}
- /*
- * Any tail pages need their mapcount reference taken before we
- * return. (This allows the THP code to bump their ref count when
- * they are split into base pages).
- */
- while (refs--) {
- if (PageTail(tail))
- get_huge_page_tail(tail);
- tail++;
- }
-
return 1;
}
static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
- struct page *head, *page, *tail;
+ struct page *head, *page;
int refs;
if (write && !pud_write(orig))
@@ -1175,7 +1189,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
refs = 0;
head = pud_page(orig);
page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- tail = page;
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
@@ -1196,12 +1209,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
return 0;
}
- while (refs--) {
- if (PageTail(tail))
- get_huge_page_tail(tail);
- tail++;
- }
-
return 1;
}
@@ -1210,7 +1217,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
struct page **pages, int *nr)
{
int refs;
- struct page *head, *page, *tail;
+ struct page *head, *page;
if (write && !pgd_write(orig))
return 0;
@@ -1218,7 +1225,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
refs = 0;
head = pgd_page(orig);
page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
- tail = page;
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
@@ -1239,12 +1245,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
return 0;
}
- while (refs--) {
- if (PageTail(tail))
- get_huge_page_tail(tail);
- tail++;
- }
-
return 1;
}
@@ -1259,7 +1259,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
pmd_t pmd = READ_ONCE(*pmdp);
next = pmd_addr_end(addr, end);
- if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+ if (pmd_none(pmd))
return 0;
if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c29ddebc8705..6834b39a7114 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -26,11 +26,41 @@
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
+#include <linux/swapops.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
+enum scan_result {
+ SCAN_FAIL,
+ SCAN_SUCCEED,
+ SCAN_PMD_NULL,
+ SCAN_EXCEED_NONE_PTE,
+ SCAN_PTE_NON_PRESENT,
+ SCAN_PAGE_RO,
+ SCAN_NO_REFERENCED_PAGE,
+ SCAN_PAGE_NULL,
+ SCAN_SCAN_ABORT,
+ SCAN_PAGE_COUNT,
+ SCAN_PAGE_LRU,
+ SCAN_PAGE_LOCK,
+ SCAN_PAGE_ANON,
+ SCAN_PAGE_COMPOUND,
+ SCAN_ANY_PROCESS,
+ SCAN_VMA_NULL,
+ SCAN_VMA_CHECK,
+ SCAN_ADDRESS_RANGE,
+ SCAN_SWAP_CACHE_PAGE,
+ SCAN_DEL_PAGE_LRU,
+ SCAN_ALLOC_HUGE_PAGE_FAIL,
+ SCAN_CGROUP_CHARGE_FAIL,
+ SCAN_EXCEED_SWAP_PTE
+};
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/huge_memory.h>
+
/*
* By default transparent hugepage support is disabled in order that avoid
* to risk increase the memory footprint of applications without a guaranteed
@@ -67,6 +97,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
* fault.
*/
static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
+static unsigned int khugepaged_max_ptes_swap __read_mostly = HPAGE_PMD_NR/8;
static int khugepaged(void *none);
static int khugepaged_slab_init(void);
@@ -106,6 +137,10 @@ static struct khugepaged_scan khugepaged_scan = {
.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
};
+static DEFINE_SPINLOCK(split_queue_lock);
+static LIST_HEAD(split_queue);
+static unsigned long split_queue_len;
+static struct shrinker deferred_split_shrinker;
static void set_recommended_min_free_kbytes(void)
{
@@ -553,6 +588,33 @@ static struct kobj_attribute khugepaged_max_ptes_none_attr =
__ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
khugepaged_max_ptes_none_store);
+static ssize_t khugepaged_max_ptes_swap_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_max_ptes_swap);
+}
+
+static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ unsigned long max_ptes_swap;
+
+ err = kstrtoul(buf, 10, &max_ptes_swap);
+ if (err || max_ptes_swap > HPAGE_PMD_NR-1)
+ return -EINVAL;
+
+ khugepaged_max_ptes_swap = max_ptes_swap;
+
+ return count;
+}
+
+static struct kobj_attribute khugepaged_max_ptes_swap_attr =
+ __ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show,
+ khugepaged_max_ptes_swap_store);
+
static struct attribute *khugepaged_attr[] = {
&khugepaged_defrag_attr.attr,
&khugepaged_max_ptes_none_attr.attr,
@@ -561,6 +623,7 @@ static struct attribute *khugepaged_attr[] = {
&full_scans_attr.attr,
&scan_sleep_millisecs_attr.attr,
&alloc_sleep_millisecs_attr.attr,
+ &khugepaged_max_ptes_swap_attr.attr,
NULL,
};
@@ -638,6 +701,9 @@ static int __init hugepage_init(void)
err = register_shrinker(&huge_zero_page_shrinker);
if (err)
goto err_hzp_shrinker;
+ err = register_shrinker(&deferred_split_shrinker);
+ if (err)
+ goto err_split_shrinker;
/*
* By default disable transparent hugepages on smaller systems,
@@ -655,6 +721,8 @@ static int __init hugepage_init(void)
return 0;
err_khugepaged:
+ unregister_shrinker(&deferred_split_shrinker);
+err_split_shrinker:
unregister_shrinker(&huge_zero_page_shrinker);
err_hzp_shrinker:
khugepaged_slab_exit();
@@ -711,6 +779,27 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
return entry;
}
+static inline struct list_head *page_deferred_list(struct page *page)
+{
+ /*
+ * ->lru in the tail pages is occupied by compound_head.
+ * Let's use ->mapping + ->index in the second tail page as list_head.
+ */
+ return (struct list_head *)&page[2].mapping;
+}
+
+void prep_transhuge_page(struct page *page)
+{
+ /*
+ * we use page->mapping and page->indexlru in second tail page
+ * as list_head: assuming THP order >= 2
+ */
+ BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
+
+ INIT_LIST_HEAD(page_deferred_list(page));
+ set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
+}
+
static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
@@ -724,7 +813,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
VM_BUG_ON_PAGE(!PageCompound(page), page);
- if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) {
+ if (mem_cgroup_try_charge(page, mm, gfp, &memcg, true)) {
put_page(page);
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
@@ -732,7 +821,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
pgtable = pte_alloc_one(mm, haddr);
if (unlikely(!pgtable)) {
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
return VM_FAULT_OOM;
}
@@ -748,7 +837,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
ptl = pmd_lock(mm, pmd);
if (unlikely(!pmd_none(*pmd))) {
spin_unlock(ptl);
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
pte_free(mm, pgtable);
} else {
@@ -759,7 +848,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
int ret;
spin_unlock(ptl);
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
pte_free(mm, pgtable);
ret = handle_userfault(vma, address, flags,
@@ -770,8 +859,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
entry = mk_huge_pmd(page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- page_add_new_anon_rmap(page, vma, haddr);
- mem_cgroup_commit_charge(page, memcg, false);
+ page_add_new_anon_rmap(page, vma, haddr, true);
+ mem_cgroup_commit_charge(page, memcg, false, true);
lru_cache_add_active_or_unevictable(page, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
@@ -865,6 +954,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
+ prep_transhuge_page(page);
return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp,
flags);
}
@@ -956,19 +1046,10 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
goto out_unlock;
}
- if (unlikely(pmd_trans_splitting(pmd))) {
- /* split huge page running from under us */
- spin_unlock(src_ptl);
- spin_unlock(dst_ptl);
- pte_free(dst_mm, pgtable);
-
- wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
- goto out;
- }
src_page = pmd_page(pmd);
VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
get_page(src_page);
- page_dup_rmap(src_page);
+ page_dup_rmap(src_page, true);
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
pmdp_set_wrprotect(src_mm, addr, src_pmd);
@@ -1008,37 +1089,6 @@ unlock:
spin_unlock(ptl);
}
-/*
- * Save CONFIG_DEBUG_PAGEALLOC from faulting falsely on tail pages
- * during copy_user_huge_page()'s copy_page_rep(): in the case when
- * the source page gets split and a tail freed before copy completes.
- * Called under pmd_lock of checked pmd, so safe from splitting itself.
- */
-static void get_user_huge_page(struct page *page)
-{
- if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
- struct page *endpage = page + HPAGE_PMD_NR;
-
- atomic_add(HPAGE_PMD_NR, &page->_count);
- while (++page < endpage)
- get_huge_page_tail(page);
- } else {
- get_page(page);
- }
-}
-
-static void put_user_huge_page(struct page *page)
-{
- if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
- struct page *endpage = page + HPAGE_PMD_NR;
-
- while (page < endpage)
- put_page(page++);
- } else {
- put_page(page);
- }
-}
-
static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address,
@@ -1068,13 +1118,14 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
vma, address, page_to_nid(page));
if (unlikely(!pages[i] ||
mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL,
- &memcg))) {
+ &memcg, false))) {
if (pages[i])
put_page(pages[i]);
while (--i >= 0) {
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
- mem_cgroup_cancel_charge(pages[i], memcg);
+ mem_cgroup_cancel_charge(pages[i], memcg,
+ false);
put_page(pages[i]);
}
kfree(pages);
@@ -1112,8 +1163,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
- page_add_new_anon_rmap(pages[i], vma, haddr);
- mem_cgroup_commit_charge(pages[i], memcg, false);
+ page_add_new_anon_rmap(pages[i], vma, haddr, false);
+ mem_cgroup_commit_charge(pages[i], memcg, false, false);
lru_cache_add_active_or_unevictable(pages[i], vma);
pte = pte_offset_map(&_pmd, haddr);
VM_BUG_ON(!pte_none(*pte));
@@ -1124,7 +1175,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
- page_remove_rmap(page);
+ page_remove_rmap(page, true);
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
@@ -1141,7 +1192,7 @@ out_free_pages:
for (i = 0; i < HPAGE_PMD_NR; i++) {
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
- mem_cgroup_cancel_charge(pages[i], memcg);
+ mem_cgroup_cancel_charge(pages[i], memcg, false);
put_page(pages[i]);
}
kfree(pages);
@@ -1171,7 +1222,17 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
page = pmd_page(orig_pmd);
VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
- if (page_mapcount(page) == 1) {
+ /*
+ * We can only reuse the page if nobody else maps the huge page or it's
+ * part. We can do it by checking page_mapcount() on each sub-page, but
+ * it's expensive.
+ * The cheaper way is to check page_count() to be equal 1: every
+ * mapcount takes page reference reference, so this way we can
+ * guarantee, that the PMD is the only mapping.
+ * This can give false negative if somebody pinned the page, but that's
+ * fine.
+ */
+ if (page_mapcount(page) == 1 && page_count(page) == 1) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -1180,7 +1241,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
ret |= VM_FAULT_WRITE;
goto out_unlock;
}
- get_user_huge_page(page);
+ get_page(page);
spin_unlock(ptl);
alloc:
if (transparent_hugepage_enabled(vma) &&
@@ -1190,30 +1251,33 @@ alloc:
} else
new_page = NULL;
- if (unlikely(!new_page)) {
+ if (likely(new_page)) {
+ prep_transhuge_page(new_page);
+ } else {
if (!page) {
- split_huge_page_pmd(vma, address, pmd);
+ split_huge_pmd(vma, pmd, address);
ret |= VM_FAULT_FALLBACK;
} else {
ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
pmd, orig_pmd, page, haddr);
if (ret & VM_FAULT_OOM) {
- split_huge_page(page);
+ split_huge_pmd(vma, pmd, address);
ret |= VM_FAULT_FALLBACK;
}
- put_user_huge_page(page);
+ put_page(page);
}
count_vm_event(THP_FAULT_FALLBACK);
goto out;
}
- if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg))) {
+ if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg,
+ true))) {
put_page(new_page);
if (page) {
- split_huge_page(page);
- put_user_huge_page(page);
+ split_huge_pmd(vma, pmd, address);
+ put_page(page);
} else
- split_huge_page_pmd(vma, address, pmd);
+ split_huge_pmd(vma, pmd, address);
ret |= VM_FAULT_FALLBACK;
count_vm_event(THP_FAULT_FALLBACK);
goto out;
@@ -1233,10 +1297,10 @@ alloc:
spin_lock(ptl);
if (page)
- put_user_huge_page(page);
+ put_page(page);
if (unlikely(!pmd_same(*pmd, orig_pmd))) {
spin_unlock(ptl);
- mem_cgroup_cancel_charge(new_page, memcg);
+ mem_cgroup_cancel_charge(new_page, memcg, true);
put_page(new_page);
goto out_mn;
} else {
@@ -1244,8 +1308,8 @@ alloc:
entry = mk_huge_pmd(new_page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
pmdp_huge_clear_flush_notify(vma, haddr, pmd);
- page_add_new_anon_rmap(new_page, vma, haddr);
- mem_cgroup_commit_charge(new_page, memcg, false);
+ page_add_new_anon_rmap(new_page, vma, haddr, true);
+ mem_cgroup_commit_charge(new_page, memcg, false, true);
lru_cache_add_active_or_unevictable(new_page, vma);
set_pmd_at(mm, haddr, pmd, entry);
update_mmu_cache_pmd(vma, address, pmd);
@@ -1254,7 +1318,7 @@ alloc:
put_huge_zero_page();
} else {
VM_BUG_ON_PAGE(!PageHead(page), page);
- page_remove_rmap(page);
+ page_remove_rmap(page, true);
put_page(page);
}
ret |= VM_FAULT_WRITE;
@@ -1308,7 +1372,20 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
update_mmu_cache_pmd(vma, addr, pmd);
}
if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
- if (page->mapping && trylock_page(page)) {
+ /*
+ * We don't mlock() pte-mapped THPs. This way we can avoid
+ * leaking mlocked pages into non-VM_LOCKED VMAs.
+ *
+ * In most cases the pmd is the only mapping of the page as we
+ * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
+ * writable private mappings in populate_vma_page_range().
+ *
+ * The only scenario when we have the page shared here is if we
+ * mlocking read-only mapping shared over fork(). We skip
+ * mlocking such pages.
+ */
+ if (compound_mapcount(page) == 1 && !PageDoubleMap(page) &&
+ page->mapping && trylock_page(page)) {
lru_add_drain();
if (page->mapping)
mlock_vma_page(page);
@@ -1318,7 +1395,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
VM_BUG_ON_PAGE(!PageCompound(page), page);
if (flags & FOLL_GET)
- get_page_foll(page);
+ get_page(page);
out:
return page;
@@ -1459,7 +1536,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t orig_pmd;
spinlock_t *ptl;
- if (__pmd_trans_huge_lock(pmd, vma, &ptl) != 1)
+ if (!__pmd_trans_huge_lock(pmd, vma, &ptl))
return 0;
/*
* For architectures like ppc64 we look at deposited pgtable
@@ -1481,7 +1558,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
put_huge_zero_page();
} else {
struct page *page = pmd_page(orig_pmd);
- page_remove_rmap(page);
+ page_remove_rmap(page, true);
VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
VM_BUG_ON_PAGE(!PageHead(page), page);
@@ -1493,13 +1570,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
return 1;
}
-int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
+bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
unsigned long old_addr,
unsigned long new_addr, unsigned long old_end,
pmd_t *old_pmd, pmd_t *new_pmd)
{
spinlock_t *old_ptl, *new_ptl;
- int ret = 0;
pmd_t pmd;
struct mm_struct *mm = vma->vm_mm;
@@ -1508,7 +1584,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
(new_addr & ~HPAGE_PMD_MASK) ||
old_end - old_addr < HPAGE_PMD_SIZE ||
(new_vma->vm_flags & VM_NOHUGEPAGE))
- goto out;
+ return false;
/*
* The destination pmd shouldn't be established, free_pgtables()
@@ -1516,15 +1592,14 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
*/
if (WARN_ON(!pmd_none(*new_pmd))) {
VM_BUG_ON(pmd_trans_huge(*new_pmd));
- goto out;
+ return false;
}
/*
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_sem prevents deadlock.
*/
- ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl);
- if (ret == 1) {
+ if (__pmd_trans_huge_lock(old_pmd, vma, &old_ptl)) {
new_ptl = pmd_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -1540,9 +1615,9 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
spin_unlock(old_ptl);
+ return true;
}
-out:
- return ret;
+ return false;
}
/*
@@ -1558,7 +1633,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
spinlock_t *ptl;
int ret = 0;
- if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (__pmd_trans_huge_lock(pmd, vma, &ptl)) {
pmd_t entry;
bool preserve_write = prot_numa && pmd_write(*pmd);
ret = 1;
@@ -1589,405 +1664,19 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
}
/*
- * Returns 1 if a given pmd maps a stable (not under splitting) thp.
- * Returns -1 if it maps a thp under splitting. Returns 0 otherwise.
+ * Returns true if a given pmd maps a thp, false otherwise.
*
- * Note that if it returns 1, this routine returns without unlocking page
- * table locks. So callers must unlock them.
+ * Note that if it returns true, this routine returns without unlocking page
+ * table lock. So callers must unlock it.
*/
-int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
spinlock_t **ptl)
{
*ptl = pmd_lock(vma->vm_mm, pmd);
- if (likely(pmd_trans_huge(*pmd))) {
- if (unlikely(pmd_trans_splitting(*pmd))) {
- spin_unlock(*ptl);
- wait_split_huge_page(vma->anon_vma, pmd);
- return -1;
- } else {
- /* Thp mapped by 'pmd' is stable, so we can
- * handle it as it is. */
- return 1;
- }
- }
- spin_unlock(*ptl);
- return 0;
-}
-
-/*
- * This function returns whether a given @page is mapped onto the @address
- * in the virtual space of @mm.
- *
- * When it's true, this function returns *pmd with holding the page table lock
- * and passing it back to the caller via @ptl.
- * If it's false, returns NULL without holding the page table lock.
- */
-pmd_t *page_check_address_pmd(struct page *page,
- struct mm_struct *mm,
- unsigned long address,
- enum page_check_address_pmd_flag flag,
- spinlock_t **ptl)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
-
- if (address & ~HPAGE_PMD_MASK)
- return NULL;
-
- pgd = pgd_offset(mm, address);
- if (!pgd_present(*pgd))
- return NULL;
- pud = pud_offset(pgd, address);
- if (!pud_present(*pud))
- return NULL;
- pmd = pmd_offset(pud, address);
-
- *ptl = pmd_lock(mm, pmd);
- if (!pmd_present(*pmd))
- goto unlock;
- if (pmd_page(*pmd) != page)
- goto unlock;
- /*
- * split_vma() may create temporary aliased mappings. There is
- * no risk as long as all huge pmd are found and have their
- * splitting bit set before __split_huge_page_refcount
- * runs. Finding the same huge pmd more than once during the
- * same rmap walk is not a problem.
- */
- if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
- pmd_trans_splitting(*pmd))
- goto unlock;
- if (pmd_trans_huge(*pmd)) {
- VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
- !pmd_trans_splitting(*pmd));
- return pmd;
- }
-unlock:
+ if (likely(pmd_trans_huge(*pmd)))
+ return true;
spin_unlock(*ptl);
- return NULL;
-}
-
-static int __split_huge_page_splitting(struct page *page,
- struct vm_area_struct *vma,
- unsigned long address)
-{
- struct mm_struct *mm = vma->vm_mm;
- spinlock_t *ptl;
- pmd_t *pmd;
- int ret = 0;
- /* For mmu_notifiers */
- const unsigned long mmun_start = address;
- const unsigned long mmun_end = address + HPAGE_PMD_SIZE;
-
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl);
- if (pmd) {
- /*
- * We can't temporarily set the pmd to null in order
- * to split it, the pmd must remain marked huge at all
- * times or the VM won't take the pmd_trans_huge paths
- * and it won't wait on the anon_vma->root->rwsem to
- * serialize against split_huge_page*.
- */
- pmdp_splitting_flush(vma, address, pmd);
-
- ret = 1;
- spin_unlock(ptl);
- }
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
-
- return ret;
-}
-
-static void __split_huge_page_refcount(struct page *page,
- struct list_head *list)
-{
- int i;
- struct zone *zone = page_zone(page);
- struct lruvec *lruvec;
- int tail_count = 0;
-
- /* prevent PageLRU to go away from under us, and freeze lru stats */
- spin_lock_irq(&zone->lru_lock);
- lruvec = mem_cgroup_page_lruvec(page, zone);
-
- compound_lock(page);
- /* complete memcg works before add pages to LRU */
- mem_cgroup_split_huge_fixup(page);
-
- for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
- struct page *page_tail = page + i;
-
- /* tail_page->_mapcount cannot change */
- BUG_ON(page_mapcount(page_tail) < 0);
- tail_count += page_mapcount(page_tail);
- /* check for overflow */
- BUG_ON(tail_count < 0);
- BUG_ON(atomic_read(&page_tail->_count) != 0);
- /*
- * tail_page->_count is zero and not changing from
- * under us. But get_page_unless_zero() may be running
- * from under us on the tail_page. If we used
- * atomic_set() below instead of atomic_add(), we
- * would then run atomic_set() concurrently with
- * get_page_unless_zero(), and atomic_set() is
- * implemented in C not using locked ops. spin_unlock
- * on x86 sometime uses locked ops because of PPro
- * errata 66, 92, so unless somebody can guarantee
- * atomic_set() here would be safe on all archs (and
- * not only on x86), it's safer to use atomic_add().
- */
- atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
- &page_tail->_count);
-
- /* after clearing PageTail the gup refcount can be released */
- smp_mb__after_atomic();
-
- page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
- page_tail->flags |= (page->flags &
- ((1L << PG_referenced) |
- (1L << PG_swapbacked) |
- (1L << PG_mlocked) |
- (1L << PG_uptodate) |
- (1L << PG_active) |
- (1L << PG_unevictable)));
- page_tail->flags |= (1L << PG_dirty);
-
- clear_compound_head(page_tail);
-
- if (page_is_young(page))
- set_page_young(page_tail);
- if (page_is_idle(page))
- set_page_idle(page_tail);
-
- /*
- * __split_huge_page_splitting() already set the
- * splitting bit in all pmd that could map this
- * hugepage, that will ensure no CPU can alter the
- * mapcount on the head page. The mapcount is only
- * accounted in the head page and it has to be
- * transferred to all tail pages in the below code. So
- * for this code to be safe, the split the mapcount
- * can't change. But that doesn't mean userland can't
- * keep changing and reading the page contents while
- * we transfer the mapcount, so the pmd splitting
- * status is achieved setting a reserved bit in the
- * pmd, not by clearing the present bit.
- */
- page_tail->_mapcount = page->_mapcount;
-
- BUG_ON(page_tail->mapping);
- page_tail->mapping = page->mapping;
-
- page_tail->index = page->index + i;
- page_cpupid_xchg_last(page_tail, page_cpupid_last(page));
-
- BUG_ON(!PageAnon(page_tail));
- BUG_ON(!PageUptodate(page_tail));
- BUG_ON(!PageDirty(page_tail));
- BUG_ON(!PageSwapBacked(page_tail));
-
- lru_add_page_tail(page, page_tail, lruvec, list);
- }
- atomic_sub(tail_count, &page->_count);
- BUG_ON(atomic_read(&page->_count) <= 0);
-
- __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
-
- ClearPageCompound(page);
- compound_unlock(page);
- spin_unlock_irq(&zone->lru_lock);
-
- for (i = 1; i < HPAGE_PMD_NR; i++) {
- struct page *page_tail = page + i;
- BUG_ON(page_count(page_tail) <= 0);
- /*
- * Tail pages may be freed if there wasn't any mapping
- * like if add_to_swap() is running on a lru page that
- * had its mapping zapped. And freeing these pages
- * requires taking the lru_lock so we do the put_page
- * of the tail pages after the split is complete.
- */
- put_page(page_tail);
- }
-
- /*
- * Only the head page (now become a regular page) is required
- * to be pinned by the caller.
- */
- BUG_ON(page_count(page) <= 0);
-}
-
-static int __split_huge_page_map(struct page *page,
- struct vm_area_struct *vma,
- unsigned long address)
-{
- struct mm_struct *mm = vma->vm_mm;
- spinlock_t *ptl;
- pmd_t *pmd, _pmd;
- int ret = 0, i;
- pgtable_t pgtable;
- unsigned long haddr;
-
- pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl);
- if (pmd) {
- pgtable = pgtable_trans_huge_withdraw(mm, pmd);
- pmd_populate(mm, &_pmd, pgtable);
- if (pmd_write(*pmd))
- BUG_ON(page_mapcount(page) != 1);
-
- haddr = address;
- for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
- pte_t *pte, entry;
- BUG_ON(PageCompound(page+i));
- /*
- * Note that NUMA hinting access restrictions are not
- * transferred to avoid any possibility of altering
- * permissions across VMAs.
- */
- entry = mk_pte(page + i, vma->vm_page_prot);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- if (!pmd_write(*pmd))
- entry = pte_wrprotect(entry);
- if (!pmd_young(*pmd))
- entry = pte_mkold(entry);
- pte = pte_offset_map(&_pmd, haddr);
- BUG_ON(!pte_none(*pte));
- set_pte_at(mm, haddr, pte, entry);
- pte_unmap(pte);
- }
-
- smp_wmb(); /* make pte visible before pmd */
- /*
- * Up to this point the pmd is present and huge and
- * userland has the whole access to the hugepage
- * during the split (which happens in place). If we
- * overwrite the pmd with the not-huge version
- * pointing to the pte here (which of course we could
- * if all CPUs were bug free), userland could trigger
- * a small page size TLB miss on the small sized TLB
- * while the hugepage TLB entry is still established
- * in the huge TLB. Some CPU doesn't like that. See
- * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
- * Erratum 383 on page 93. Intel should be safe but is
- * also warns that it's only safe if the permission
- * and cache attributes of the two entries loaded in
- * the two TLB is identical (which should be the case
- * here). But it is generally safer to never allow
- * small and huge TLB entries for the same virtual
- * address to be loaded simultaneously. So instead of
- * doing "pmd_populate(); flush_pmd_tlb_range();" we first
- * mark the current pmd notpresent (atomically because
- * here the pmd_trans_huge and pmd_trans_splitting
- * must remain set at all times on the pmd until the
- * split is complete for this pmd), then we flush the
- * SMP TLB and finally we write the non-huge version
- * of the pmd entry with pmd_populate.
- */
- pmdp_invalidate(vma, address, pmd);
- pmd_populate(mm, pmd, pgtable);
- ret = 1;
- spin_unlock(ptl);
- }
-
- return ret;
-}
-
-/* must be called with anon_vma->root->rwsem held */
-static void __split_huge_page(struct page *page,
- struct anon_vma *anon_vma,
- struct list_head *list)
-{
- int mapcount, mapcount2;
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- struct anon_vma_chain *avc;
-
- BUG_ON(!PageHead(page));
- BUG_ON(PageTail(page));
-
- mapcount = 0;
- anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
- struct vm_area_struct *vma = avc->vma;
- unsigned long addr = vma_address(page, vma);
- BUG_ON(is_vma_temporary_stack(vma));
- mapcount += __split_huge_page_splitting(page, vma, addr);
- }
- /*
- * It is critical that new vmas are added to the tail of the
- * anon_vma list. This guarantes that if copy_huge_pmd() runs
- * and establishes a child pmd before
- * __split_huge_page_splitting() freezes the parent pmd (so if
- * we fail to prevent copy_huge_pmd() from running until the
- * whole __split_huge_page() is complete), we will still see
- * the newly established pmd of the child later during the
- * walk, to be able to set it as pmd_trans_splitting too.
- */
- if (mapcount != page_mapcount(page)) {
- pr_err("mapcount %d page_mapcount %d\n",
- mapcount, page_mapcount(page));
- BUG();
- }
-
- __split_huge_page_refcount(page, list);
-
- mapcount2 = 0;
- anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
- struct vm_area_struct *vma = avc->vma;
- unsigned long addr = vma_address(page, vma);
- BUG_ON(is_vma_temporary_stack(vma));
- mapcount2 += __split_huge_page_map(page, vma, addr);
- }
- if (mapcount != mapcount2) {
- pr_err("mapcount %d mapcount2 %d page_mapcount %d\n",
- mapcount, mapcount2, page_mapcount(page));
- BUG();
- }
-}
-
-/*
- * Split a hugepage into normal pages. This doesn't change the position of head
- * page. If @list is null, tail pages will be added to LRU list, otherwise, to
- * @list. Both head page and tail pages will inherit mapping, flags, and so on
- * from the hugepage.
- * Return 0 if the hugepage is split successfully otherwise return 1.
- */
-int split_huge_page_to_list(struct page *page, struct list_head *list)
-{
- struct anon_vma *anon_vma;
- int ret = 1;
-
- BUG_ON(is_huge_zero_page(page));
- BUG_ON(!PageAnon(page));
-
- /*
- * The caller does not necessarily hold an mmap_sem that would prevent
- * the anon_vma disappearing so we first we take a reference to it
- * and then lock the anon_vma for write. This is similar to
- * page_lock_anon_vma_read except the write lock is taken to serialise
- * against parallel split or collapse operations.
- */
- anon_vma = page_get_anon_vma(page);
- if (!anon_vma)
- goto out;
- anon_vma_lock_write(anon_vma);
-
- ret = 0;
- if (!PageCompound(page))
- goto out_unlock;
-
- BUG_ON(!PageSwapBacked(page));
- __split_huge_page(page, anon_vma, list);
- count_vm_event(THP_SPLIT);
-
- BUG_ON(PageCompound(page));
-out_unlock:
- anon_vma_unlock_write(anon_vma);
- put_anon_vma(anon_vma);
-out:
- return ret;
+ return false;
}
#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
@@ -2198,26 +1887,33 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
unsigned long address,
pte_t *pte)
{
- struct page *page;
+ struct page *page = NULL;
pte_t *_pte;
- int none_or_zero = 0;
+ int none_or_zero = 0, result = 0;
bool referenced = false, writable = false;
+
for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (pte_none(pteval) || (pte_present(pteval) &&
is_zero_pfn(pte_pfn(pteval)))) {
if (!userfaultfd_armed(vma) &&
- ++none_or_zero <= khugepaged_max_ptes_none)
+ ++none_or_zero <= khugepaged_max_ptes_none) {
continue;
- else
+ } else {
+ result = SCAN_EXCEED_NONE_PTE;
goto out;
+ }
}
- if (!pte_present(pteval))
+ if (!pte_present(pteval)) {
+ result = SCAN_PTE_NON_PRESENT;
goto out;
+ }
page = vm_normal_page(vma, address, pteval);
- if (unlikely(!page))
+ if (unlikely(!page)) {
+ result = SCAN_PAGE_NULL;
goto out;
+ }
VM_BUG_ON_PAGE(PageCompound(page), page);
VM_BUG_ON_PAGE(!PageAnon(page), page);
@@ -2229,8 +1925,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
* is needed to serialize against split_huge_page
* when invoked from the VM.
*/
- if (!trylock_page(page))
+ if (!trylock_page(page)) {
+ result = SCAN_PAGE_LOCK;
goto out;
+ }
/*
* cannot use mapcount: can't collapse if there's a gup pin.
@@ -2239,6 +1937,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
*/
if (page_count(page) != 1 + !!PageSwapCache(page)) {
unlock_page(page);
+ result = SCAN_PAGE_COUNT;
goto out;
}
if (pte_write(pteval)) {
@@ -2246,6 +1945,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
} else {
if (PageSwapCache(page) && !reuse_swap_page(page)) {
unlock_page(page);
+ result = SCAN_SWAP_CACHE_PAGE;
goto out;
}
/*
@@ -2260,6 +1960,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
*/
if (isolate_lru_page(page)) {
unlock_page(page);
+ result = SCAN_DEL_PAGE_LRU;
goto out;
}
/* 0 stands for page_is_file_cache(page) == false */
@@ -2273,10 +1974,21 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
mmu_notifier_test_young(vma->vm_mm, address))
referenced = true;
}
- if (likely(referenced && writable))
- return 1;
+ if (likely(writable)) {
+ if (likely(referenced)) {
+ result = SCAN_SUCCEED;
+ trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero,
+ referenced, writable, result);
+ return 1;
+ }
+ } else {
+ result = SCAN_PAGE_RO;
+ }
+
out:
release_pte_pages(pte, _pte);
+ trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero,
+ referenced, writable, result);
return 0;
}
@@ -2321,7 +2033,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
* superfluous.
*/
pte_clear(vma->vm_mm, address, _pte);
- page_remove_rmap(src_page);
+ page_remove_rmap(src_page, false);
spin_unlock(ptl);
free_page_and_swap_cache(src_page);
}
@@ -2431,6 +2143,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
return NULL;
}
+ prep_transhuge_page(*hpage);
count_vm_event(THP_COLLAPSE_ALLOC);
return *hpage;
}
@@ -2442,8 +2155,12 @@ static int khugepaged_find_target_node(void)
static inline struct page *alloc_hugepage(int defrag)
{
- return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
- HPAGE_PMD_ORDER);
+ struct page *page;
+
+ page = alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER);
+ if (page)
+ prep_transhuge_page(page);
+ return page;
}
static struct page *khugepaged_alloc_hugepage(bool *wait)
@@ -2493,7 +2210,6 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
(vma->vm_flags & VM_NOHUGEPAGE))
return false;
-
if (!vma->anon_vma || vma->vm_ops)
return false;
if (is_vma_temporary_stack(vma))
@@ -2502,6 +2218,44 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
return true;
}
+/*
+ * Bring missing pages in from swap, to complete THP collapse.
+ * Only done if khugepaged_scan_pmd believes it is worthwhile.
+ *
+ * Called and returns without pte mapped or spinlocks held,
+ * but with mmap_sem held to protect against vma changes.
+ */
+
+static void __collapse_huge_page_swapin(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd)
+{
+ unsigned long _address;
+ pte_t *pte, pteval;
+ int swapped_in = 0, ret = 0;
+
+ pte = pte_offset_map(pmd, address);
+ for (_address = address; _address < address + HPAGE_PMD_NR*PAGE_SIZE;
+ pte++, _address += PAGE_SIZE) {
+ pteval = *pte;
+ if (!is_swap_pte(pteval))
+ continue;
+ swapped_in++;
+ ret = do_swap_page(mm, vma, _address, pte, pmd,
+ FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_RETRY_NOWAIT,
+ pteval);
+ if (ret & VM_FAULT_ERROR) {
+ trace_mm_collapse_huge_page_swapin(mm, swapped_in, 0);
+ return;
+ }
+ /* pte is unmapped now, we need to map it */
+ pte = pte_offset_map(pmd, _address);
+ }
+ pte--;
+ pte_unmap(pte);
+ trace_mm_collapse_huge_page_swapin(mm, swapped_in, 1);
+}
+
static void collapse_huge_page(struct mm_struct *mm,
unsigned long address,
struct page **hpage,
@@ -2513,7 +2267,7 @@ static void collapse_huge_page(struct mm_struct *mm,
pgtable_t pgtable;
struct page *new_page;
spinlock_t *pmd_ptl, *pte_ptl;
- int isolated;
+ int isolated = 0, result = 0;
unsigned long hstart, hend;
struct mem_cgroup *memcg;
unsigned long mmun_start; /* For mmu_notifiers */
@@ -2528,12 +2282,15 @@ static void collapse_huge_page(struct mm_struct *mm,
/* release the mmap_sem read lock. */
new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node);
- if (!new_page)
- return;
+ if (!new_page) {
+ result = SCAN_ALLOC_HUGE_PAGE_FAIL;
+ goto out_nolock;
+ }
- if (unlikely(mem_cgroup_try_charge(new_page, mm,
- gfp, &memcg)))
- return;
+ if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
+ result = SCAN_CGROUP_CHARGE_FAIL;
+ goto out_nolock;
+ }
/*
* Prevent all access to pagetables with the exception of
@@ -2541,21 +2298,33 @@ static void collapse_huge_page(struct mm_struct *mm,
* handled by the anon_vma lock + PG_lock.
*/
down_write(&mm->mmap_sem);
- if (unlikely(khugepaged_test_exit(mm)))
+ if (unlikely(khugepaged_test_exit(mm))) {
+ result = SCAN_ANY_PROCESS;
goto out;
+ }
vma = find_vma(mm, address);
- if (!vma)
+ if (!vma) {
+ result = SCAN_VMA_NULL;
goto out;
+ }
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
- if (address < hstart || address + HPAGE_PMD_SIZE > hend)
+ if (address < hstart || address + HPAGE_PMD_SIZE > hend) {
+ result = SCAN_ADDRESS_RANGE;
goto out;
- if (!hugepage_vma_check(vma))
+ }
+ if (!hugepage_vma_check(vma)) {
+ result = SCAN_VMA_CHECK;
goto out;
+ }
pmd = mm_find_pmd(mm, address);
- if (!pmd)
+ if (!pmd) {
+ result = SCAN_PMD_NULL;
goto out;
+ }
+
+ __collapse_huge_page_swapin(mm, vma, address, pmd);
anon_vma_lock_write(vma->anon_vma);
@@ -2592,6 +2361,7 @@ static void collapse_huge_page(struct mm_struct *mm,
pmd_populate(mm, pmd, pmd_pgtable(_pmd));
spin_unlock(pmd_ptl);
anon_vma_unlock_write(vma->anon_vma);
+ result = SCAN_FAIL;
goto out;
}
@@ -2618,8 +2388,8 @@ static void collapse_huge_page(struct mm_struct *mm,
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
- page_add_new_anon_rmap(new_page, vma, address);
- mem_cgroup_commit_charge(new_page, memcg, false);
+ page_add_new_anon_rmap(new_page, vma, address, true);
+ mem_cgroup_commit_charge(new_page, memcg, false, true);
lru_cache_add_active_or_unevictable(new_page, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
@@ -2629,12 +2399,14 @@ static void collapse_huge_page(struct mm_struct *mm,
*hpage = NULL;
khugepaged_pages_collapsed++;
+ result = SCAN_SUCCEED;
out_up_write:
up_write(&mm->mmap_sem);
+out_nolock:
+ trace_mm_collapse_huge_page(mm, isolated, result);
return;
-
out:
- mem_cgroup_cancel_charge(new_page, memcg);
+ mem_cgroup_cancel_charge(new_page, memcg, true);
goto out_up_write;
}
@@ -2645,39 +2417,62 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
{
pmd_t *pmd;
pte_t *pte, *_pte;
- int ret = 0, none_or_zero = 0;
- struct page *page;
+ int ret = 0, none_or_zero = 0, result = 0;
+ struct page *page = NULL;
unsigned long _address;
spinlock_t *ptl;
- int node = NUMA_NO_NODE;
+ int node = NUMA_NO_NODE, unmapped = 0;
bool writable = false, referenced = false;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
pmd = mm_find_pmd(mm, address);
- if (!pmd)
+ if (!pmd) {
+ result = SCAN_PMD_NULL;
goto out;
+ }
memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, _address += PAGE_SIZE) {
pte_t pteval = *_pte;
+ if (is_swap_pte(pteval)) {
+ if (++unmapped <= khugepaged_max_ptes_swap) {
+ continue;
+ } else {
+ result = SCAN_EXCEED_SWAP_PTE;
+ goto out_unmap;
+ }
+ }
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
if (!userfaultfd_armed(vma) &&
- ++none_or_zero <= khugepaged_max_ptes_none)
+ ++none_or_zero <= khugepaged_max_ptes_none) {
continue;
- else
+ } else {
+ result = SCAN_EXCEED_NONE_PTE;
goto out_unmap;
+ }
}
- if (!pte_present(pteval))
+ if (!pte_present(pteval)) {
+ result = SCAN_PTE_NON_PRESENT;
goto out_unmap;
+ }
if (pte_write(pteval))
writable = true;
page = vm_normal_page(vma, _address, pteval);
- if (unlikely(!page))
+ if (unlikely(!page)) {
+ result = SCAN_PAGE_NULL;
+ goto out_unmap;
+ }
+
+ /* TODO: teach khugepaged to collapse THP mapped with pte */
+ if (PageCompound(page)) {
+ result = SCAN_PAGE_COMPOUND;
goto out_unmap;
+ }
+
/*
* Record which node the original page is from and save this
* information to khugepaged_node_load[].
@@ -2685,26 +2480,48 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
* hit record.
*/
node = page_to_nid(page);
- if (khugepaged_scan_abort(node))
+ if (khugepaged_scan_abort(node)) {
+ result = SCAN_SCAN_ABORT;
goto out_unmap;
+ }
khugepaged_node_load[node]++;
- VM_BUG_ON_PAGE(PageCompound(page), page);
- if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
+ if (!PageLRU(page)) {
+ result = SCAN_SCAN_ABORT;
goto out_unmap;
+ }
+ if (PageLocked(page)) {
+ result = SCAN_PAGE_LOCK;
+ goto out_unmap;
+ }
+ if (!PageAnon(page)) {
+ result = SCAN_PAGE_ANON;
+ goto out_unmap;
+ }
+
/*
* cannot use mapcount: can't collapse if there's a gup pin.
* The page must only be referenced by the scanned process
* and page swap cache.
*/
- if (page_count(page) != 1 + !!PageSwapCache(page))
+ if (page_count(page) != 1 + !!PageSwapCache(page)) {
+ result = SCAN_PAGE_COUNT;
goto out_unmap;
+ }
if (pte_young(pteval) ||
page_is_young(page) || PageReferenced(page) ||
mmu_notifier_test_young(vma->vm_mm, address))
referenced = true;
}
- if (referenced && writable)
- ret = 1;
+ if (writable) {
+ if (referenced) {
+ result = SCAN_SUCCEED;
+ ret = 1;
+ } else {
+ result = SCAN_NO_REFERENCED_PAGE;
+ }
+ } else {
+ result = SCAN_PAGE_RO;
+ }
out_unmap:
pte_unmap_unlock(pte, ptl);
if (ret) {
@@ -2713,6 +2530,8 @@ out_unmap:
collapse_huge_page(mm, address, hpage, vma, node);
}
out:
+ trace_mm_khugepaged_scan_pmd(mm, page_to_pfn(page), writable, referenced,
+ none_or_zero, result, unmapped);
return ret;
}
@@ -2938,8 +2757,8 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
pmd_t _pmd;
int i;
- pmdp_huge_clear_flush_notify(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
+ pmdp_huge_clear_flush_notify(vma, haddr, pmd);
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
@@ -2958,66 +2777,130 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
put_huge_zero_page();
}
-void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmd)
+static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long haddr, bool freeze)
{
- spinlock_t *ptl;
- struct page *page = NULL;
struct mm_struct *mm = vma->vm_mm;
- unsigned long haddr = address & HPAGE_PMD_MASK;
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
+ struct page *page;
+ pgtable_t pgtable;
+ pmd_t _pmd;
+ bool young, write;
+ int i;
- BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);
+ VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
+ VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
+ VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
+ VM_BUG_ON(!pmd_trans_huge(*pmd));
+
+ count_vm_event(THP_SPLIT_PMD);
- mmun_start = haddr;
- mmun_end = haddr + HPAGE_PMD_SIZE;
-again:
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- ptl = pmd_lock(mm, pmd);
- if (unlikely(!pmd_trans_huge(*pmd)))
- goto unlock;
if (vma_is_dax(vma)) {
pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
if (is_huge_zero_pmd(_pmd))
put_huge_zero_page();
+ return;
} else if (is_huge_zero_pmd(*pmd)) {
- __split_huge_zero_page_pmd(vma, haddr, pmd);
- } else {
- page = pmd_page(*pmd);
- VM_BUG_ON_PAGE(!page_count(page), page);
- get_page(page);
+ return __split_huge_zero_page_pmd(vma, haddr, pmd);
}
- unlock:
- spin_unlock(ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
- if (!page)
- return;
+ page = pmd_page(*pmd);
+ VM_BUG_ON_PAGE(!page_count(page), page);
+ atomic_add(HPAGE_PMD_NR - 1, &page->_count);
+ write = pmd_write(*pmd);
+ young = pmd_young(*pmd);
- split_huge_page(page);
- put_page(page);
+ /* leave pmd empty until pte is filled */
+ pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+
+ pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+ pmd_populate(mm, &_pmd, pgtable);
+
+ for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ pte_t entry, *pte;
+ /*
+ * Note that NUMA hinting access restrictions are not
+ * transferred to avoid any possibility of altering
+ * permissions across VMAs.
+ */
+ if (freeze) {
+ swp_entry_t swp_entry;
+ swp_entry = make_migration_entry(page + i, write);
+ entry = swp_entry_to_pte(swp_entry);
+ } else {
+ entry = mk_pte(page + i, vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (!write)
+ entry = pte_wrprotect(entry);
+ if (!young)
+ entry = pte_mkold(entry);
+ }
+ pte = pte_offset_map(&_pmd, haddr);
+ BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, haddr, pte, entry);
+ atomic_inc(&page[i]._mapcount);
+ pte_unmap(pte);
+ }
/*
- * We don't always have down_write of mmap_sem here: a racing
- * do_huge_pmd_wp_page() might have copied-on-write to another
- * huge page before our split_huge_page() got the anon_vma lock.
+ * Set PG_double_map before dropping compound_mapcount to avoid
+ * false-negative page_mapped().
*/
- if (unlikely(pmd_trans_huge(*pmd)))
- goto again;
+ if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) {
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ atomic_inc(&page[i]._mapcount);
+ }
+
+ if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
+ /* Last compound_mapcount is gone. */
+ __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+ if (TestClearPageDoubleMap(page)) {
+ /* No need in mapcount reference anymore */
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ atomic_dec(&page[i]._mapcount);
+ }
+ }
+
+ smp_wmb(); /* make pte visible before pmd */
+ pmd_populate(mm, pmd, pgtable);
+
+ if (freeze) {
+ for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ page_remove_rmap(page + i, false);
+ put_page(page + i);
+ }
+ }
}
-void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
- pmd_t *pmd)
+void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long address)
{
- struct vm_area_struct *vma;
+ spinlock_t *ptl;
+ struct mm_struct *mm = vma->vm_mm;
+ struct page *page = NULL;
+ unsigned long haddr = address & HPAGE_PMD_MASK;
- vma = find_vma(mm, address);
- BUG_ON(vma == NULL);
- split_huge_page_pmd(vma, address, pmd);
+ mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
+ ptl = pmd_lock(mm, pmd);
+ if (unlikely(!pmd_trans_huge(*pmd)))
+ goto out;
+ page = pmd_page(*pmd);
+ __split_huge_pmd_locked(vma, pmd, haddr, false);
+ if (PageMlocked(page))
+ get_page(page);
+ else
+ page = NULL;
+out:
+ spin_unlock(ptl);
+ mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE);
+ if (page) {
+ lock_page(page);
+ munlock_vma_page(page);
+ unlock_page(page);
+ put_page(page);
+ }
}
-static void split_huge_page_address(struct mm_struct *mm,
+static void split_huge_pmd_address(struct vm_area_struct *vma,
unsigned long address)
{
pgd_t *pgd;
@@ -3026,7 +2909,7 @@ static void split_huge_page_address(struct mm_struct *mm,
VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
- pgd = pgd_offset(mm, address);
+ pgd = pgd_offset(vma->vm_mm, address);
if (!pgd_present(*pgd))
return;
@@ -3035,13 +2918,13 @@ static void split_huge_page_address(struct mm_struct *mm,
return;
pmd = pmd_offset(pud, address);
- if (!pmd_present(*pmd))
+ if (!pmd_present(*pmd) || !pmd_trans_huge(*pmd))
return;
/*
* Caller holds the mmap_sem write mode, so a huge pmd cannot
* materialize from under us.
*/
- split_huge_page_pmd_mm(mm, address, pmd);
+ split_huge_pmd(vma, pmd, address);
}
void vma_adjust_trans_huge(struct vm_area_struct *vma,
@@ -3057,7 +2940,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
if (start & ~HPAGE_PMD_MASK &&
(start & HPAGE_PMD_MASK) >= vma->vm_start &&
(start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
- split_huge_page_address(vma->vm_mm, start);
+ split_huge_pmd_address(vma, start);
/*
* If the new end address isn't hpage aligned and it could
@@ -3067,7 +2950,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
if (end & ~HPAGE_PMD_MASK &&
(end & HPAGE_PMD_MASK) >= vma->vm_start &&
(end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
- split_huge_page_address(vma->vm_mm, end);
+ split_huge_pmd_address(vma, end);
/*
* If we're also updating the vma->vm_next->vm_start, if the new
@@ -3081,6 +2964,427 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
if (nstart & ~HPAGE_PMD_MASK &&
(nstart & HPAGE_PMD_MASK) >= next->vm_start &&
(nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
- split_huge_page_address(next->vm_mm, nstart);
+ split_huge_pmd_address(next, nstart);
+ }
+}
+
+static void freeze_page_vma(struct vm_area_struct *vma, struct page *page,
+ unsigned long address)
+{
+ spinlock_t *ptl;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ int i;
+
+ pgd = pgd_offset(vma->vm_mm, address);
+ if (!pgd_present(*pgd))
+ return;
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ return;
+ pmd = pmd_offset(pud, address);
+ ptl = pmd_lock(vma->vm_mm, pmd);
+ if (!pmd_present(*pmd)) {
+ spin_unlock(ptl);
+ return;
+ }
+ if (pmd_trans_huge(*pmd)) {
+ if (page == pmd_page(*pmd))
+ __split_huge_pmd_locked(vma, pmd, address, true);
+ spin_unlock(ptl);
+ return;
+ }
+ spin_unlock(ptl);
+
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
+ for (i = 0; i < HPAGE_PMD_NR; i++, address += PAGE_SIZE, page++) {
+ pte_t entry, swp_pte;
+ swp_entry_t swp_entry;
+
+ if (!pte_present(pte[i]))
+ continue;
+ if (page_to_pfn(page) != pte_pfn(pte[i]))
+ continue;
+ flush_cache_page(vma, address, page_to_pfn(page));
+ entry = ptep_clear_flush(vma, address, pte + i);
+ swp_entry = make_migration_entry(page, pte_write(entry));
+ swp_pte = swp_entry_to_pte(swp_entry);
+ if (pte_soft_dirty(entry))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ set_pte_at(vma->vm_mm, address, pte + i, swp_pte);
+ page_remove_rmap(page, false);
+ put_page(page);
+ }
+ pte_unmap_unlock(pte, ptl);
+}
+
+static void freeze_page(struct anon_vma *anon_vma, struct page *page)
+{
+ struct anon_vma_chain *avc;
+ pgoff_t pgoff = page_to_pgoff(page);
+
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff,
+ pgoff + HPAGE_PMD_NR - 1) {
+ unsigned long haddr;
+
+ haddr = __vma_address(page, avc->vma) & HPAGE_PMD_MASK;
+ mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
+ haddr, haddr + HPAGE_PMD_SIZE);
+ freeze_page_vma(avc->vma, page, haddr);
+ mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
+ haddr, haddr + HPAGE_PMD_SIZE);
+ }
+}
+
+static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
+ unsigned long address)
+{
+ spinlock_t *ptl;
+ pmd_t *pmd;
+ pte_t *pte, entry;
+ swp_entry_t swp_entry;
+ int i;
+
+ pmd = mm_find_pmd(vma->vm_mm, address);
+ if (!pmd)
+ return;
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
+ for (i = 0; i < HPAGE_PMD_NR; i++, address += PAGE_SIZE, page++) {
+ if (!is_swap_pte(pte[i]))
+ continue;
+
+ swp_entry = pte_to_swp_entry(pte[i]);
+ if (!is_migration_entry(swp_entry))
+ continue;
+ if (migration_entry_to_page(swp_entry) != page)
+ continue;
+
+ get_page(page);
+ page_add_anon_rmap(page, vma, address, false);
+
+ entry = pte_mkold(mk_pte(page, vma->vm_page_prot));
+ entry = pte_mkdirty(entry);
+ if (is_write_migration_entry(swp_entry))
+ entry = maybe_mkwrite(entry, vma);
+
+ flush_dcache_page(page);
+ set_pte_at(vma->vm_mm, address, pte + i, entry);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(vma, address, pte + i);
+ }
+ pte_unmap_unlock(pte, ptl);
+}
+
+static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
+{
+ struct anon_vma_chain *avc;
+ pgoff_t pgoff = page_to_pgoff(page);
+
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
+ pgoff, pgoff + HPAGE_PMD_NR - 1) {
+ unsigned long address = __vma_address(page, avc->vma);
+
+ mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
+ address, address + HPAGE_PMD_SIZE);
+ unfreeze_page_vma(avc->vma, page, address);
+ mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
+ address, address + HPAGE_PMD_SIZE);
+ }
+}
+
+static int __split_huge_page_tail(struct page *head, int tail,
+ struct lruvec *lruvec, struct list_head *list)
+{
+ int mapcount;
+ struct page *page_tail = head + tail;
+
+ mapcount = atomic_read(&page_tail->_mapcount) + 1;
+ VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail);
+
+ /*
+ * tail_page->_count is zero and not changing from under us. But
+ * get_page_unless_zero() may be running from under us on the
+ * tail_page. If we used atomic_set() below instead of atomic_add(), we
+ * would then run atomic_set() concurrently with
+ * get_page_unless_zero(), and atomic_set() is implemented in C not
+ * using locked ops. spin_unlock on x86 sometime uses locked ops
+ * because of PPro errata 66, 92, so unless somebody can guarantee
+ * atomic_set() here would be safe on all archs (and not only on x86),
+ * it's safer to use atomic_add().
+ */
+ atomic_add(mapcount + 1, &page_tail->_count);
+
+
+ page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ page_tail->flags |= (head->flags &
+ ((1L << PG_referenced) |
+ (1L << PG_swapbacked) |
+ (1L << PG_mlocked) |
+ (1L << PG_uptodate) |
+ (1L << PG_active) |
+ (1L << PG_locked) |
+ (1L << PG_unevictable)));
+ page_tail->flags |= (1L << PG_dirty);
+
+ /*
+ * After clearing PageTail the gup refcount can be released.
+ * Page flags also must be visible before we make the page non-compound.
+ */
+ smp_wmb();
+
+ clear_compound_head(page_tail);
+
+ if (page_is_young(head))
+ set_page_young(page_tail);
+ if (page_is_idle(head))
+ set_page_idle(page_tail);
+
+ /* ->mapping in first tail page is compound_mapcount */
+ VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
+ page_tail);
+ page_tail->mapping = head->mapping;
+
+ page_tail->index = head->index + tail;
+ page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
+ lru_add_page_tail(head, page_tail, lruvec, list);
+
+ return mapcount;
+}
+
+static void __split_huge_page(struct page *page, struct list_head *list)
+{
+ struct page *head = compound_head(page);
+ struct zone *zone = page_zone(head);
+ struct lruvec *lruvec;
+ int i, tail_mapcount;
+
+ /* prevent PageLRU to go away from under us, and freeze lru stats */
+ spin_lock_irq(&zone->lru_lock);
+ lruvec = mem_cgroup_page_lruvec(head, zone);
+
+ spin_lock(&split_queue_lock);
+ if (!list_empty(page_deferred_list(head))) {
+ split_queue_len--;
+ list_del(page_deferred_list(head));
+ }
+ spin_unlock(&split_queue_lock);
+
+ /* complete memcg works before add pages to LRU */
+ mem_cgroup_split_huge_fixup(head);
+
+ tail_mapcount = 0;
+ for (i = HPAGE_PMD_NR - 1; i >= 1; i--)
+ tail_mapcount += __split_huge_page_tail(head, i, lruvec, list);
+ atomic_sub(tail_mapcount, &head->_count);
+
+ ClearPageCompound(head);
+ spin_unlock_irq(&zone->lru_lock);
+
+ unfreeze_page(page_anon_vma(head), head);
+
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ struct page *subpage = head + i;
+ if (subpage == page)
+ continue;
+ unlock_page(subpage);
+
+ /*
+ * Subpages may be freed if there wasn't any mapping
+ * like if add_to_swap() is running on a lru page that
+ * had its mapping zapped. And freeing these pages
+ * requires taking the lru_lock so we do the put_page
+ * of the tail pages after the split is complete.
+ */
+ put_page(subpage);
}
}
+
+int total_mapcount(struct page *page)
+{
+ int i, ret;
+
+ VM_BUG_ON_PAGE(PageTail(page), page);
+
+ if (likely(!PageCompound(page)))
+ return atomic_read(&page->_mapcount) + 1;
+
+ ret = compound_mapcount(page);
+ if (PageHuge(page))
+ return ret;
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ ret += atomic_read(&page[i]._mapcount) + 1;
+ if (PageDoubleMap(page))
+ ret -= HPAGE_PMD_NR;
+ return ret;
+}
+
+/*
+ * This function splits huge page into normal pages. @page can point to any
+ * subpage of huge page to split. Split doesn't change the position of @page.
+ *
+ * Only caller must hold pin on the @page, otherwise split fails with -EBUSY.
+ * The huge page must be locked.
+ *
+ * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
+ *
+ * Both head page and tail pages will inherit mapping, flags, and so on from
+ * the hugepage.
+ *
+ * GUP pin and PG_locked transferred to @page. Rest subpages can be freed if
+ * they are not mapped.
+ *
+ * Returns 0 if the hugepage is split successfully.
+ * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
+ * us.
+ */
+int split_huge_page_to_list(struct page *page, struct list_head *list)
+{
+ struct page *head = compound_head(page);
+ struct anon_vma *anon_vma;
+ int count, mapcount, ret;
+
+ VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
+ VM_BUG_ON_PAGE(!PageAnon(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
+ VM_BUG_ON_PAGE(!PageCompound(page), page);
+
+ /*
+ * The caller does not necessarily hold an mmap_sem that would prevent
+ * the anon_vma disappearing so we first we take a reference to it
+ * and then lock the anon_vma for write. This is similar to
+ * page_lock_anon_vma_read except the write lock is taken to serialise
+ * against parallel split or collapse operations.
+ */
+ anon_vma = page_get_anon_vma(head);
+ if (!anon_vma) {
+ ret = -EBUSY;
+ goto out;
+ }
+ anon_vma_lock_write(anon_vma);
+
+ /*
+ * Racy check if we can split the page, before freeze_page() will
+ * split PMDs
+ */
+ if (total_mapcount(head) != page_count(head) - 1) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ freeze_page(anon_vma, head);
+ VM_BUG_ON_PAGE(compound_mapcount(head), head);
+
+ count = page_count(head);
+ mapcount = total_mapcount(head);
+ if (mapcount == count - 1) {
+ __split_huge_page(page, list);
+ ret = 0;
+ } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount > count - 1) {
+ pr_alert("total_mapcount: %u, page_count(): %u\n",
+ mapcount, count);
+ if (PageTail(page))
+ dump_page(head, NULL);
+ dump_page(page, "total_mapcount(head) > page_count(head) - 1");
+ BUG();
+ } else {
+ unfreeze_page(anon_vma, head);
+ ret = -EBUSY;
+ }
+
+out_unlock:
+ anon_vma_unlock_write(anon_vma);
+ put_anon_vma(anon_vma);
+out:
+ count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
+ return ret;
+}
+
+void free_transhuge_page(struct page *page)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&split_queue_lock, flags);
+ if (!list_empty(page_deferred_list(page))) {
+ split_queue_len--;
+ list_del(page_deferred_list(page));
+ }
+ spin_unlock_irqrestore(&split_queue_lock, flags);
+ free_compound_page(page);
+}
+
+void deferred_split_huge_page(struct page *page)
+{
+ unsigned long flags;
+
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+
+ spin_lock_irqsave(&split_queue_lock, flags);
+ if (list_empty(page_deferred_list(page))) {
+ list_add_tail(page_deferred_list(page), &split_queue);
+ split_queue_len++;
+ }
+ spin_unlock_irqrestore(&split_queue_lock, flags);
+}
+
+static unsigned long deferred_split_count(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ /*
+ * Split a page from split_queue will free up at least one page,
+ * at most HPAGE_PMD_NR - 1. We don't track exact number.
+ * Let's use HPAGE_PMD_NR / 2 as ballpark.
+ */
+ return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2;
+}
+
+static unsigned long deferred_split_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ unsigned long flags;
+ LIST_HEAD(list), *pos, *next;
+ struct page *page;
+ int split = 0;
+
+ spin_lock_irqsave(&split_queue_lock, flags);
+ list_splice_init(&split_queue, &list);
+
+ /* Take pin on all head pages to avoid freeing them under us */
+ list_for_each_safe(pos, next, &list) {
+ page = list_entry((void *)pos, struct page, mapping);
+ page = compound_head(page);
+ /* race with put_compound_page() */
+ if (!get_page_unless_zero(page)) {
+ list_del_init(page_deferred_list(page));
+ split_queue_len--;
+ }
+ }
+ spin_unlock_irqrestore(&split_queue_lock, flags);
+
+ list_for_each_safe(pos, next, &list) {
+ page = list_entry((void *)pos, struct page, mapping);
+ lock_page(page);
+ /* split_huge_page() removes page from list on success */
+ if (!split_huge_page(page))
+ split++;
+ unlock_page(page);
+ put_page(page);
+ }
+
+ spin_lock_irqsave(&split_queue_lock, flags);
+ list_splice_tail(&list, &split_queue);
+ spin_unlock_irqrestore(&split_queue_lock, flags);
+
+ return split * HPAGE_PMD_NR / 2;
+}
+
+static struct shrinker deferred_split_shrinker = {
+ .count_objects = deferred_split_count,
+ .scan_objects = deferred_split_scan,
+ .seeks = DEFAULT_SEEKS,
+};
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 827bb02a43a4..acc8b9ab3a5e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1258,8 +1258,8 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order)
/* we rely on prep_new_huge_page to set the destructor */
set_compound_order(page, order);
- __SetPageHead(page);
__ClearPageReserved(page);
+ __SetPageHead(page);
for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
/*
* For gigantic hugepages allocated through bootmem at
@@ -3126,7 +3126,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
entry = huge_ptep_get(src_pte);
ptepage = pte_page(entry);
get_page(ptepage);
- page_dup_rmap(ptepage);
+ page_dup_rmap(ptepage, true);
set_huge_pte_at(dst, addr, dst_pte, entry);
hugetlb_count_add(pages_per_huge_page(h), dst);
}
@@ -3210,7 +3210,7 @@ again:
set_page_dirty(page);
hugetlb_count_sub(pages_per_huge_page(h), mm);
- page_remove_rmap(page);
+ page_remove_rmap(page, true);
force_flush = !__tlb_remove_page(tlb, page);
if (force_flush) {
address += sz;
@@ -3439,7 +3439,7 @@ retry_avoidcopy:
mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
set_huge_pte_at(mm, address, ptep,
make_huge_pte(vma, new_page, 1));
- page_remove_rmap(old_page);
+ page_remove_rmap(old_page, true);
hugepage_add_new_anon_rmap(new_page, vma, address);
/* Make the old page be freed below */
new_page = old_page;
@@ -3609,7 +3609,7 @@ retry:
ClearPagePrivate(page);
hugepage_add_new_anon_rmap(page, vma, address);
} else
- page_dup_rmap(page);
+ page_dup_rmap(page, true);
new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
set_huge_pte_at(mm, address, ptep, new_pte);
@@ -3889,7 +3889,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
same_page:
if (pages) {
pages[i] = mem_map_offset(page, pfn_offset);
- get_page_foll(pages[i]);
+ get_page(pages[i]);
}
if (vmas)
diff --git a/mm/internal.h b/mm/internal.h
index 38e24b89e4c4..4ae7b7c7462b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -13,6 +13,7 @@
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/pagemap.h>
/*
* The set of flags that only affect watermark checking and reclaim
@@ -33,6 +34,10 @@
/* Do not use these with a slab allocator */
#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
+extern int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ unsigned int flags, pte_t orig_pte);
+
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
@@ -66,50 +71,6 @@ static inline void set_page_refcounted(struct page *page)
set_page_count(page, 1);
}
-static inline void __get_page_tail_foll(struct page *page,
- bool get_page_head)
-{
- /*
- * If we're getting a tail page, the elevated page->_count is
- * required only in the head page and we will elevate the head
- * page->_count and tail page->_mapcount.
- *
- * We elevate page_tail->_mapcount for tail pages to force
- * page_tail->_count to be zero at all times to avoid getting
- * false positives from get_page_unless_zero() with
- * speculative page access (like in
- * page_cache_get_speculative()) on tail pages.
- */
- VM_BUG_ON_PAGE(atomic_read(&compound_head(page)->_count) <= 0, page);
- if (get_page_head)
- atomic_inc(&compound_head(page)->_count);
- get_huge_page_tail(page);
-}
-
-/*
- * This is meant to be called as the FOLL_GET operation of
- * follow_page() and it must be called while holding the proper PT
- * lock while the pte (or pmd_trans_huge) is still mapping the page.
- */
-static inline void get_page_foll(struct page *page)
-{
- if (unlikely(PageTail(page)))
- /*
- * This is safe only because
- * __split_huge_page_refcount() can't run under
- * get_page_foll() because we hold the proper PT lock.
- */
- __get_page_tail_foll(page, true);
- else {
- /*
- * Getting a normal page or the head of a compound page
- * requires to already have an elevated page->_count.
- */
- VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
- atomic_inc(&page->_count);
- }
-}
-
extern unsigned long highest_memmap_pfn;
/*
@@ -309,10 +270,27 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-extern unsigned long vma_address(struct page *page,
- struct vm_area_struct *vma);
-#endif
+/*
+ * At what user virtual address is page expected in @vma?
+ */
+static inline unsigned long
+__vma_address(struct page *page, struct vm_area_struct *vma)
+{
+ pgoff_t pgoff = page_to_pgoff(page);
+ return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+}
+
+static inline unsigned long
+vma_address(struct page *page, struct vm_area_struct *vma)
+{
+ unsigned long address = __vma_address(page, vma);
+
+ /* page should be within @vma mapping range */
+ VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
+
+ return address;
+}
+
#else /* !CONFIG_MMU */
static inline void clear_page_mlock(struct page *page) { }
static inline void mlock_vma_page(struct page *page) { }
diff --git a/mm/ksm.c b/mm/ksm.c
index b5cd647daa52..30cb0f753e19 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -441,20 +441,6 @@ static void break_cow(struct rmap_item *rmap_item)
up_read(&mm->mmap_sem);
}
-static struct page *page_trans_compound_anon(struct page *page)
-{
- if (PageTransCompound(page)) {
- struct page *head = compound_head(page);
- /*
- * head may actually be splitted and freed from under
- * us but it's ok here.
- */
- if (PageAnon(head))
- return head;
- }
- return NULL;
-}
-
static struct page *get_mergeable_page(struct rmap_item *rmap_item)
{
struct mm_struct *mm = rmap_item->mm;
@@ -470,7 +456,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
page = follow_page(vma, addr, FOLL_GET);
if (IS_ERR_OR_NULL(page))
goto out;
- if (PageAnon(page) || page_trans_compound_anon(page)) {
+ if (PageAnon(page)) {
flush_anon_page(vma, page, addr);
flush_dcache_page(page);
} else {
@@ -958,13 +944,13 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
}
get_page(kpage);
- page_add_anon_rmap(kpage, vma, addr);
+ page_add_anon_rmap(kpage, vma, addr, false);
flush_cache_page(vma, addr, pte_pfn(*ptep));
ptep_clear_flush_notify(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
- page_remove_rmap(page);
+ page_remove_rmap(page, false);
if (!page_mapped(page))
try_to_free_swap(page);
put_page(page);
@@ -977,33 +963,6 @@ out:
return err;
}
-static int page_trans_compound_anon_split(struct page *page)
-{
- int ret = 0;
- struct page *transhuge_head = page_trans_compound_anon(page);
- if (transhuge_head) {
- /* Get the reference on the head to split it. */
- if (get_page_unless_zero(transhuge_head)) {
- /*
- * Recheck we got the reference while the head
- * was still anonymous.
- */
- if (PageAnon(transhuge_head))
- ret = split_huge_page(transhuge_head);
- else
- /*
- * Retry later if split_huge_page run
- * from under us.
- */
- ret = 1;
- put_page(transhuge_head);
- } else
- /* Retry later if split_huge_page run from under us. */
- ret = 1;
- }
- return ret;
-}
-
/*
* try_to_merge_one_page - take two pages and merge them into one
* @vma: the vma that holds the pte pointing to page
@@ -1022,9 +981,6 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
if (page == kpage) /* ksm page forked */
return 0;
- if (PageTransCompound(page) && page_trans_compound_anon_split(page))
- goto out;
- BUG_ON(PageTransCompound(page));
if (!PageAnon(page))
goto out;
@@ -1037,6 +993,13 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
*/
if (!trylock_page(page))
goto out;
+
+ if (PageTransCompound(page)) {
+ err = split_huge_page(page);
+ if (err)
+ goto out_unlock;
+ }
+
/*
* If this anonymous page is mapped only here, its pte may need
* to be write-protected. If it's mapped elsewhere, all of its
@@ -1067,6 +1030,7 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
}
}
+out_unlock:
unlock_page(page);
out:
return err;
@@ -1639,8 +1603,7 @@ next_mm:
cond_resched();
continue;
}
- if (PageAnon(*page) ||
- page_trans_compound_anon(*page)) {
+ if (PageAnon(*page)) {
flush_anon_page(vma, *page, ksm_scan.address);
flush_dcache_page(*page);
rmap_item = get_next_rmap_item(slot,
@@ -1903,7 +1866,7 @@ struct page *ksm_might_need_to_copy(struct page *page,
SetPageDirty(new_page);
__SetPageUptodate(new_page);
- __set_page_locked(new_page);
+ __SetPageLocked(new_page);
}
return new_page;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9acfb165eb52..fcd7c4e83c87 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -696,7 +696,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
struct page *page,
- int nr_pages)
+ bool compound, int nr_pages)
{
/*
* Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
@@ -709,9 +709,11 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
nr_pages);
- if (PageTransHuge(page))
+ if (compound) {
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
nr_pages);
+ }
/* pagein of a big page is an event. So, ignore page size */
if (nr_pages > 0)
@@ -2435,9 +2437,7 @@ void __memcg_kmem_uncharge(struct page *page, int order)
/*
* Because tail pages are not marked as "used", set it. We're under
- * zone->lru_lock, 'splitting on pmd' and compound_lock.
- * charge/uncharge will be never happen and move_account() is done under
- * compound_lock(), so we don't have to take care of races.
+ * zone->lru_lock and migration entries setup in all page mappings.
*/
void mem_cgroup_split_huge_fixup(struct page *head)
{
@@ -4499,38 +4499,30 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
* @from: mem_cgroup which the page is moved from.
* @to: mem_cgroup which the page is moved to. @from != @to.
*
- * The caller must confirm following.
- * - page is not on LRU (isolate_page() is useful.)
- * - compound_lock is held when nr_pages > 1
+ * The caller must make sure the page is not on LRU (isolate_page() is useful.)
*
* This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
* from old cgroup.
*/
static int mem_cgroup_move_account(struct page *page,
- unsigned int nr_pages,
+ bool compound,
struct mem_cgroup *from,
struct mem_cgroup *to)
{
unsigned long flags;
+ unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
int ret;
bool anon;
VM_BUG_ON(from == to);
VM_BUG_ON_PAGE(PageLRU(page), page);
- /*
- * The page is isolated from LRU. So, collapse function
- * will not handle this page. But page splitting can happen.
- * Do this check under compound_page_lock(). The caller should
- * hold it.
- */
- ret = -EBUSY;
- if (nr_pages > 1 && !PageTransHuge(page))
- goto out;
+ VM_BUG_ON(compound && !PageTransHuge(page));
/*
* Prevent mem_cgroup_replace_page() from looking at
* page->mem_cgroup of its source page while we change it.
*/
+ ret = -EBUSY;
if (!trylock_page(page))
goto out;
@@ -4585,9 +4577,9 @@ static int mem_cgroup_move_account(struct page *page,
ret = 0;
local_irq_disable();
- mem_cgroup_charge_statistics(to, page, nr_pages);
+ mem_cgroup_charge_statistics(to, page, compound, nr_pages);
memcg_check_events(to, page);
- mem_cgroup_charge_statistics(from, page, -nr_pages);
+ mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
memcg_check_events(from, page);
local_irq_enable();
out_unlock:
@@ -4677,7 +4669,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
pte_t *pte;
spinlock_t *ptl;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
mc.precharge += HPAGE_PMD_NR;
spin_unlock(ptl);
@@ -4861,17 +4853,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
union mc_target target;
struct page *page;
- /*
- * We don't take compound_lock() here but no race with splitting thp
- * happens because:
- * - if pmd_trans_huge_lock() returns 1, the relevant thp is not
- * under splitting, which means there's no concurrent thp split,
- * - if another thread runs into split_huge_page() just after we
- * entered this if-block, the thread must wait for page table lock
- * to be unlocked in __split_huge_page_splitting(), where the main
- * part of thp split is not executed yet.
- */
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
if (mc.precharge < HPAGE_PMD_NR) {
spin_unlock(ptl);
return 0;
@@ -4880,7 +4862,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
if (target_type == MC_TARGET_PAGE) {
page = target.page;
if (!isolate_lru_page(page)) {
- if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
+ if (!mem_cgroup_move_account(page, true,
mc.from, mc.to)) {
mc.precharge -= HPAGE_PMD_NR;
mc.moved_charge += HPAGE_PMD_NR;
@@ -4909,7 +4891,8 @@ retry:
page = target.page;
if (isolate_lru_page(page))
goto put;
- if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) {
+ if (!mem_cgroup_move_account(page, false,
+ mc.from, mc.to)) {
mc.precharge--;
/* we uncharge from mc.from later. */
mc.moved_charge++;
@@ -5250,10 +5233,11 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
* with mem_cgroup_cancel_charge() in case page instantiation fails.
*/
int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcgp)
+ gfp_t gfp_mask, struct mem_cgroup **memcgp,
+ bool compound)
{
struct mem_cgroup *memcg = NULL;
- unsigned int nr_pages = 1;
+ unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
int ret = 0;
if (mem_cgroup_disabled())
@@ -5283,11 +5267,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
}
}
- if (PageTransHuge(page)) {
- nr_pages <<= compound_order(page);
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- }
-
if (!memcg)
memcg = get_mem_cgroup_from_mm(mm);
@@ -5316,9 +5295,9 @@ out:
* Use mem_cgroup_cancel_charge() to cancel the transaction instead.
*/
void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
- bool lrucare)
+ bool lrucare, bool compound)
{
- unsigned int nr_pages = 1;
+ unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
VM_BUG_ON_PAGE(!page->mapping, page);
VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
@@ -5335,13 +5314,8 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
commit_charge(page, memcg, lrucare);
- if (PageTransHuge(page)) {
- nr_pages <<= compound_order(page);
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- }
-
local_irq_disable();
- mem_cgroup_charge_statistics(memcg, page, nr_pages);
+ mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
memcg_check_events(memcg, page);
local_irq_enable();
@@ -5363,9 +5337,10 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
*
* Cancel a charge transaction started by mem_cgroup_try_charge().
*/
-void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
+void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
+ bool compound)
{
- unsigned int nr_pages = 1;
+ unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
if (mem_cgroup_disabled())
return;
@@ -5377,11 +5352,6 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
if (!memcg)
return;
- if (PageTransHuge(page)) {
- nr_pages <<= compound_order(page);
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- }
-
cancel_charge(memcg, nr_pages);
}
@@ -5627,7 +5597,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
* only synchronisation we have for udpating the per-CPU variables.
*/
VM_BUG_ON(!irqs_disabled());
- mem_cgroup_charge_statistics(memcg, page, -1);
+ mem_cgroup_charge_statistics(memcg, page, false, -1);
memcg_check_events(memcg, page);
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 8424b64711ac..1b99403d76f2 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -882,15 +882,7 @@ int get_hwpoison_page(struct page *page)
{
struct page *head = compound_head(page);
- if (PageHuge(head))
- return get_page_unless_zero(head);
-
- /*
- * Thp tail page has special refcounting rule (refcount of tail pages
- * is stored in ->_mapcount,) so we can't call get_page_unless_zero()
- * directly for tail pages.
- */
- if (PageTransHuge(head)) {
+ if (!PageHuge(head) && PageTransHuge(head)) {
/*
* Non anonymous thp exists only in allocation/free time. We
* can't handle such a case correctly, so let's give it up.
@@ -902,41 +894,12 @@ int get_hwpoison_page(struct page *page)
page_to_pfn(page));
return 0;
}
-
- if (get_page_unless_zero(head)) {
- if (PageTail(page))
- get_page(page);
- return 1;
- } else {
- return 0;
- }
}
- return get_page_unless_zero(page);
+ return get_page_unless_zero(head);
}
EXPORT_SYMBOL_GPL(get_hwpoison_page);
-/**
- * put_hwpoison_page() - Put refcount for memory error handling:
- * @page: raw error page (hit by memory error)
- */
-void put_hwpoison_page(struct page *page)
-{
- struct page *head = compound_head(page);
-
- if (PageHuge(head)) {
- put_page(head);
- return;
- }
-
- if (PageTransHuge(head))
- if (page != head)
- put_page(head);
-
- put_page(page);
-}
-EXPORT_SYMBOL_GPL(put_hwpoison_page);
-
/*
* Do all that is necessary to remove user space mappings. Unmap
* the pages and send SIGBUS to the processes if the data was dirty.
@@ -1149,7 +1112,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
}
if (!PageHuge(p) && PageTransHuge(hpage)) {
+ lock_page(hpage);
if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
+ unlock_page(hpage);
if (!PageAnon(hpage))
pr_err("MCE: %#lx: non anonymous thp\n", pfn);
else
@@ -1159,6 +1124,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
put_hwpoison_page(p);
return -EBUSY;
}
+ unlock_page(hpage);
+ get_hwpoison_page(p);
+ put_hwpoison_page(hpage);
VM_BUG_ON_PAGE(!page_count(p), p);
hpage = compound_head(p);
}
@@ -1166,7 +1134,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
/*
* We ignore non-LRU pages for good reasons.
* - PG_locked is only well defined for LRU pages and a few others
- * - to avoid races with __set_page_locked()
+ * - to avoid races with __SetPageLocked()
* - to avoid races with __SetPageSlab*() (and more non-atomic ops)
* The check (unnecessarily) ignores LRU pages being isolated and
* walked by the page reclaim code, however that's not a big loss.
@@ -1572,7 +1540,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
* Did it turn free?
*/
ret = __get_any_page(page, pfn, 0);
- if (!PageLRU(page)) {
+ if (ret == 1 && !PageLRU(page)) {
/* Drop page reference which is from __get_any_page() */
put_hwpoison_page(page);
pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
@@ -1750,21 +1718,28 @@ int soft_offline_page(struct page *page, int flags)
put_hwpoison_page(page);
return -EBUSY;
}
- if (!PageHuge(page) && PageTransHuge(hpage)) {
- if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
- pr_info("soft offline: %#lx: failed to split THP\n",
- pfn);
- if (flags & MF_COUNT_INCREASED)
- put_hwpoison_page(page);
- return -EBUSY;
- }
- }
get_online_mems();
-
ret = get_any_page(page, pfn, flags);
put_online_mems();
+
if (ret > 0) { /* for in-use pages */
+ if (!PageHuge(page) && PageTransHuge(hpage)) {
+ lock_page(hpage);
+ ret = split_huge_page(hpage);
+ unlock_page(hpage);
+ if (unlikely(ret || PageTransCompound(page) ||
+ !PageAnon(page))) {
+ pr_info("soft offline: %#lx: failed to split THP\n",
+ pfn);
+ if (flags & MF_COUNT_INCREASED)
+ put_hwpoison_page(hpage);
+ return -EBUSY;
+ }
+ get_hwpoison_page(page);
+ put_hwpoison_page(hpage);
+ }
+
if (PageHuge(page))
ret = soft_offline_huge_page(page, flags);
else
diff --git a/mm/memory.c b/mm/memory.c
index deb679c31f2a..7f3b9f2769ad 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -566,7 +566,6 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
{
spinlock_t *ptl;
pgtable_t new = pte_alloc_one(mm, address);
- int wait_split_huge_page;
if (!new)
return -ENOMEM;
@@ -586,18 +585,14 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
ptl = pmd_lock(mm, pmd);
- wait_split_huge_page = 0;
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
atomic_long_inc(&mm->nr_ptes);
pmd_populate(mm, pmd, new);
new = NULL;
- } else if (unlikely(pmd_trans_splitting(*pmd)))
- wait_split_huge_page = 1;
+ }
spin_unlock(ptl);
if (new)
pte_free(mm, new);
- if (wait_split_huge_page)
- wait_split_huge_page(vma->anon_vma, pmd);
return 0;
}
@@ -613,8 +608,7 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
pmd_populate_kernel(&init_mm, pmd, new);
new = NULL;
- } else
- VM_BUG_ON(pmd_trans_splitting(*pmd));
+ }
spin_unlock(&init_mm.page_table_lock);
if (new)
pte_free_kernel(&init_mm, new);
@@ -873,7 +867,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
page = vm_normal_page(vma, addr, pte);
if (page) {
get_page(page);
- page_dup_rmap(page);
+ page_dup_rmap(page, false);
if (PageAnon(page))
rss[MM_ANONPAGES]++;
else
@@ -1125,7 +1119,7 @@ again:
mark_page_accessed(page);
rss[MM_FILEPAGES]--;
}
- page_remove_rmap(page);
+ page_remove_rmap(page, false);
if (unlikely(page_mapcount(page) < 0))
print_bad_pte(vma, addr, ptent, page);
if (unlikely(!__tlb_remove_page(tlb, page))) {
@@ -1204,7 +1198,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
BUG();
}
#endif
- split_huge_page_pmd(vma, addr, pmd);
+ split_huge_pmd(vma, pmd, addr);
} else if (zap_huge_pmd(tlb, vma, pmd, addr))
goto next;
/* fall through */
@@ -2083,7 +2077,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
cow_user_page(new_page, old_page, address, vma);
}
- if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
+ if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
goto oom_free_new;
__SetPageUptodate(new_page);
@@ -2113,8 +2107,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
* thread doing COW.
*/
ptep_clear_flush_notify(vma, address, page_table);
- page_add_new_anon_rmap(new_page, vma, address);
- mem_cgroup_commit_charge(new_page, memcg, false);
+ page_add_new_anon_rmap(new_page, vma, address, false);
+ mem_cgroup_commit_charge(new_page, memcg, false, false);
lru_cache_add_active_or_unevictable(new_page, vma);
/*
* We call the notify macro here because, when using secondary
@@ -2146,14 +2140,14 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
* mapcount is visible. So transitively, TLBs to
* old page will be flushed before it can be reused.
*/
- page_remove_rmap(old_page);
+ page_remove_rmap(old_page, false);
}
/* Free the old page.. */
new_page = old_page;
page_copied = 1;
} else {
- mem_cgroup_cancel_charge(new_page, memcg);
+ mem_cgroup_cancel_charge(new_page, memcg, false);
}
if (new_page)
@@ -2168,7 +2162,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
*/
if (page_copied && (vma->vm_flags & VM_LOCKED)) {
lock_page(old_page); /* LRU manipulation */
- munlock_vma_page(old_page);
+ if (PageMlocked(old_page))
+ munlock_vma_page(old_page);
unlock_page(old_page);
}
page_cache_release(old_page);
@@ -2443,7 +2438,7 @@ EXPORT_SYMBOL(unmap_mapping_range);
* We return with the mmap_sem locked or unlocked in the same cases
* as does filemap_fault().
*/
-static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
+int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags, pte_t orig_pte)
{
@@ -2528,7 +2523,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto out_page;
}
- if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) {
+ if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) {
ret = VM_FAULT_OOM;
goto out_page;
}
@@ -2562,7 +2557,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
flags &= ~FAULT_FLAG_WRITE;
ret |= VM_FAULT_WRITE;
- exclusive = 1;
+ exclusive = RMAP_EXCLUSIVE;
}
flush_icache_page(vma, page);
if (pte_swp_soft_dirty(orig_pte))
@@ -2570,10 +2565,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
set_pte_at(mm, address, page_table, pte);
if (page == swapcache) {
do_page_add_anon_rmap(page, vma, address, exclusive);
- mem_cgroup_commit_charge(page, memcg, true);
+ mem_cgroup_commit_charge(page, memcg, true, false);
} else { /* ksm created a completely new copy */
- page_add_new_anon_rmap(page, vma, address);
- mem_cgroup_commit_charge(page, memcg, false);
+ page_add_new_anon_rmap(page, vma, address, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
}
@@ -2608,7 +2603,7 @@ unlock:
out:
return ret;
out_nomap:
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
pte_unmap_unlock(page_table, ptl);
out_page:
unlock_page(page);
@@ -2702,7 +2697,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (!page)
goto oom;
- if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
+ if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false))
goto oom_free_page;
/*
@@ -2723,15 +2718,15 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(page_table, ptl);
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
page_cache_release(page);
return handle_userfault(vma, address, flags,
VM_UFFD_MISSING);
}
inc_mm_counter_fast(mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, address);
- mem_cgroup_commit_charge(page, memcg, false);
+ page_add_new_anon_rmap(page, vma, address, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
setpte:
set_pte_at(mm, address, page_table, entry);
@@ -2742,7 +2737,7 @@ unlock:
pte_unmap_unlock(page_table, ptl);
return 0;
release:
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
page_cache_release(page);
goto unlock;
oom_free_page:
@@ -2818,7 +2813,7 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address,
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (anon) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, address);
+ page_add_new_anon_rmap(page, vma, address, false);
} else {
inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES);
page_add_file_rmap(page);
@@ -2993,7 +2988,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (!new_page)
return VM_FAULT_OOM;
- if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) {
+ if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) {
page_cache_release(new_page);
return VM_FAULT_OOM;
}
@@ -3022,7 +3017,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
goto uncharge_out;
}
do_set_pte(vma, address, new_page, pte, true, true);
- mem_cgroup_commit_charge(new_page, memcg, false);
+ mem_cgroup_commit_charge(new_page, memcg, false, false);
lru_cache_add_active_or_unevictable(new_page, vma);
pte_unmap_unlock(pte, ptl);
if (fault_page) {
@@ -3037,7 +3032,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
return ret;
uncharge_out:
- mem_cgroup_cancel_charge(new_page, memcg);
+ mem_cgroup_cancel_charge(new_page, memcg, false);
page_cache_release(new_page);
return ret;
}
@@ -3089,7 +3084,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* pinned by vma->vm_file's reference. We rely on unlock_page()'s
* release semantics to prevent the compiler from undoing this copying.
*/
- mapping = fault_page->mapping;
+ mapping = page_rmapping(fault_page);
unlock_page(fault_page);
if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
/*
@@ -3191,6 +3186,12 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
return 0;
}
+ /* TODO: handle PTE-mapped THP */
+ if (PageCompound(page)) {
+ pte_unmap_unlock(ptep, ptl);
+ return 0;
+ }
+
/*
* Avoid grouping on RO pages in general. RO pages shouldn't hurt as
* much anyway since they can be in shared cache state. This misses
@@ -3366,14 +3367,6 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (pmd_trans_huge(orig_pmd)) {
unsigned int dirty = flags & FAULT_FLAG_WRITE;
- /*
- * If the pmd is splitting, return and retry the
- * the fault. Alternative: wait until the split
- * is done, and goto retry.
- */
- if (pmd_trans_splitting(orig_pmd))
- return 0;
-
if (pmd_protnone(orig_pmd))
return do_huge_pmd_numa_page(mm, vma, address,
orig_pmd, pmd);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 87a177917cb2..4a02de4e173f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -489,14 +489,31 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
struct page *page;
struct queue_pages *qp = walk->private;
unsigned long flags = qp->flags;
- int nid;
+ int nid, ret;
pte_t *pte;
spinlock_t *ptl;
- split_huge_page_pmd(vma, addr, pmd);
- if (pmd_trans_unstable(pmd))
- return 0;
+ if (pmd_trans_huge(*pmd)) {
+ ptl = pmd_lock(walk->mm, pmd);
+ if (pmd_trans_huge(*pmd)) {
+ page = pmd_page(*pmd);
+ if (is_huge_zero_page(page)) {
+ spin_unlock(ptl);
+ split_huge_pmd(vma, pmd, addr);
+ } else {
+ get_page(page);
+ spin_unlock(ptl);
+ lock_page(page);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ if (ret)
+ return 0;
+ }
+ }
+ }
+retry:
pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE) {
if (!pte_present(*pte))
@@ -513,6 +530,21 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
nid = page_to_nid(page);
if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
continue;
+ if (PageTail(page) && PageAnon(page)) {
+ get_page(page);
+ pte_unmap_unlock(pte, ptl);
+ lock_page(page);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ /* Failed to split -- skip. */
+ if (ret) {
+ pte = pte_offset_map_lock(walk->mm, pmd,
+ addr, &ptl);
+ continue;
+ }
+ goto retry;
+ }
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
migrate_page_add(page, qp->pagelist, flags);
diff --git a/mm/migrate.c b/mm/migrate.c
index 7890d0bb5e23..b1034f9c77e7 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -165,9 +165,9 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
if (PageAnon(new))
hugepage_add_anon_rmap(new, vma, addr);
else
- page_dup_rmap(new);
+ page_dup_rmap(new, true);
} else if (PageAnon(new))
- page_add_anon_rmap(new, vma, addr);
+ page_add_anon_rmap(new, vma, addr, false);
else
page_add_file_rmap(new);
@@ -943,9 +943,13 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
goto out;
}
- if (unlikely(PageTransHuge(page)))
- if (unlikely(split_huge_page(page)))
+ if (unlikely(PageTransHuge(page))) {
+ lock_page(page);
+ rc = split_huge_page(page);
+ unlock_page(page);
+ if (rc)
goto out;
+ }
rc = __unmap_and_move(page, newpage, force, mode);
if (rc == MIGRATEPAGE_SUCCESS)
@@ -1756,6 +1760,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
HPAGE_PMD_ORDER);
if (!new_page)
goto out_fail;
+ prep_transhuge_page(new_page);
isolated = numamigrate_isolate_page(pgdat, page);
if (!isolated) {
@@ -1767,7 +1772,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
flush_tlb_range(vma, mmun_start, mmun_end);
/* Prepare a page as a migration target */
- __set_page_locked(new_page);
+ __SetPageLocked(new_page);
SetPageSwapBacked(new_page);
/* anon mapping, we can simply copy page->mapping to the new page: */
@@ -1815,7 +1820,7 @@ fail_putback:
* guarantee the copy is visible before the pagetable update.
*/
flush_cache_range(vma, mmun_start, mmun_end);
- page_add_anon_rmap(new_page, vma, mmun_start);
+ page_add_anon_rmap(new_page, vma, mmun_start, true);
pmdp_huge_clear_flush_notify(vma, mmun_start, pmd);
set_pmd_at(mm, mmun_start, pmd, entry);
flush_tlb_range(vma, mmun_start, mmun_end);
@@ -1826,14 +1831,14 @@ fail_putback:
flush_tlb_range(vma, mmun_start, mmun_end);
mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
update_mmu_cache_pmd(vma, address, &entry);
- page_remove_rmap(new_page);
+ page_remove_rmap(new_page, true);
goto fail_putback;
}
mlock_migrate_page(new_page, page);
set_page_memcg(new_page, page_memcg(page));
set_page_memcg(page, NULL);
- page_remove_rmap(page);
+ page_remove_rmap(page, true);
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
diff --git a/mm/mincore.c b/mm/mincore.c
index 14bb9fb37f0c..2a565ed8bb49 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -117,7 +117,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
unsigned char *vec = walk->private;
int nr = (end - addr) >> PAGE_SHIFT;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
memset(vec, 1, nr);
spin_unlock(ptl);
goto out;
diff --git a/mm/mlock.c b/mm/mlock.c
index 339d9e0949b6..0147b57f9704 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -82,6 +82,9 @@ void mlock_vma_page(struct page *page)
/* Serialize with page migration */
BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
+
if (!TestSetPageMlocked(page)) {
mod_zone_page_state(page_zone(page), NR_MLOCK,
hpage_nr_pages(page));
@@ -178,6 +181,8 @@ unsigned int munlock_vma_page(struct page *page)
/* For try_to_munlock() and to serialize with page migration */
BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(PageTail(page), page);
+
/*
* Serialize with any parallel __split_huge_page_refcount() which
* might otherwise copy PageMlocked to part of the tail pages before
@@ -444,7 +449,10 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
&page_mask);
if (page && !IS_ERR(page)) {
- if (PageTransHuge(page)) {
+ if (PageTransTail(page)) {
+ VM_BUG_ON_PAGE(PageMlocked(page), page);
+ put_page(page); /* follow_page_mask() */
+ } else if (PageTransHuge(page)) {
lock_page(page);
/*
* Any THP page found by follow_page_mask() may
@@ -477,8 +485,6 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
goto next;
}
}
- /* It's a bug to munlock in the middle of a THP page */
- VM_BUG_ON((start >> PAGE_SHIFT) & page_mask);
page_increm = 1 + page_mask;
start += page_increm * PAGE_SIZE;
next:
diff --git a/mm/mprotect.c b/mm/mprotect.c
index ef5be8eaab00..9c1445dc8a4c 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -160,7 +160,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
if (pmd_trans_huge(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE)
- split_huge_page_pmd(vma, addr, pmd);
+ split_huge_pmd(vma, pmd, addr);
else {
int nr_ptes = change_huge_pmd(vma, pmd, addr,
newprot, prot_numa);
diff --git a/mm/mremap.c b/mm/mremap.c
index c25bc6268e46..9acf51a4f682 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -192,25 +192,24 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
if (!new_pmd)
break;
if (pmd_trans_huge(*old_pmd)) {
- int err = 0;
if (extent == HPAGE_PMD_SIZE) {
+ bool moved;
VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma,
vma);
/* See comment in move_ptes() */
if (need_rmap_locks)
anon_vma_lock_write(vma->anon_vma);
- err = move_huge_pmd(vma, new_vma, old_addr,
+ moved = move_huge_pmd(vma, new_vma, old_addr,
new_addr, old_end,
old_pmd, new_pmd);
if (need_rmap_locks)
anon_vma_unlock_write(vma->anon_vma);
+ if (moved) {
+ need_flush = true;
+ continue;
+ }
}
- if (err > 0) {
- need_flush = true;
- continue;
- } else if (!err) {
- split_huge_page_pmd(vma, old_addr, old_pmd);
- }
+ split_huge_pmd(vma, old_pmd, old_addr);
VM_BUG_ON(pmd_trans_huge(*old_pmd));
}
if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 17a3c66639a9..9c0908c97f09 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -229,13 +229,15 @@ static char * const zone_names[MAX_NR_ZONES] = {
#endif
};
-static void free_compound_page(struct page *page);
compound_page_dtor * const compound_page_dtors[] = {
NULL,
free_compound_page,
#ifdef CONFIG_HUGETLB_PAGE
free_huge_page,
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ free_transhuge_page,
+#endif
};
int min_free_kbytes = 1024;
@@ -457,7 +459,7 @@ out:
* This usage means that zero-order pages may not be compound.
*/
-static void free_compound_page(struct page *page)
+void free_compound_page(struct page *page)
{
__free_pages_ok(page, compound_order(page));
}
@@ -473,8 +475,10 @@ void prep_compound_page(struct page *page, unsigned int order)
for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
set_page_count(p, 0);
+ p->mapping = TAIL_MAPPING;
set_compound_head(p, page);
}
+ atomic_set(compound_mapcount_ptr(page), -1);
}
#ifdef CONFIG_DEBUG_PAGEALLOC
@@ -739,7 +743,7 @@ static inline int free_pages_check(struct page *page)
const char *bad_reason = NULL;
unsigned long bad_flags = 0;
- if (unlikely(page_mapcount(page)))
+ if (unlikely(atomic_read(&page->_mapcount) != -1))
bad_reason = "nonzero mapcount";
if (unlikely(page->mapping != NULL))
bad_reason = "non-NULL mapping";
@@ -863,6 +867,27 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
ret = 0;
goto out;
}
+ switch (page - head_page) {
+ case 1:
+ /* the first tail page: ->mapping is compound_mapcount() */
+ if (unlikely(compound_mapcount(page))) {
+ bad_page(page, "nonzero compound_mapcount", 0);
+ goto out;
+ }
+ break;
+ case 2:
+ /*
+ * the second tail page: ->mapping is
+ * page_deferred_list().next -- ignore value.
+ */
+ break;
+ default:
+ if (page->mapping != TAIL_MAPPING) {
+ bad_page(page, "corrupted mapping in tail page", 0);
+ goto out;
+ }
+ break;
+ }
if (unlikely(!PageTail(page))) {
bad_page(page, "PageTail not set", 0);
goto out;
@@ -873,6 +898,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
}
ret = 0;
out:
+ page->mapping = NULL;
clear_compound_head(page);
return ret;
}
@@ -1336,7 +1362,7 @@ static inline int check_new_page(struct page *page)
const char *bad_reason = NULL;
unsigned long bad_flags = 0;
- if (unlikely(page_mapcount(page)))
+ if (unlikely(atomic_read(&page->_mapcount) != -1))
bad_reason = "nonzero mapcount";
if (unlikely(page->mapping != NULL))
bad_reason = "non-NULL mapping";
diff --git a/mm/page_idle.c b/mm/page_idle.c
index d5dd79041484..2c553ba969f8 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -56,24 +56,70 @@ static int page_idle_clear_pte_refs_one(struct page *page,
{
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
+ pgd_t *pgd;
+ pud_t *pud;
pmd_t *pmd;
pte_t *pte;
bool referenced = false;
- if (unlikely(PageTransHuge(page))) {
- pmd = page_check_address_pmd(page, mm, addr,
- PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
- if (pmd) {
- referenced = pmdp_clear_young_notify(vma, addr, pmd);
+ pgd = pgd_offset(mm, addr);
+ if (!pgd_present(*pgd))
+ return SWAP_AGAIN;
+ pud = pud_offset(pgd, addr);
+ if (!pud_present(*pud))
+ return SWAP_AGAIN;
+ pmd = pmd_offset(pud, addr);
+
+ if (pmd_trans_huge(*pmd)) {
+ ptl = pmd_lock(mm, pmd);
+ if (!pmd_present(*pmd))
+ goto unlock_pmd;
+ if (unlikely(!pmd_trans_huge(*pmd))) {
spin_unlock(ptl);
+ goto map_pte;
}
+
+ if (pmd_page(*pmd) != page)
+ goto unlock_pmd;
+
+ referenced = pmdp_clear_young_notify(vma, addr, pmd);
+ spin_unlock(ptl);
+ goto found;
+unlock_pmd:
+ spin_unlock(ptl);
+ return SWAP_AGAIN;
} else {
- pte = page_check_address(page, mm, addr, &ptl, 0);
- if (pte) {
- referenced = ptep_clear_young_notify(vma, addr, pte);
- pte_unmap_unlock(pte, ptl);
- }
+ pmd_t pmde = *pmd;
+
+ barrier();
+ if (!pmd_present(pmde) || pmd_trans_huge(pmde))
+ return SWAP_AGAIN;
+
+ }
+map_pte:
+ pte = pte_offset_map(pmd, addr);
+ if (!pte_present(*pte)) {
+ pte_unmap(pte);
+ return SWAP_AGAIN;
}
+
+ ptl = pte_lockptr(mm, pmd);
+ spin_lock(ptl);
+
+ if (!pte_present(*pte)) {
+ pte_unmap_unlock(pte, ptl);
+ return SWAP_AGAIN;
+ }
+
+ /* THP can be referenced by any subpage */
+ if (pte_pfn(*pte) - page_to_pfn(page) >= hpage_nr_pages(page)) {
+ pte_unmap_unlock(pte, ptl);
+ return SWAP_AGAIN;
+ }
+
+ referenced = ptep_clear_young_notify(vma, addr, pte);
+ pte_unmap_unlock(pte, ptl);
+found:
if (referenced) {
clear_page_idle(page);
/*
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 29f2f8b853ae..207244489a68 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -58,7 +58,7 @@ again:
if (!walk->pte_entry)
continue;
- split_huge_page_pmd_mm(walk->mm, addr, pmd);
+ split_huge_pmd(walk->vma, pmd, addr);
if (pmd_trans_unstable(pmd))
goto again;
err = walk_pte_range(pmd, addr, next, walk);
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 7d3db0247983..69261d4c774d 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -139,18 +139,6 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
}
#endif
-#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp)
-{
- pmd_t pmd = pmd_mksplitting(*pmdp);
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- set_pmd_at(vma->vm_mm, address, pmdp, pmd);
- /* tlb flush only to serialize against gup-fast */
- flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
-}
-#endif
-
#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable)
diff --git a/mm/rmap.c b/mm/rmap.c
index b577fbb98d4b..e90b81ff306d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -565,27 +565,6 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
anon_vma_unlock_read(anon_vma);
}
-/*
- * At what user virtual address is page expected in @vma?
- */
-static inline unsigned long
-__vma_address(struct page *page, struct vm_area_struct *vma)
-{
- pgoff_t pgoff = page_to_pgoff(page);
- return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-}
-
-inline unsigned long
-vma_address(struct page *page, struct vm_area_struct *vma)
-{
- unsigned long address = __vma_address(page, vma);
-
- /* page should be within @vma mapping range */
- VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
-
- return address;
-}
-
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
static void percpu_flush_tlb_batch_pages(void *data)
{
@@ -833,60 +812,105 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
spinlock_t *ptl;
int referenced = 0;
struct page_referenced_arg *pra = arg;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
- if (unlikely(PageTransHuge(page))) {
- pmd_t *pmd;
-
- /*
- * rmap might return false positives; we must filter
- * these out using page_check_address_pmd().
- */
- pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
- if (!pmd)
+ if (unlikely(PageHuge(page))) {
+ /* when pud is not present, pte will be NULL */
+ pte = huge_pte_offset(mm, address);
+ if (!pte)
return SWAP_AGAIN;
- if (vma->vm_flags & VM_LOCKED) {
+ ptl = huge_pte_lockptr(page_hstate(page), mm, pte);
+ goto check_pte;
+ }
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ return SWAP_AGAIN;
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ return SWAP_AGAIN;
+ pmd = pmd_offset(pud, address);
+
+ if (pmd_trans_huge(*pmd)) {
+ int ret = SWAP_AGAIN;
+
+ ptl = pmd_lock(mm, pmd);
+ if (!pmd_present(*pmd))
+ goto unlock_pmd;
+ if (unlikely(!pmd_trans_huge(*pmd))) {
spin_unlock(ptl);
+ goto map_pte;
+ }
+
+ if (pmd_page(*pmd) != page)
+ goto unlock_pmd;
+
+ if (vma->vm_flags & VM_LOCKED) {
pra->vm_flags |= VM_LOCKED;
- return SWAP_FAIL; /* To break the loop */
+ ret = SWAP_FAIL; /* To break the loop */
+ goto unlock_pmd;
}
- /* go ahead even if the pmd is pmd_trans_splitting() */
if (pmdp_clear_flush_young_notify(vma, address, pmd))
referenced++;
spin_unlock(ptl);
+ goto found;
+unlock_pmd:
+ spin_unlock(ptl);
+ return ret;
} else {
- pte_t *pte;
+ pmd_t pmde = *pmd;
- /*
- * rmap might return false positives; we must filter
- * these out using page_check_address().
- */
- pte = page_check_address(page, mm, address, &ptl, 0);
- if (!pte)
+ barrier();
+ if (!pmd_present(pmde) || pmd_trans_huge(pmde))
return SWAP_AGAIN;
+ }
+map_pte:
+ pte = pte_offset_map(pmd, address);
+ if (!pte_present(*pte)) {
+ pte_unmap(pte);
+ return SWAP_AGAIN;
+ }
- if (vma->vm_flags & VM_LOCKED) {
- pte_unmap_unlock(pte, ptl);
- pra->vm_flags |= VM_LOCKED;
- return SWAP_FAIL; /* To break the loop */
- }
+ ptl = pte_lockptr(mm, pmd);
+check_pte:
+ spin_lock(ptl);
- if (ptep_clear_flush_young_notify(vma, address, pte)) {
- /*
- * Don't treat a reference through a sequentially read
- * mapping as such. If the page has been used in
- * another mapping, we will catch it; if this other
- * mapping is already gone, the unmap path will have
- * set PG_referenced or activated the page.
- */
- if (likely(!(vma->vm_flags & VM_SEQ_READ)))
- referenced++;
- }
+ if (!pte_present(*pte)) {
+ pte_unmap_unlock(pte, ptl);
+ return SWAP_AGAIN;
+ }
+
+ /* THP can be referenced by any subpage */
+ if (pte_pfn(*pte) - page_to_pfn(page) >= hpage_nr_pages(page)) {
pte_unmap_unlock(pte, ptl);
+ return SWAP_AGAIN;
}
+ if (vma->vm_flags & VM_LOCKED) {
+ pte_unmap_unlock(pte, ptl);
+ pra->vm_flags |= VM_LOCKED;
+ return SWAP_FAIL; /* To break the loop */
+ }
+
+ if (ptep_clear_flush_young_notify(vma, address, pte)) {
+ /*
+ * Don't treat a reference through a sequentially read
+ * mapping as such. If the page has been used in
+ * another mapping, we will catch it; if this other
+ * mapping is already gone, the unmap path will have
+ * set PG_referenced or activated the page.
+ */
+ if (likely(!(vma->vm_flags & VM_SEQ_READ)))
+ referenced++;
+ }
+ pte_unmap_unlock(pte, ptl);
+
+found:
if (referenced)
clear_page_idle(page);
if (test_and_clear_page_young(page))
@@ -933,7 +957,7 @@ int page_referenced(struct page *page,
int ret;
int we_locked = 0;
struct page_referenced_arg pra = {
- .mapcount = page_mapcount(page),
+ .mapcount = total_mapcount(page),
.memcg = memcg,
};
struct rmap_walk_control rwc = {
@@ -1122,7 +1146,7 @@ static void __page_check_anon_rmap(struct page *page,
* over the call to page_add_new_anon_rmap.
*/
BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
- BUG_ON(page->index != linear_page_index(vma, address));
+ BUG_ON(page_to_pgoff(page) != linear_page_index(vma, address));
#endif
}
@@ -1131,6 +1155,7 @@ static void __page_check_anon_rmap(struct page *page,
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
+ * @compound: charge the page as compound or small page
*
* The caller needs to hold the pte lock, and the page must be locked in
* the anon_vma case: to serialize mapping,index checking after setting,
@@ -1138,9 +1163,9 @@ static void __page_check_anon_rmap(struct page *page,
* (but PageKsm is never downgraded to PageAnon).
*/
void page_add_anon_rmap(struct page *page,
- struct vm_area_struct *vma, unsigned long address)
+ struct vm_area_struct *vma, unsigned long address, bool compound)
{
- do_page_add_anon_rmap(page, vma, address, 0);
+ do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0);
}
/*
@@ -1149,29 +1174,44 @@ void page_add_anon_rmap(struct page *page,
* Everybody else should continue to use page_add_anon_rmap above.
*/
void do_page_add_anon_rmap(struct page *page,
- struct vm_area_struct *vma, unsigned long address, int exclusive)
+ struct vm_area_struct *vma, unsigned long address, int flags)
{
- int first = atomic_inc_and_test(&page->_mapcount);
+ bool compound = flags & RMAP_COMPOUND;
+ bool first;
+
+ if (compound) {
+ atomic_t *mapcount;
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+ mapcount = compound_mapcount_ptr(page);
+ first = atomic_inc_and_test(mapcount);
+ } else {
+ first = atomic_inc_and_test(&page->_mapcount);
+ }
+
if (first) {
+ int nr = compound ? hpage_nr_pages(page) : 1;
/*
* We use the irq-unsafe __{inc|mod}_zone_page_stat because
* these counters are not modified in interrupt context, and
* pte lock(a spinlock) is held, which implies preemption
* disabled.
*/
- if (PageTransHuge(page))
+ if (compound) {
__inc_zone_page_state(page,
NR_ANON_TRANSPARENT_HUGEPAGES);
- __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
- hpage_nr_pages(page));
+ }
+ __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr);
}
if (unlikely(PageKsm(page)))
return;
VM_BUG_ON_PAGE(!PageLocked(page), page);
+
/* address might be in next vma when migration races vma_adjust */
if (first)
- __page_set_anon_rmap(page, vma, address, exclusive);
+ __page_set_anon_rmap(page, vma, address,
+ flags & RMAP_EXCLUSIVE);
else
__page_check_anon_rmap(page, vma, address);
}
@@ -1181,21 +1221,31 @@ void do_page_add_anon_rmap(struct page *page,
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
+ * @compound: charge the page as compound or small page
*
* Same as page_add_anon_rmap but must only be called on *new* pages.
* This means the inc-and-test can be bypassed.
* Page does not have to be locked.
*/
void page_add_new_anon_rmap(struct page *page,
- struct vm_area_struct *vma, unsigned long address)
+ struct vm_area_struct *vma, unsigned long address, bool compound)
{
+ int nr = compound ? hpage_nr_pages(page) : 1;
+
VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
SetPageSwapBacked(page);
- atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
- if (PageTransHuge(page))
+ if (compound) {
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+ /* increment count (starts at -1) */
+ atomic_set(compound_mapcount_ptr(page), 0);
__inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
- __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
- hpage_nr_pages(page));
+ } else {
+ /* Anon THP always mapped first with PMD */
+ VM_BUG_ON_PAGE(PageTransCompound(page), page);
+ /* increment count (starts at -1) */
+ atomic_set(&page->_mapcount, 0);
+ }
+ __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr);
__page_set_anon_rmap(page, vma, address, 1);
}
@@ -1223,12 +1273,15 @@ static void page_remove_file_rmap(struct page *page)
memcg = mem_cgroup_begin_page_stat(page);
- /* page still mapped by someone else? */
- if (!atomic_add_negative(-1, &page->_mapcount))
+ /* Hugepages are not counted in NR_FILE_MAPPED for now. */
+ if (unlikely(PageHuge(page))) {
+ /* hugetlb pages are always mapped with pmds */
+ atomic_dec(compound_mapcount_ptr(page));
goto out;
+ }
- /* Hugepages are not counted in NR_FILE_MAPPED for now. */
- if (unlikely(PageHuge(page)))
+ /* page still mapped by someone else? */
+ if (!atomic_add_negative(-1, &page->_mapcount))
goto out;
/*
@@ -1245,41 +1298,76 @@ out:
mem_cgroup_end_page_stat(memcg);
}
+static void page_remove_anon_compound_rmap(struct page *page)
+{
+ int i, nr;
+
+ if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
+ return;
+
+ /* Hugepages are not counted in NR_ANON_PAGES for now. */
+ if (unlikely(PageHuge(page)))
+ return;
+
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+ return;
+
+ __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+
+ if (TestClearPageDoubleMap(page)) {
+ /*
+ * Subpages can be mapped with PTEs too. Check how many of
+ * themi are still mapped.
+ */
+ for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
+ if (atomic_add_negative(-1, &page[i]._mapcount))
+ nr++;
+ }
+ } else {
+ nr = HPAGE_PMD_NR;
+ }
+
+ if (nr) {
+ __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, -nr);
+ deferred_split_huge_page(page);
+ }
+}
+
/**
* page_remove_rmap - take down pte mapping from a page
- * @page: page to remove mapping from
+ * @page: page to remove mapping from
+ * @compound: uncharge the page as compound or small page
*
* The caller needs to hold the pte lock.
*/
-void page_remove_rmap(struct page *page)
+void page_remove_rmap(struct page *page, bool compound)
{
if (!PageAnon(page)) {
+ VM_BUG_ON_PAGE(compound && !PageHuge(page), page);
page_remove_file_rmap(page);
return;
}
+ if (compound)
+ return page_remove_anon_compound_rmap(page);
+
/* page still mapped by someone else? */
if (!atomic_add_negative(-1, &page->_mapcount))
return;
- /* Hugepages are not counted in NR_ANON_PAGES for now. */
- if (unlikely(PageHuge(page)))
- return;
-
/*
* We use the irq-unsafe __{inc|mod}_zone_page_stat because
* these counters are not modified in interrupt context, and
* pte lock(a spinlock) is held, which implies preemption disabled.
*/
- if (PageTransHuge(page))
- __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
-
- __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
- -hpage_nr_pages(page));
+ __dec_zone_page_state(page, NR_ANON_PAGES);
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
+ if (PageTransCompound(page))
+ deferred_split_huge_page(compound_head(page));
+
/*
* It would be tidy to reset the PageAnon mapping here,
* but that might overwrite a racing page_add_anon_rmap
@@ -1420,7 +1508,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
} else
dec_mm_counter(mm, MM_FILEPAGES);
- page_remove_rmap(page);
+ page_remove_rmap(page, false);
page_cache_release(page);
out_unmap:
@@ -1702,7 +1790,7 @@ void hugepage_add_anon_rmap(struct page *page,
BUG_ON(!PageLocked(page));
BUG_ON(!anon_vma);
/* address might be in next vma when migration races vma_adjust */
- first = atomic_inc_and_test(&page->_mapcount);
+ first = atomic_inc_and_test(compound_mapcount_ptr(page));
if (first)
__hugepage_set_anon_rmap(page, vma, address, 0);
}
@@ -1711,7 +1799,7 @@ void hugepage_add_new_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
- atomic_set(&page->_mapcount, 0);
+ atomic_set(compound_mapcount_ptr(page), 0);
__hugepage_set_anon_rmap(page, vma, address, 1);
}
#endif /* CONFIG_HUGETLB_PAGE */
diff --git a/mm/shmem.c b/mm/shmem.c
index 9187eee4128b..529a7d5083f1 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -729,7 +729,8 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
* the shmem_swaplist_mutex which might hold up shmem_writepage().
* Charged back to the user (not to caller) when swap account is used.
*/
- error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg);
+ error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg,
+ false);
if (error)
goto out;
/* No radix_tree_preload: swap entry keeps a place for page in tree */
@@ -752,9 +753,9 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
if (error) {
if (error != -ENOMEM)
error = 0;
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
} else
- mem_cgroup_commit_charge(page, memcg, true);
+ mem_cgroup_commit_charge(page, memcg, true, false);
out:
unlock_page(page);
page_cache_release(page);
@@ -1004,7 +1005,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
copy_highpage(newpage, oldpage);
flush_dcache_page(newpage);
- __set_page_locked(newpage);
+ __SetPageLocked(newpage);
SetPageUptodate(newpage);
SetPageSwapBacked(newpage);
set_page_private(newpage, swap_index);
@@ -1137,7 +1138,8 @@ repeat:
goto failed;
}
- error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
+ error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg,
+ false);
if (!error) {
error = shmem_add_to_page_cache(page, mapping, index,
swp_to_radix_entry(swap));
@@ -1154,14 +1156,14 @@ repeat:
* "repeat": reading a hole and writing should succeed.
*/
if (error) {
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
delete_from_swap_cache(page);
}
}
if (error)
goto failed;
- mem_cgroup_commit_charge(page, memcg, true);
+ mem_cgroup_commit_charge(page, memcg, true, false);
spin_lock(&info->lock);
info->swapped--;
@@ -1196,11 +1198,12 @@ repeat:
}
__SetPageSwapBacked(page);
- __set_page_locked(page);
+ __SetPageLocked(page);
if (sgp == SGP_WRITE)
__SetPageReferenced(page);
- error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
+ error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg,
+ false);
if (error)
goto decused;
error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
@@ -1210,10 +1213,10 @@ repeat:
radix_tree_preload_end();
}
if (error) {
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
goto decused;
}
- mem_cgroup_commit_charge(page, memcg, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_anon(page);
spin_lock(&info->lock);
diff --git a/mm/slab.c b/mm/slab.c
index e0819fa96559..6070d4ba5be4 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3231,11 +3231,15 @@ __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
#endif /* CONFIG_NUMA */
static __always_inline void *
-slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
+slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller,
+ bool irq_off_needed)
{
unsigned long save_flags;
void *objp;
+ /* Compiler need to remove irq_off_needed branch statements */
+ BUILD_BUG_ON(!__builtin_constant_p(irq_off_needed));
+
flags &= gfp_allowed_mask;
lockdep_trace_alloc(flags);
@@ -3246,9 +3250,11 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
cachep = memcg_kmem_get_cache(cachep, flags);
cache_alloc_debugcheck_before(cachep, flags);
- local_irq_save(save_flags);
+ if (irq_off_needed)
+ local_irq_save(save_flags);
objp = __do_cache_alloc(cachep, flags);
- local_irq_restore(save_flags);
+ if (irq_off_needed)
+ local_irq_restore(save_flags);
objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags,
flags);
@@ -3404,7 +3410,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
*/
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
- void *ret = slab_alloc(cachep, flags, _RET_IP_);
+ void *ret = slab_alloc(cachep, flags, _RET_IP_, true);
trace_kmem_cache_alloc(_RET_IP_, ret,
cachep->object_size, cachep->size, flags);
@@ -3413,16 +3419,23 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
}
EXPORT_SYMBOL(kmem_cache_alloc);
-void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
-{
- __kmem_cache_free_bulk(s, size, p);
-}
-EXPORT_SYMBOL(kmem_cache_free_bulk);
-
+/* Note that interrupts must be enabled when calling this function. */
bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
- void **p)
+ void **p)
{
- return __kmem_cache_alloc_bulk(s, flags, size, p);
+ size_t i;
+
+ local_irq_disable();
+ for (i = 0; i < size; i++) {
+ void *x = p[i] = slab_alloc(s, flags, _RET_IP_, false);
+
+ if (!x) {
+ __kmem_cache_free_bulk(s, i, p);
+ return false;
+ }
+ }
+ local_irq_enable();
+ return true;
}
EXPORT_SYMBOL(kmem_cache_alloc_bulk);
@@ -3432,7 +3445,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
{
void *ret;
- ret = slab_alloc(cachep, flags, _RET_IP_);
+ ret = slab_alloc(cachep, flags, _RET_IP_, true);
trace_kmalloc(_RET_IP_, ret,
size, cachep->size, flags);
@@ -3523,7 +3536,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
cachep = kmalloc_slab(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(cachep)))
return cachep;
- ret = slab_alloc(cachep, flags, caller);
+ ret = slab_alloc(cachep, flags, caller, true);
trace_kmalloc(caller, ret,
size, cachep->size, flags);
@@ -3543,32 +3556,56 @@ void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
}
EXPORT_SYMBOL(__kmalloc_track_caller);
-/**
- * kmem_cache_free - Deallocate an object
- * @cachep: The cache the allocation was from.
- * @objp: The previously allocated object.
- *
- * Free an object which was previously allocated from this
- * cache.
- */
-void kmem_cache_free(struct kmem_cache *cachep, void *objp)
+/* Caller is responsible for disabling local IRQs */
+static __always_inline void __kmem_cache_free(struct kmem_cache *cachep,
+ void *objp, bool irq_off_needed)
{
unsigned long flags;
+
+ /* Compiler need to remove irq_off_needed branch statements */
+ BUILD_BUG_ON(!__builtin_constant_p(irq_off_needed));
+
cachep = cache_from_obj(cachep, objp);
if (!cachep)
return;
- local_irq_save(flags);
+ if (irq_off_needed)
+ local_irq_save(flags);
debug_check_no_locks_freed(objp, cachep->object_size);
if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
debug_check_no_obj_freed(objp, cachep->object_size);
__cache_free(cachep, objp, _RET_IP_);
- local_irq_restore(flags);
+ if (irq_off_needed)
+ local_irq_restore(flags);
+}
+/**
+ * kmem_cache_free - Deallocate an object
+ * @cachep: The cache the allocation was from.
+ * @objp: The previously allocated object.
+ *
+ * Free an object which was previously allocated from this
+ * cache.
+ */
+void kmem_cache_free(struct kmem_cache *cachep, void *objp)
+{
+ __kmem_cache_free(cachep, objp, true);
trace_kmem_cache_free(_RET_IP_, objp);
}
EXPORT_SYMBOL(kmem_cache_free);
+/* Note that interrupts must be enabled when calling this function. */
+void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
+{
+ size_t i;
+
+ local_irq_disable();
+ for (i = 0; i < size; i++)
+ __kmem_cache_free(s, p[i], false);
+ local_irq_enable();
+}
+EXPORT_SYMBOL(kmem_cache_free_bulk);
+
/**
* kfree - free previously allocated memory
* @objp: pointer returned by kmalloc.
diff --git a/mm/slub.c b/mm/slub.c
index 7cb4bf9ae320..438ebf8bbab1 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -338,11 +338,13 @@ static inline int oo_objects(struct kmem_cache_order_objects x)
*/
static __always_inline void slab_lock(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
bit_spin_lock(PG_locked, &page->flags);
}
static __always_inline void slab_unlock(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
__bit_spin_unlock(PG_locked, &page->flags);
}
@@ -1065,11 +1067,15 @@ bad:
return 0;
}
+/* Supports checking bulk free of a constructed freelist */
static noinline struct kmem_cache_node *free_debug_processing(
- struct kmem_cache *s, struct page *page, void *object,
+ struct kmem_cache *s, struct page *page,
+ void *head, void *tail, int bulk_cnt,
unsigned long addr, unsigned long *flags)
{
struct kmem_cache_node *n = get_node(s, page_to_nid(page));
+ void *object = head;
+ int cnt = 0;
spin_lock_irqsave(&n->list_lock, *flags);
slab_lock(page);
@@ -1077,6 +1083,9 @@ static noinline struct kmem_cache_node *free_debug_processing(
if (!check_slab(s, page))
goto fail;
+next_object:
+ cnt++;
+
if (!check_valid_pointer(s, page, object)) {
slab_err(s, page, "Invalid object pointer 0x%p", object);
goto fail;
@@ -1107,8 +1116,19 @@ static noinline struct kmem_cache_node *free_debug_processing(
if (s->flags & SLAB_STORE_USER)
set_track(s, object, TRACK_FREE, addr);
trace(s, page, object, 0);
+ /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
init_object(s, object, SLUB_RED_INACTIVE);
+
+ /* Reached end of constructed freelist yet? */
+ if (object != tail) {
+ object = get_freepointer(s, object);
+ goto next_object;
+ }
out:
+ if (cnt != bulk_cnt)
+ slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n",
+ bulk_cnt, cnt);
+
slab_unlock(page);
/*
* Keep node_lock to preserve integrity
@@ -1204,7 +1224,7 @@ unsigned long kmem_cache_flags(unsigned long object_size,
return flags;
}
-#else
+#else /* !CONFIG_SLUB_DEBUG */
static inline void setup_object_debug(struct kmem_cache *s,
struct page *page, void *object) {}
@@ -1212,7 +1232,8 @@ static inline int alloc_debug_processing(struct kmem_cache *s,
struct page *page, void *object, unsigned long addr) { return 0; }
static inline struct kmem_cache_node *free_debug_processing(
- struct kmem_cache *s, struct page *page, void *object,
+ struct kmem_cache *s, struct page *page,
+ void *head, void *tail, int bulk_cnt,
unsigned long addr, unsigned long *flags) { return NULL; }
static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
@@ -1308,6 +1329,29 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
kasan_slab_free(s, x);
}
+static inline void slab_free_freelist_hook(struct kmem_cache *s,
+ void *head, void *tail)
+{
+/*
+ * Compiler cannot detect this function can be removed if slab_free_hook()
+ * evaluates to nothing. Thus, catch all relevant config debug options here.
+ */
+#if defined(CONFIG_KMEMCHECK) || \
+ defined(CONFIG_LOCKDEP) || \
+ defined(CONFIG_DEBUG_KMEMLEAK) || \
+ defined(CONFIG_DEBUG_OBJECTS_FREE) || \
+ defined(CONFIG_KASAN)
+
+ void *object = head;
+ void *tail_obj = tail ? : head;
+
+ do {
+ slab_free_hook(s, object);
+ } while ((object != tail_obj) &&
+ (object = get_freepointer(s, object)));
+#endif
+}
+
static void setup_object(struct kmem_cache *s, struct page *page,
void *object)
{
@@ -2295,23 +2339,15 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
* And if we were unable to get a new slab from the partial slab lists then
* we need to allocate a new slab. This is the slowest path since it involves
* a call to the page allocator and the setup of a new slab.
+ *
+ * Version of __slab_alloc to use when we know that interrupts are
+ * already disabled (which is the case for bulk allocation).
*/
-static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
+static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
unsigned long addr, struct kmem_cache_cpu *c)
{
void *freelist;
struct page *page;
- unsigned long flags;
-
- local_irq_save(flags);
-#ifdef CONFIG_PREEMPT
- /*
- * We may have been preempted and rescheduled on a different
- * cpu before disabling interrupts. Need to reload cpu area
- * pointer.
- */
- c = this_cpu_ptr(s->cpu_slab);
-#endif
page = c->page;
if (!page)
@@ -2369,7 +2405,6 @@ load_freelist:
VM_BUG_ON(!c->page->frozen);
c->freelist = get_freepointer(s, freelist);
c->tid = next_tid(c->tid);
- local_irq_restore(flags);
return freelist;
new_slab:
@@ -2386,7 +2421,6 @@ new_slab:
if (unlikely(!freelist)) {
slab_out_of_memory(s, gfpflags, node);
- local_irq_restore(flags);
return NULL;
}
@@ -2402,11 +2436,35 @@ new_slab:
deactivate_slab(s, page, get_freepointer(s, freelist));
c->page = NULL;
c->freelist = NULL;
- local_irq_restore(flags);
return freelist;
}
/*
+ * Another one that disabled interrupt and compensates for possible
+ * cpu changes by refetching the per cpu area pointer.
+ */
+static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
+ unsigned long addr, struct kmem_cache_cpu *c)
+{
+ void *p;
+ unsigned long flags;
+
+ local_irq_save(flags);
+#ifdef CONFIG_PREEMPT
+ /*
+ * We may have been preempted and rescheduled on a different
+ * cpu before disabling interrupts. Need to reload cpu area
+ * pointer.
+ */
+ c = this_cpu_ptr(s->cpu_slab);
+#endif
+
+ p = ___slab_alloc(s, gfpflags, node, addr, c);
+ local_irq_restore(flags);
+ return p;
+}
+
+/*
* Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
* have the fastpath folded into their functions. So no function call
* overhead for requests that can be satisfied on the fastpath.
@@ -2569,10 +2627,11 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
* handling required then we can return immediately.
*/
static void __slab_free(struct kmem_cache *s, struct page *page,
- void *x, unsigned long addr)
+ void *head, void *tail, int cnt,
+ unsigned long addr)
+
{
void *prior;
- void **object = (void *)x;
int was_frozen;
struct page new;
unsigned long counters;
@@ -2582,7 +2641,8 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
stat(s, FREE_SLOWPATH);
if (kmem_cache_debug(s) &&
- !(n = free_debug_processing(s, page, x, addr, &flags)))
+ !(n = free_debug_processing(s, page, head, tail, cnt,
+ addr, &flags)))
return;
do {
@@ -2592,10 +2652,10 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
}
prior = page->freelist;
counters = page->counters;
- set_freepointer(s, object, prior);
+ set_freepointer(s, tail, prior);
new.counters = counters;
was_frozen = new.frozen;
- new.inuse--;
+ new.inuse -= cnt;
if ((!new.inuse || !prior) && !was_frozen) {
if (kmem_cache_has_cpu_partial(s) && !prior) {
@@ -2626,7 +2686,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
} while (!cmpxchg_double_slab(s, page,
prior, counters,
- object, new.counters,
+ head, new.counters,
"__slab_free"));
if (likely(!n)) {
@@ -2691,15 +2751,20 @@ slab_empty:
*
* If fastpath is not possible then fall back to __slab_free where we deal
* with all sorts of special processing.
+ *
+ * Bulk free of a freelist with several objects (all pointing to the
+ * same page) possible by specifying head and tail ptr, plus objects
+ * count (cnt). Bulk free indicated by tail pointer being set.
*/
-static __always_inline void slab_free(struct kmem_cache *s,
- struct page *page, void *x, unsigned long addr)
+static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
+ void *head, void *tail, int cnt,
+ unsigned long addr)
{
- void **object = (void *)x;
+ void *tail_obj = tail ? : head;
struct kmem_cache_cpu *c;
unsigned long tid;
- slab_free_hook(s, x);
+ slab_free_freelist_hook(s, head, tail);
redo:
/*
@@ -2718,19 +2783,19 @@ redo:
barrier();
if (likely(page == c->page)) {
- set_freepointer(s, object, c->freelist);
+ set_freepointer(s, tail_obj, c->freelist);
if (unlikely(!this_cpu_cmpxchg_double(
s->cpu_slab->freelist, s->cpu_slab->tid,
c->freelist, tid,
- object, next_tid(tid)))) {
+ head, next_tid(tid)))) {
note_cmpxchg_failure("slab_free", s, tid);
goto redo;
}
stat(s, FREE_FASTPATH);
} else
- __slab_free(s, page, x, addr);
+ __slab_free(s, page, head, tail_obj, cnt, addr);
}
@@ -2739,49 +2804,98 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
s = cache_from_obj(s, x);
if (!s)
return;
- slab_free(s, virt_to_head_page(x), x, _RET_IP_);
+ slab_free(s, virt_to_head_page(x), x, NULL, 1, _RET_IP_);
trace_kmem_cache_free(_RET_IP_, x);
}
EXPORT_SYMBOL(kmem_cache_free);
-/* Note that interrupts must be enabled when calling this function. */
-void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
-{
- struct kmem_cache_cpu *c;
+struct detached_freelist {
struct page *page;
- int i;
+ void *tail;
+ void *freelist;
+ int cnt;
+};
- local_irq_disable();
- c = this_cpu_ptr(s->cpu_slab);
+/*
+ * This function progressively scans the array with free objects (with
+ * a limited look ahead) and extract objects belonging to the same
+ * page. It builds a detached freelist directly within the given
+ * page/objects. This can happen without any need for
+ * synchronization, because the objects are owned by running process.
+ * The freelist is build up as a single linked list in the objects.
+ * The idea is, that this detached freelist can then be bulk
+ * transferred to the real freelist(s), but only requiring a single
+ * synchronization primitive. Look ahead in the array is limited due
+ * to performance reasons.
+ */
+static int build_detached_freelist(struct kmem_cache *s, size_t size,
+ void **p, struct detached_freelist *df)
+{
+ size_t first_skipped_index = 0;
+ int lookahead = 3;
+ void *object;
- for (i = 0; i < size; i++) {
- void *object = p[i];
+ /* Always re-init detached_freelist */
+ df->page = NULL;
- BUG_ON(!object);
- /* kmem cache debug support */
- s = cache_from_obj(s, object);
- if (unlikely(!s))
- goto exit;
- slab_free_hook(s, object);
+ do {
+ object = p[--size];
+ } while (!object && size);
- page = virt_to_head_page(object);
+ if (!object)
+ return 0;
- if (c->page == page) {
- /* Fastpath: local CPU free */
- set_freepointer(s, object, c->freelist);
- c->freelist = object;
- } else {
- c->tid = next_tid(c->tid);
- local_irq_enable();
- /* Slowpath: overhead locked cmpxchg_double_slab */
- __slab_free(s, page, object, _RET_IP_);
- local_irq_disable();
- c = this_cpu_ptr(s->cpu_slab);
+ /* Start new detached freelist */
+ set_freepointer(s, object, NULL);
+ df->page = virt_to_head_page(object);
+ df->tail = object;
+ df->freelist = object;
+ p[size] = NULL; /* mark object processed */
+ df->cnt = 1;
+
+ while (size) {
+ object = p[--size];
+ if (!object)
+ continue; /* Skip processed objects */
+
+ /* df->page is always set at this point */
+ if (df->page == virt_to_head_page(object)) {
+ /* Opportunity build freelist */
+ set_freepointer(s, object, df->freelist);
+ df->freelist = object;
+ df->cnt++;
+ p[size] = NULL; /* mark object processed */
+
+ continue;
}
+
+ /* Limit look ahead search */
+ if (!--lookahead)
+ break;
+
+ if (!first_skipped_index)
+ first_skipped_index = size + 1;
}
-exit:
- c->tid = next_tid(c->tid);
- local_irq_enable();
+
+ return first_skipped_index;
+}
+
+
+/* Note that interrupts must be enabled when calling this function. */
+void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
+{
+ if (WARN_ON(!size))
+ return;
+
+ do {
+ struct detached_freelist df;
+
+ size = build_detached_freelist(s, size, p, &df);
+ if (unlikely(!df.page))
+ continue;
+
+ slab_free(s, df.page, df.freelist, df.tail, df.cnt, _RET_IP_);
+ } while (likely(size));
}
EXPORT_SYMBOL(kmem_cache_free_bulk);
@@ -2804,30 +2918,23 @@ bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
void *object = c->freelist;
if (unlikely(!object)) {
- local_irq_enable();
/*
* Invoking slow path likely have side-effect
* of re-populating per CPU c->freelist
*/
- p[i] = __slab_alloc(s, flags, NUMA_NO_NODE,
+ p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
_RET_IP_, c);
- if (unlikely(!p[i])) {
- __kmem_cache_free_bulk(s, i, p);
- return false;
- }
- local_irq_disable();
+ if (unlikely(!p[i]))
+ goto error;
+
c = this_cpu_ptr(s->cpu_slab);
continue; /* goto for-loop */
}
/* kmem_cache debug support */
s = slab_pre_alloc_hook(s, flags);
- if (unlikely(!s)) {
- __kmem_cache_free_bulk(s, i, p);
- c->tid = next_tid(c->tid);
- local_irq_enable();
- return false;
- }
+ if (unlikely(!s))
+ goto error;
c->freelist = get_freepointer(s, object);
p[i] = object;
@@ -2847,6 +2954,11 @@ bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
}
return true;
+
+error:
+ __kmem_cache_free_bulk(s, i, p);
+ local_irq_enable();
+ return false;
}
EXPORT_SYMBOL(kmem_cache_alloc_bulk);
@@ -3511,7 +3623,7 @@ void kfree(const void *x)
__free_kmem_pages(page, compound_order(page));
return;
}
- slab_free(page->slab_cache, page, object, _RET_IP_);
+ slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
}
EXPORT_SYMBOL(kfree);
diff --git a/mm/swap.c b/mm/swap.c
index 39395fb549c0..abffc33bb975 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -89,260 +89,14 @@ static void __put_compound_page(struct page *page)
(*dtor)(page);
}
-/**
- * Two special cases here: we could avoid taking compound_lock_irqsave
- * and could skip the tail refcounting(in _mapcount).
- *
- * 1. Hugetlbfs page:
- *
- * PageHeadHuge will remain true until the compound page
- * is released and enters the buddy allocator, and it could
- * not be split by __split_huge_page_refcount().
- *
- * So if we see PageHeadHuge set, and we have the tail page pin,
- * then we could safely put head page.
- *
- * 2. Slab THP page:
- *
- * PG_slab is cleared before the slab frees the head page, and
- * tail pin cannot be the last reference left on the head page,
- * because the slab code is free to reuse the compound page
- * after a kfree/kmem_cache_free without having to check if
- * there's any tail pin left. In turn all tail pinsmust be always
- * released while the head is still pinned by the slab code
- * and so we know PG_slab will be still set too.
- *
- * So if we see PageSlab set, and we have the tail page pin,
- * then we could safely put head page.
- */
-static __always_inline
-void put_unrefcounted_compound_page(struct page *page_head, struct page *page)
-{
- /*
- * If @page is a THP tail, we must read the tail page
- * flags after the head page flags. The
- * __split_huge_page_refcount side enforces write memory barriers
- * between clearing PageTail and before the head page
- * can be freed and reallocated.
- */
- smp_rmb();
- if (likely(PageTail(page))) {
- /*
- * __split_huge_page_refcount cannot race
- * here, see the comment above this function.
- */
- VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
- if (put_page_testzero(page_head)) {
- /*
- * If this is the tail of a slab THP page,
- * the tail pin must not be the last reference
- * held on the page, because the PG_slab cannot
- * be cleared before all tail pins (which skips
- * the _mapcount tail refcounting) have been
- * released.
- *
- * If this is the tail of a hugetlbfs page,
- * the tail pin may be the last reference on
- * the page instead, because PageHeadHuge will
- * not go away until the compound page enters
- * the buddy allocator.
- */
- VM_BUG_ON_PAGE(PageSlab(page_head), page_head);
- __put_compound_page(page_head);
- }
- } else
- /*
- * __split_huge_page_refcount run before us,
- * @page was a THP tail. The split @page_head
- * has been freed and reallocated as slab or
- * hugetlbfs page of smaller order (only
- * possible if reallocated as slab on x86).
- */
- if (put_page_testzero(page))
- __put_single_page(page);
-}
-
-static __always_inline
-void put_refcounted_compound_page(struct page *page_head, struct page *page)
-{
- if (likely(page != page_head && get_page_unless_zero(page_head))) {
- unsigned long flags;
-
- /*
- * @page_head wasn't a dangling pointer but it may not
- * be a head page anymore by the time we obtain the
- * lock. That is ok as long as it can't be freed from
- * under us.
- */
- flags = compound_lock_irqsave(page_head);
- if (unlikely(!PageTail(page))) {
- /* __split_huge_page_refcount run before us */
- compound_unlock_irqrestore(page_head, flags);
- if (put_page_testzero(page_head)) {
- /*
- * The @page_head may have been freed
- * and reallocated as a compound page
- * of smaller order and then freed
- * again. All we know is that it
- * cannot have become: a THP page, a
- * compound page of higher order, a
- * tail page. That is because we
- * still hold the refcount of the
- * split THP tail and page_head was
- * the THP head before the split.
- */
- if (PageHead(page_head))
- __put_compound_page(page_head);
- else
- __put_single_page(page_head);
- }
-out_put_single:
- if (put_page_testzero(page))
- __put_single_page(page);
- return;
- }
- VM_BUG_ON_PAGE(page_head != compound_head(page), page);
- /*
- * We can release the refcount taken by
- * get_page_unless_zero() now that
- * __split_huge_page_refcount() is blocked on the
- * compound_lock.
- */
- if (put_page_testzero(page_head))
- VM_BUG_ON_PAGE(1, page_head);
- /* __split_huge_page_refcount will wait now */
- VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page);
- atomic_dec(&page->_mapcount);
- VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head);
- VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
- compound_unlock_irqrestore(page_head, flags);
-
- if (put_page_testzero(page_head)) {
- if (PageHead(page_head))
- __put_compound_page(page_head);
- else
- __put_single_page(page_head);
- }
- } else {
- /* @page_head is a dangling pointer */
- VM_BUG_ON_PAGE(PageTail(page), page);
- goto out_put_single;
- }
-}
-
-static void put_compound_page(struct page *page)
-{
- struct page *page_head;
-
- /*
- * We see the PageCompound set and PageTail not set, so @page maybe:
- * 1. hugetlbfs head page, or
- * 2. THP head page.
- */
- if (likely(!PageTail(page))) {
- if (put_page_testzero(page)) {
- /*
- * By the time all refcounts have been released
- * split_huge_page cannot run anymore from under us.
- */
- if (PageHead(page))
- __put_compound_page(page);
- else
- __put_single_page(page);
- }
- return;
- }
-
- /*
- * We see the PageCompound set and PageTail set, so @page maybe:
- * 1. a tail hugetlbfs page, or
- * 2. a tail THP page, or
- * 3. a split THP page.
- *
- * Case 3 is possible, as we may race with
- * __split_huge_page_refcount tearing down a THP page.
- */
- page_head = compound_head(page);
- if (!__compound_tail_refcounted(page_head))
- put_unrefcounted_compound_page(page_head, page);
- else
- put_refcounted_compound_page(page_head, page);
-}
-
-void put_page(struct page *page)
+void __put_page(struct page *page)
{
if (unlikely(PageCompound(page)))
- put_compound_page(page);
- else if (put_page_testzero(page))
+ __put_compound_page(page);
+ else
__put_single_page(page);
}
-EXPORT_SYMBOL(put_page);
-
-/*
- * This function is exported but must not be called by anything other
- * than get_page(). It implements the slow path of get_page().
- */
-bool __get_page_tail(struct page *page)
-{
- /*
- * This takes care of get_page() if run on a tail page
- * returned by one of the get_user_pages/follow_page variants.
- * get_user_pages/follow_page itself doesn't need the compound
- * lock because it runs __get_page_tail_foll() under the
- * proper PT lock that already serializes against
- * split_huge_page().
- */
- unsigned long flags;
- bool got;
- struct page *page_head = compound_head(page);
-
- /* Ref to put_compound_page() comment. */
- if (!__compound_tail_refcounted(page_head)) {
- smp_rmb();
- if (likely(PageTail(page))) {
- /*
- * This is a hugetlbfs page or a slab
- * page. __split_huge_page_refcount
- * cannot race here.
- */
- VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
- __get_page_tail_foll(page, true);
- return true;
- } else {
- /*
- * __split_huge_page_refcount run
- * before us, "page" was a THP
- * tail. The split page_head has been
- * freed and reallocated as slab or
- * hugetlbfs page of smaller order
- * (only possible if reallocated as
- * slab on x86).
- */
- return false;
- }
- }
-
- got = false;
- if (likely(page != page_head && get_page_unless_zero(page_head))) {
- /*
- * page_head wasn't a dangling pointer but it
- * may not be a head page anymore by the time
- * we obtain the lock. That is ok as long as it
- * can't be freed from under us.
- */
- flags = compound_lock_irqsave(page_head);
- /* here __split_huge_page_refcount won't run anymore */
- if (likely(PageTail(page))) {
- __get_page_tail_foll(page, false);
- got = true;
- }
- compound_unlock_irqrestore(page_head, flags);
- if (unlikely(!got))
- put_page(page_head);
- }
- return got;
-}
-EXPORT_SYMBOL(__get_page_tail);
+EXPORT_SYMBOL(__put_page);
/**
* put_pages_list() - release a list of pages
@@ -604,6 +358,7 @@ static void __lru_cache_activate_page(struct page *page)
*/
void mark_page_accessed(struct page *page)
{
+ page = compound_head(page);
if (!PageActive(page) && !PageUnevictable(page) &&
PageReferenced(page)) {
@@ -918,15 +673,6 @@ void release_pages(struct page **pages, int nr, bool cold)
for (i = 0; i < nr; i++) {
struct page *page = pages[i];
- if (unlikely(PageCompound(page))) {
- if (zone) {
- spin_unlock_irqrestore(&zone->lru_lock, flags);
- zone = NULL;
- }
- put_compound_page(page);
- continue;
- }
-
/*
* Make sure the IRQ-safe lock-holding time does not get
* excessive with a continuous string of pages from the
@@ -937,9 +683,19 @@ void release_pages(struct page **pages, int nr, bool cold)
zone = NULL;
}
+ page = compound_head(page);
if (!put_page_testzero(page))
continue;
+ if (PageCompound(page)) {
+ if (zone) {
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+ zone = NULL;
+ }
+ __put_compound_page(page);
+ continue;
+ }
+
if (PageLRU(page)) {
struct zone *pagezone = page_zone(page);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index d504adb7fa5f..d783872d746c 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -353,7 +353,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
}
/* May fail (-ENOMEM) if radix-tree node allocation failed. */
- __set_page_locked(new_page);
+ __SetPageLocked(new_page);
SetPageSwapBacked(new_page);
err = __add_to_swap_cache(new_page, entry);
if (likely(!err)) {
@@ -367,7 +367,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
}
radix_tree_preload_end();
ClearPageSwapBacked(new_page);
- __clear_page_locked(new_page);
+ __ClearPageLocked(new_page);
/*
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
* clear SWAP_HAS_CACHE flag.
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 58877312cf6b..7073faecb38f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -929,6 +929,9 @@ int reuse_swap_page(struct page *page)
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (unlikely(PageKsm(page)))
return 0;
+ /* The page is part of THP and cannot be reused */
+ if (PageTransCompound(page))
+ return 0;
count = page_mapcount(page);
if (count <= 1 && PageSwapCache(page)) {
count += page_swapcount(page);
@@ -1145,14 +1148,15 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
if (unlikely(!page))
return -ENOMEM;
- if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg)) {
+ if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
+ &memcg, false)) {
ret = -ENOMEM;
goto out_nolock;
}
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) {
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
ret = 0;
goto out;
}
@@ -1163,11 +1167,11 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
set_pte_at(vma->vm_mm, addr, pte,
pte_mkold(mk_pte(page, vma->vm_page_prot)));
if (page == swapcache) {
- page_add_anon_rmap(page, vma, addr);
- mem_cgroup_commit_charge(page, memcg, true);
+ page_add_anon_rmap(page, vma, addr, false);
+ mem_cgroup_commit_charge(page, memcg, true, false);
} else { /* ksm created a completely new copy */
- page_add_new_anon_rmap(page, vma, addr);
- mem_cgroup_commit_charge(page, memcg, false);
+ page_add_new_anon_rmap(page, vma, addr, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
}
swap_free(entry);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 77fee9325a57..806b0c758c5b 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -63,7 +63,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
__SetPageUptodate(page);
ret = -ENOMEM;
- if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg))
+ if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false))
goto out_release;
_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
@@ -76,8 +76,8 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
goto out_release_uncharge_unlock;
inc_mm_counter(dst_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, dst_vma, dst_addr);
- mem_cgroup_commit_charge(page, memcg, false);
+ page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, dst_vma);
set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
@@ -91,7 +91,7 @@ out:
return ret;
out_release_uncharge_unlock:
pte_unmap_unlock(dst_pte, ptl);
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
out_release:
page_cache_release(page);
goto out;
diff --git a/mm/util.c b/mm/util.c
index 9af1c12b310c..22dae03a4ae1 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -3,6 +3,7 @@
#include <linux/string.h>
#include <linux/compiler.h>
#include <linux/export.h>
+#include <linux/ctype.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/security.h>
@@ -100,6 +101,35 @@ char *kstrndup(const char *s, size_t max, gfp_t gfp)
EXPORT_SYMBOL(kstrndup);
/**
+ * kstrimdup - Trim and copy a %NUL terminated string.
+ * @s: the string to trim and duplicate
+ * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Returns an address, which the caller must kfree, containing
+ * a duplicate of the passed string with leading and/or trailing
+ * whitespace (as defined by isspace) removed.
+ */
+char *kstrimdup(const char *s, gfp_t gfp)
+{
+ char *buf;
+ char *begin = skip_spaces(s);
+ size_t len = strlen(begin);
+
+ while (len && isspace(begin[len - 1]))
+ len--;
+
+ buf = kmalloc_track_caller(len + 1, gfp);
+ if (!buf)
+ return NULL;
+
+ memcpy(buf, begin, len);
+ buf[len] = '\0';
+
+ return buf;
+}
+EXPORT_SYMBOL(kstrimdup);
+
+/**
* kmemdup - duplicate region of memory
*
* @src: memory region to duplicate
@@ -355,7 +385,9 @@ struct anon_vma *page_anon_vma(struct page *page)
struct address_space *page_mapping(struct page *page)
{
- unsigned long mapping;
+ struct address_space *mapping;
+
+ page = compound_head(page);
/* This happens if someone calls flush_dcache_page on slab page */
if (unlikely(PageSlab(page)))
@@ -368,11 +400,25 @@ struct address_space *page_mapping(struct page *page)
return swap_address_space(entry);
}
- mapping = (unsigned long)page->mapping;
- if (mapping & PAGE_MAPPING_FLAGS)
+ mapping = page->mapping;
+ if ((unsigned long)mapping & PAGE_MAPPING_FLAGS)
return NULL;
- return page->mapping;
+ return mapping;
+}
+
+/* Slow path of page_mapcount() for compound pages */
+int __page_mapcount(struct page *page)
+{
+ int ret;
+
+ page = compound_head(page);
+ ret = atomic_read(&page->_mapcount) + 1;
+ ret += atomic_read(compound_mapcount_ptr(page)) + 1;
+ if (PageDoubleMap(page))
+ ret--;
+ return ret;
}
+EXPORT_SYMBOL_GPL(__page_mapcount);
int overcommit_ratio_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index c5afd573d7da..4c25e621a40b 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -38,7 +38,7 @@
* TODO: Make the window size depend on machine size, as we do for vmstat
* thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
*/
-static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
+static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 2;
/*
* These thresholds are used when we account memory pressure through
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2aec4241b42a..a4507ecaefbf 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1184,7 +1184,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* we obviously don't have to worry about waking up a process
* waiting on the page lock, because there are no references.
*/
- __clear_page_locked(page);
+ __ClearPageLocked(page);
free_it:
nr_reclaimed++;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 879a2be23325..45dcbcb5c594 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -842,7 +842,9 @@ const char * const vmstat_text[] = {
"thp_fault_fallback",
"thp_collapse_alloc",
"thp_collapse_alloc_failed",
- "thp_split",
+ "thp_split_page",
+ "thp_split_page_failed",
+ "thp_split_pmd",
"thp_zero_page_alloc",
"thp_zero_page_alloc_failed",
#endif
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 2b3c22808c3b..9f0949b28445 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -433,6 +433,28 @@ our @typeList = (
qr{${Ident}_handler_fn},
@typeListMisordered,
);
+
+our $C90_int_types = qr{(?x:
+ long\s+long\s+int\s+(?:un)?signed|
+ long\s+long\s+(?:un)?signed\s+int|
+ long\s+long\s+(?:un)?signed|
+ (?:(?:un)?signed\s+)?long\s+long\s+int|
+ (?:(?:un)?signed\s+)?long\s+long|
+ int\s+long\s+long\s+(?:un)?signed|
+ int\s+(?:(?:un)?signed\s+)?long\s+long|
+
+ long\s+int\s+(?:un)?signed|
+ long\s+(?:un)?signed\s+int|
+ long\s+(?:un)?signed|
+ (?:(?:un)?signed\s+)?long\s+int|
+ (?:(?:un)?signed\s+)?long|
+ int\s+long\s+(?:un)?signed|
+ int\s+(?:(?:un)?signed\s+)?long|
+
+ int\s+(?:un)?signed|
+ (?:(?:un)?signed\s+)?int
+)};
+
our @typeListFile = ();
our @typeListWithAttr = (
@typeList,
@@ -4517,7 +4539,7 @@ sub process {
#print "LINE<$lines[$ln-1]> len<" . length($lines[$ln-1]) . "\n";
$has_flow_statement = 1 if ($ctx =~ /\b(goto|return)\b/);
- $has_arg_concat = 1 if ($ctx =~ /\#\#/);
+ $has_arg_concat = 1 if ($ctx =~ /\#\#/ && $ctx !~ /\#\#\s*(?:__VA_ARGS__|args)\b/);
$dstat =~ s/^.\s*\#\s*define\s+$Ident(?:\([^\)]*\))?\s*//;
$dstat =~ s/$;//g;
@@ -5241,6 +5263,26 @@ sub process {
}
}
+# check for cast of C90 native int or longer types constants
+ if ($line =~ /(\(\s*$C90_int_types\s*\)\s*)($Constant)\b/) {
+ my $cast = $1;
+ my $const = $2;
+ if (WARN("TYPECAST_INT_CONSTANT",
+ "Unnecessary typecast of c90 int constant\n" . $herecurr) &&
+ $fix) {
+ my $suffix = "";
+ my $newconst = $const;
+ $newconst =~ s/${Int_type}$//;
+ $suffix .= 'U' if ($cast =~ /\bunsigned\b/);
+ if ($cast =~ /\blong\s+long\b/) {
+ $suffix .= 'LL';
+ } elsif ($cast =~ /\blong\b/) {
+ $suffix .= 'L';
+ }
+ $fixed[$fixlinenr] =~ s/\Q$cast\E$const\b/$newconst$suffix/;
+ }
+ }
+
# check for sizeof(&)
if ($line =~ /\bsizeof\s*\(\s*\&/) {
WARN("SIZEOF_ADDRESS",