Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--  mm/memcontrol.c  196
1 file changed, 79 insertions(+), 117 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1c52ddbc839..13b9d0f221b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -54,6 +54,7 @@
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
+#include <linux/lockdep.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
@@ -866,6 +867,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
unsigned long val = 0;
int cpu;
+ get_online_cpus();
for_each_online_cpu(cpu)
val += per_cpu(memcg->stat->events[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
@@ -873,6 +875,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
val += memcg->nocpu_base.events[idx];
spin_unlock(&memcg->pcp_counter_lock);
#endif
+ put_online_cpus();
return val;
}
@@ -2044,6 +2047,12 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
return total;
}
+#ifdef CONFIG_LOCKDEP
+static struct lockdep_map memcg_oom_lock_dep_map = {
+ .name = "memcg_oom_lock",
+};
+#endif
+
static DEFINE_SPINLOCK(memcg_oom_lock);
/*
@@ -2081,7 +2090,8 @@ static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
}
iter->oom_lock = false;
}
- }
+ } else
+ mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
spin_unlock(&memcg_oom_lock);
@@ -2093,6 +2103,7 @@ static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
struct mem_cgroup *iter;
spin_lock(&memcg_oom_lock);
+ mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
for_each_mem_cgroup_tree(iter, memcg)
iter->oom_lock = false;
spin_unlock(&memcg_oom_lock);
@@ -2159,110 +2170,59 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
memcg_wakeup_oom(memcg);
}
-/*
- * try to call OOM killer
- */
static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
- bool locked;
- int wakeups;
-
if (!current->memcg_oom.may_oom)
return;
-
- current->memcg_oom.in_memcg_oom = 1;
-
/*
- * As with any blocking lock, a contender needs to start
- * listening for wakeups before attempting the trylock,
- * otherwise it can miss the wakeup from the unlock and sleep
- * indefinitely. This is just open-coded because our locking
- * is so particular to memcg hierarchies.
+ * We are in the middle of the charge context here, so we
+ * don't want to block when potentially sitting on a callstack
+ * that holds all kinds of filesystem and mm locks.
+ *
+ * Also, the caller may handle a failed allocation gracefully
+ * (like optional page cache readahead) and so an OOM killer
+ * invocation might not even be necessary.
+ *
+ * That's why we don't do anything here except remember the
+ * OOM context and then deal with it at the end of the page
+ * fault when the stack is unwound, the locks are released,
+ * and when we know whether the fault was overall successful.
*/
- wakeups = atomic_read(&memcg->oom_wakeups);
- mem_cgroup_mark_under_oom(memcg);
-
- locked = mem_cgroup_oom_trylock(memcg);
-
- if (locked)
- mem_cgroup_oom_notify(memcg);
-
- if (locked && !memcg->oom_kill_disable) {
- mem_cgroup_unmark_under_oom(memcg);
- mem_cgroup_out_of_memory(memcg, mask, order);
- mem_cgroup_oom_unlock(memcg);
- /*
- * There is no guarantee that an OOM-lock contender
- * sees the wakeups triggered by the OOM kill
- * uncharges. Wake any sleepers explicitely.
- */
- memcg_oom_recover(memcg);
- } else {
- /*
- * A system call can just return -ENOMEM, but if this
- * is a page fault and somebody else is handling the
- * OOM already, we need to sleep on the OOM waitqueue
- * for this memcg until the situation is resolved.
- * Which can take some time because it might be
- * handled by a userspace task.
- *
- * However, this is the charge context, which means
- * that we may sit on a large call stack and hold
- * various filesystem locks, the mmap_sem etc. and we
- * don't want the OOM handler to deadlock on them
- * while we sit here and wait. Store the current OOM
- * context in the task_struct, then return -ENOMEM.
- * At the end of the page fault handler, with the
- * stack unwound, pagefault_out_of_memory() will check
- * back with us by calling
- * mem_cgroup_oom_synchronize(), possibly putting the
- * task to sleep.
- */
- current->memcg_oom.oom_locked = locked;
- current->memcg_oom.wakeups = wakeups;
- css_get(&memcg->css);
- current->memcg_oom.wait_on_memcg = memcg;
- }
+ css_get(&memcg->css);
+ current->memcg_oom.memcg = memcg;
+ current->memcg_oom.gfp_mask = mask;
+ current->memcg_oom.order = order;
}
/**
* mem_cgroup_oom_synchronize - complete memcg OOM handling
+ * @handle: actually kill/wait or just clean up the OOM state
*
- * This has to be called at the end of a page fault if the the memcg
- * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
+ * This has to be called at the end of a page fault if the memcg OOM
+ * handler was enabled.
*
- * Memcg supports userspace OOM handling, so failed allocations must
+ * Memcg supports userspace OOM handling where failed allocations must
* sleep on a waitqueue until the userspace task resolves the
* situation. Sleeping directly in the charge context with all kinds
* of locks held is not a good idea, instead we remember an OOM state
* in the task and mem_cgroup_oom_synchronize() has to be called at
- * the end of the page fault to put the task to sleep and clean up the
- * OOM state.
+ * the end of the page fault to complete the OOM handling.
*
* Returns %true if an ongoing memcg OOM situation was detected and
- * finalized, %false otherwise.
+ * completed, %false otherwise.
*/
-bool mem_cgroup_oom_synchronize(void)
+bool mem_cgroup_oom_synchronize(bool handle)
{
+ struct mem_cgroup *memcg = current->memcg_oom.memcg;
struct oom_wait_info owait;
- struct mem_cgroup *memcg;
+ bool locked;
/* OOM is global, do not handle */
- if (!current->memcg_oom.in_memcg_oom)
- return false;
-
- /*
- * We invoked the OOM killer but there is a chance that a kill
- * did not free up any charges. Everybody else might already
- * be sleeping, so restart the fault and keep the rampage
- * going until some charges are released.
- */
- memcg = current->memcg_oom.wait_on_memcg;
if (!memcg)
- goto out;
+ return false;
- if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
- goto out_memcg;
+ if (!handle)
+ goto cleanup;
owait.memcg = memcg;
owait.wait.flags = 0;
@@ -2271,13 +2231,25 @@ bool mem_cgroup_oom_synchronize(void)
INIT_LIST_HEAD(&owait.wait.task_list);
prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
- /* Only sleep if we didn't miss any wakeups since OOM */
- if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
+ mem_cgroup_mark_under_oom(memcg);
+
+ locked = mem_cgroup_oom_trylock(memcg);
+
+ if (locked)
+ mem_cgroup_oom_notify(memcg);
+
+ if (locked && !memcg->oom_kill_disable) {
+ mem_cgroup_unmark_under_oom(memcg);
+ finish_wait(&memcg_oom_waitq, &owait.wait);
+ mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
+ current->memcg_oom.order);
+ } else {
schedule();
- finish_wait(&memcg_oom_waitq, &owait.wait);
-out_memcg:
- mem_cgroup_unmark_under_oom(memcg);
- if (current->memcg_oom.oom_locked) {
+ mem_cgroup_unmark_under_oom(memcg);
+ finish_wait(&memcg_oom_waitq, &owait.wait);
+ }
+
+ if (locked) {
mem_cgroup_oom_unlock(memcg);
/*
* There is no guarantee that an OOM-lock contender
@@ -2286,10 +2258,9 @@ out_memcg:
*/
memcg_oom_recover(memcg);
}
+cleanup:
+ current->memcg_oom.memcg = NULL;
css_put(&memcg->css);
- current->memcg_oom.wait_on_memcg = NULL;
-out:
- current->memcg_oom.in_memcg_oom = 0;
return true;
}
@@ -2703,6 +2674,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
|| fatal_signal_pending(current)))
goto bypass;
+ if (unlikely(task_in_memcg_oom(current)))
+ goto bypass;
+
/*
* We always charge the cgroup the mm_struct belongs to.
* The mm_struct's mem_cgroup changes on task migration if the
@@ -2800,8 +2774,10 @@ done:
*ptr = memcg;
return 0;
nomem:
- *ptr = NULL;
- return -ENOMEM;
+ if (!(gfp_mask & __GFP_NOFAIL)) {
+ *ptr = NULL;
+ return -ENOMEM;
+ }
bypass:
*ptr = root_mem_cgroup;
return -EINTR;
@@ -3806,8 +3782,7 @@ void mem_cgroup_move_account_page_stat(struct mem_cgroup *from,
{
/* Update stat data for mem_cgroup */
preempt_disable();
- WARN_ON_ONCE(from->stat->count[idx] < nr_pages);
- __this_cpu_add(from->stat->count[idx], -nr_pages);
+ __this_cpu_sub(from->stat->count[idx], nr_pages);
__this_cpu_add(to->stat->count[idx], nr_pages);
preempt_enable();
}
@@ -4983,31 +4958,18 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
} while (usage > 0);
}
-/*
- * This mainly exists for tests during the setting of set of use_hierarchy.
- * Since this is the very setting we are changing, the current hierarchy value
- * is meaningless
- */
-static inline bool __memcg_has_children(struct mem_cgroup *memcg)
-{
- struct cgroup_subsys_state *pos;
-
- /* bounce at first found */
- css_for_each_child(pos, &memcg->css)
- return true;
- return false;
-}
-
-/*
- * Must be called with memcg_create_mutex held, unless the cgroup is guaranteed
- * to be already dead (as in mem_cgroup_force_empty, for instance). This is
- * from mem_cgroup_count_children(), in the sense that we don't really care how
- * many children we have; we only need to know if we have any. It also counts
- * any memcg without hierarchy as infertile.
- */
static inline bool memcg_has_children(struct mem_cgroup *memcg)
{
- return memcg->use_hierarchy && __memcg_has_children(memcg);
+ lockdep_assert_held(&memcg_create_mutex);
+ /*
+ * The lock does not prevent additions to or deletions from the
+ * list of children, but it prevents a new child from being
+ * initialized based on this parent in css_online(), so it's
+ * enough to decide whether hierarchically inherited
+ * attributes can still be changed or not.
+ */
+ return memcg->use_hierarchy &&
+ !list_empty(&memcg->css.cgroup->children);
}
/*
@@ -5087,7 +5049,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
*/
if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
(val == 1 || val == 0)) {
- if (!__memcg_has_children(memcg))
+ if (list_empty(&memcg->css.cgroup->children))
memcg->use_hierarchy = val;
else
retval = -EBUSY;
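
Note on the lockdep hunks above: the memcg OOM lock is an open-coded, hierarchical construct guarded by a spinlock, so lockdep cannot see it by itself. Declaring memcg_oom_lock_dep_map and wrapping the logical lock/unlock points in mutex_acquire()/mutex_release() follows the usual pattern for annotating such locks. The following is a stripped-down sketch of that pattern only, with invented names (example_dep_map, example_trylock(), example_unlock()); it is not code from this diff.

/*
 * Illustrative sketch only, with made-up names.  It mirrors the
 * annotation pattern used in the patch: a static lockdep_map plus
 * mutex_acquire()/mutex_release() around the logical acquire and
 * release of an open-coded lock, so lockdep can track it like a mutex.
 */
#include <linux/lockdep.h>
#include <linux/spinlock.h>

#ifdef CONFIG_LOCKDEP
static struct lockdep_map example_dep_map = {
	.name = "example_lock",
};
#endif

static DEFINE_SPINLOCK(example_spinlock);
static bool example_taken;

static bool example_trylock(void)
{
	bool acquired = false;

	spin_lock(&example_spinlock);
	if (!example_taken) {
		example_taken = true;
		acquired = true;
		/* subclass 0, trylock == 1, as in the hunk above */
		mutex_acquire(&example_dep_map, 0, 1, _RET_IP_);
	}
	spin_unlock(&example_spinlock);

	return acquired;
}

static void example_unlock(void)
{
	spin_lock(&example_spinlock);
	mutex_release(&example_dep_map, 1, _RET_IP_);
	example_taken = false;
	spin_unlock(&example_spinlock);
}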
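
Note on the reworked OOM flow: mem_cgroup_oom() now only records the OOM context in the task (memcg, gfp_mask, order), and mem_cgroup_oom_synchronize(handle) later either performs the kill/wait (handle == true) or merely drops the recorded state (handle == false). Below is a minimal sketch of how an end-of-fault path might consume this API; example_fault_done() and its exact call site are illustrative assumptions, not part of this diff.

/*
 * Illustrative sketch only -- not part of the diff above.  It relies on
 * task_in_memcg_oom() and the two-argument mem_cgroup_oom_synchronize()
 * shown in the patch; example_fault_done() is a made-up hook.
 */
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/sched.h>

static void example_fault_done(struct task_struct *tsk, unsigned int fault_ret)
{
	if (!task_in_memcg_oom(tsk))
		return;

	if (fault_ret & VM_FAULT_OOM) {
		/*
		 * The charge path only recorded the OOM context.  Now that
		 * the stack is unwound and the filesystem/mm locks are
		 * dropped, invoke the OOM killer or sleep until a userspace
		 * OOM handler resolves the situation.
		 */
		mem_cgroup_oom_synchronize(true);
	} else {
		/*
		 * The fault succeeded after all (e.g. the charge was
		 * bypassed); just clean up the recorded OOM state.
		 */
		mem_cgroup_oom_synchronize(false);
	}
}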