From 8b5a5a9dbca914d1f7d70276024d1525a3c94081 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 12:09:54 -0700
Subject: cgroup: deprecate remount option changes

This patch marks the following features for deprecation.

* Rebinding subsys by remount: Never reached useful state - only works
  on empty hierarchies.

* release_agent update by remount: release_agent itself will be
  replaced with conventional fsnotify notification.

v2: Lennart pointed out that "name=" is necessary for mounts w/o any
    controller attached.  Drop "name=" deprecation.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Lennart Poettering <mzxreary@0pointer.de>
---
 Documentation/feature-removal-schedule.txt | 9 +++++++++
 kernel/cgroup.c                            | 5 +++++
 2 files changed, 14 insertions(+)

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index c1be8066ea5..5298e126cea 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -539,3 +539,12 @@ Why:	There appear to be no production users of the get_robust_list syscall,
 	of ASLR. It was only ever intended for debugging, so it should be
 	removed.
 Who:	Kees Cook <keescook@chromium.org>
+
+----------------------------
+
+What:	cgroup option updates via remount
+When:	March 2013
+Why:	Remount currently allows changing bound subsystems and
+	release_agent.  Rebinding is hardly useful as it only works
+	when the hierarchy is empty and release_agent itself should be
+	replaced with conventional fsnotify.
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ed64ccac67c..741c120e06c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1294,6 +1294,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	if (ret)
 		goto out_unlock;
 
+	/* See feature-removal-schedule.txt */
+	if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent)
+		pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
+			   task_tgid_nr(current), current->comm);
+
 	/* Don't allow flags or name to change at remount */
 	if (opts.flags != root->flags ||
 	    (opts.name && strcmp(opts.name, root->name))) {
-- 
cgit v1.2.3


From ff4c8d503e2583b485da46d0ec3dcc29639ab889 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 12:09:54 -0700
Subject: cgroup: move cgroup_clear_directory() call out of
 cgroup_populate_dir()

cgroup_populate_dir() currently clears all files and then repopulate
the directory; however, the clearing part is only useful when it's
called from cgroup_remount().  Relocate the invocation to
cgroup_remount().

This is to prepare for further cgroup file handling updates.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
---
 kernel/cgroup.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 741c120e06c..8f72853131d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1313,7 +1313,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 		goto out_unlock;
 	}
 
-	/* (re)populate subsystem files */
+	/* clear out any existing files and repopulate subsystem files */
+	cgroup_clear_directory(cgrp->dentry);
 	cgroup_populate_dir(cgrp);
 
 	if (opts.release_agent)
@@ -3644,9 +3645,6 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
 	int err;
 	struct cgroup_subsys *ss;
 
-	/* First clear out any existing files */
-	cgroup_clear_directory(cgrp->dentry);
-
 	err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
 	if (err < 0)
 		return err;
-- 
cgit v1.2.3


From b0ca5a84fc3ad8f75bb2b7ac3dc6a77151cd3906 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 12:09:54 -0700
Subject: cgroup: build list of all cgroups under a given cgroupfs_root

Build a list of all cgroups anchored at cgroupfs_root->allcg_list and
going through cgroup->allcg_node.  The list is protected by
cgroup_mutex and will be used to improve cgroup file handling.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
---
 include/linux/cgroup.h |  2 ++
 kernel/cgroup.c        | 10 ++++++++++
 2 files changed, 12 insertions(+)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 5a85b3415c1..ad2a14680b7 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -191,6 +191,8 @@ struct cgroup {
 	 */
 	struct list_head css_sets;
 
+	struct list_head allcg_node;	/* cgroupfs_root->allcg_list */
+
 	/*
 	 * Linked list running through all cgroups that can
 	 * potentially be reaped by the release agent. Protected by
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 8f72853131d..db4319c030d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -127,6 +127,9 @@ struct cgroupfs_root {
 	/* A list running through the active hierarchies */
 	struct list_head root_list;
 
+	/* All cgroups on this root, cgroup_mutex protected */
+	struct list_head allcg_list;
+
 	/* Hierarchy-specific flags */
 	unsigned long flags;
 
@@ -1350,11 +1353,14 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 static void init_cgroup_root(struct cgroupfs_root *root)
 {
 	struct cgroup *cgrp = &root->top_cgroup;
+
 	INIT_LIST_HEAD(&root->subsys_list);
 	INIT_LIST_HEAD(&root->root_list);
+	INIT_LIST_HEAD(&root->allcg_list);
 	root->number_of_cgroups = 1;
 	cgrp->root = root;
 	cgrp->top_cgroup = cgrp;
+	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
 	init_cgroup_housekeeping(cgrp);
 }
 
@@ -3790,6 +3796,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	/* The cgroup directory was pre-locked for us */
 	BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
 
+	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
+
 	err = cgroup_populate_dir(cgrp);
 	/* If err < 0, we have a half-filled directory - oh well ;) */
 
@@ -3998,6 +4006,8 @@ again:
 	list_del_init(&cgrp->sibling);
 	cgroup_unlock_hierarchy(cgrp->root);
 
+	list_del_init(&cgrp->allcg_node);
+
 	d = dget(cgrp->dentry);
 
 	cgroup_d_remove_dir(d);
-- 
cgit v1.2.3


From 8e3f6541d45e1a002801e56a19530a90f775deba Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 12:09:55 -0700
Subject: cgroup: implement cgroup_add_cftypes() and friends

Currently, cgroup directories are populated by subsys->populate()
callback explicitly creating files on each cgroup creation.  This
level of flexibility isn't needed or desirable.  It provides largely
unused flexibility which call for abuses while severely limiting what
the core layer can do through the lack of structure and conventions.

Per each cgroup file type, the only distinction that cgroup users is
making is whether a cgroup is root or not, which can easily be
expressed with flags.

This patch introduces cgroup_add_cftypes().  These deal with cftypes
instead of individual files - controllers indicate that certain types
of files exist for certain subsystem.  Newly added CFTYPE_*_ON_ROOT
flags indicate whether a cftype should be excluded or created only on
the root cgroup.

cgroup_add_cftypes() can be called any time whether the target
subsystem is currently attached or not.  cgroup core will create files
on the existing cgroups as necessary.

Also, cgroup_subsys->base_cftypes is added to ease registration of the
base files for the subsystem.  If non-NULL on subsys init, the cftypes
pointed to by ->base_cftypes are automatically registered on subsys
init / load.

Further patches will convert the existing users and remove the file
based interface.  Note that this interface allows dynamic addition of
files to an active controller.  This will be used for sub-controller
modularity and unified hierarchy in the longer term.

This patch implements the new mechanism but doesn't apply it to any
user.

v2: replaced DECLARE_CGROUP_CFTYPES[_COND]() with
    cgroup_subsys->base_cftypes, which works better for cgroup_subsys
    which is loaded as module.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
---
 include/linux/cgroup.h |  33 ++++++++++++-
 kernel/cgroup.c        | 132 ++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 162 insertions(+), 3 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index ad2a14680b7..af6211c7a42 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -192,6 +192,7 @@ struct cgroup {
 	struct list_head css_sets;
 
 	struct list_head allcg_node;	/* cgroupfs_root->allcg_list */
+	struct list_head cft_q_node;	/* used during cftype add/rm */
 
 	/*
 	 * Linked list running through all cgroups that can
@@ -277,11 +278,17 @@ struct cgroup_map_cb {
  *	- the 'cftype' of the file is file->f_dentry->d_fsdata
  */
 
-#define MAX_CFTYPE_NAME 64
+/* cftype->flags */
+#define CFTYPE_ONLY_ON_ROOT	(1U << 0)	/* only create on root cg */
+#define CFTYPE_NOT_ON_ROOT	(1U << 1)	/* don't create onp root cg */
+
+#define MAX_CFTYPE_NAME		64
+
 struct cftype {
 	/*
 	 * By convention, the name should begin with the name of the
-	 * subsystem, followed by a period
+	 * subsystem, followed by a period.  Zero length string indicates
+	 * end of cftype array.
 	 */
 	char name[MAX_CFTYPE_NAME];
 	int private;
@@ -297,6 +304,9 @@ struct cftype {
 	 */
 	size_t max_write_len;
 
+	/* CFTYPE_* flags */
+	unsigned int flags;
+
 	int (*open)(struct inode *inode, struct file *file);
 	ssize_t (*read)(struct cgroup *cgrp, struct cftype *cft,
 			struct file *file,
@@ -375,6 +385,16 @@ struct cftype {
 			struct eventfd_ctx *eventfd);
 };
 
+/*
+ * cftype_sets describe cftypes belonging to a subsystem and are chained at
+ * cgroup_subsys->cftsets.  Each cftset points to an array of cftypes
+ * terminated by zero length name.
+ */
+struct cftype_set {
+	struct list_head		node;	/* chained at subsys->cftsets */
+	const struct cftype		*cfts;
+};
+
 struct cgroup_scanner {
 	struct cgroup *cg;
 	int (*test_task)(struct task_struct *p, struct cgroup_scanner *scan);
@@ -400,6 +420,8 @@ int cgroup_add_files(struct cgroup *cgrp,
 			const struct cftype cft[],
 			int count);
 
+int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts);
+
 int cgroup_is_removed(const struct cgroup *cgrp);
 
 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);
@@ -502,6 +524,13 @@ struct cgroup_subsys {
 	struct idr idr;
 	spinlock_t id_lock;
 
+	/* list of cftype_sets */
+	struct list_head cftsets;
+
+	/* base cftypes, automatically [de]registered with subsys itself */
+	struct cftype *base_cftypes;
+	struct cftype_set base_cftset;
+
 	/* should be defined only by modular subsystems */
 	struct module *module;
 };
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index db4319c030d..df8fb82ef35 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2623,8 +2623,14 @@ int cgroup_add_file(struct cgroup *cgrp,
 	struct dentry *dentry;
 	int error;
 	umode_t mode;
-
 	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
+
+	/* does @cft->flags tell us to skip creation on @cgrp? */
+	if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
+		return 0;
+	if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
+		return 0;
+
 	if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
 		strcpy(name, subsys->name);
 		strcat(name, ".");
@@ -2660,6 +2666,95 @@ int cgroup_add_files(struct cgroup *cgrp,
 }
 EXPORT_SYMBOL_GPL(cgroup_add_files);
 
+static DEFINE_MUTEX(cgroup_cft_mutex);
+
+static void cgroup_cfts_prepare(void)
+	__acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex)
+{
+	/*
+	 * Thanks to the entanglement with vfs inode locking, we can't walk
+	 * the existing cgroups under cgroup_mutex and create files.
+	 * Instead, we increment reference on all cgroups and build list of
+	 * them using @cgrp->cft_q_node.  Grab cgroup_cft_mutex to ensure
+	 * exclusive access to the field.
+	 */
+	mutex_lock(&cgroup_cft_mutex);
+	mutex_lock(&cgroup_mutex);
+}
+
+static void cgroup_cfts_commit(struct cgroup_subsys *ss,
+			       const struct cftype *cfts)
+	__releases(&cgroup_mutex) __releases(&cgroup_cft_mutex)
+{
+	LIST_HEAD(pending);
+	struct cgroup *cgrp, *n;
+	int count = 0;
+
+	while (cfts[count].name[0] != '\0')
+		count++;
+
+	/* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
+	if (cfts && ss->root != &rootnode) {
+		list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) {
+			dget(cgrp->dentry);
+			list_add_tail(&cgrp->cft_q_node, &pending);
+		}
+	}
+
+	mutex_unlock(&cgroup_mutex);
+
+	/*
+	 * All new cgroups will see @cfts update on @ss->cftsets.  Add/rm
+	 * files for all cgroups which were created before.
+	 */
+	list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) {
+		struct inode *inode = cgrp->dentry->d_inode;
+
+		mutex_lock(&inode->i_mutex);
+		mutex_lock(&cgroup_mutex);
+		if (!cgroup_is_removed(cgrp))
+			cgroup_add_files(cgrp, ss, cfts, count);
+		mutex_unlock(&cgroup_mutex);
+		mutex_unlock(&inode->i_mutex);
+
+		list_del_init(&cgrp->cft_q_node);
+		dput(cgrp->dentry);
+	}
+
+	mutex_unlock(&cgroup_cft_mutex);
+}
+
+/**
+ * cgroup_add_cftypes - add an array of cftypes to a subsystem
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Register @cfts to @ss.  Files described by @cfts are created for all
+ * existing cgroups to which @ss is attached and all future cgroups will
+ * have them too.  This function can be called anytime whether @ss is
+ * attached or not.
+ *
+ * Returns 0 on successful registration, -errno on failure.  Note that this
+ * function currently returns 0 as long as @cfts registration is successful
+ * even if some file creation attempts on existing cgroups fail.
+ */
+int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
+{
+	struct cftype_set *set;
+
+	set = kzalloc(sizeof(*set), GFP_KERNEL);
+	if (!set)
+		return -ENOMEM;
+
+	cgroup_cfts_prepare();
+	set->cfts = cfts;
+	list_add_tail(&set->node, &ss->cftsets);
+	cgroup_cfts_commit(ss, cfts);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
+
 /**
  * cgroup_task_count - count the number of tasks in a cgroup.
  * @cgrp: the cgroup in question
@@ -3660,10 +3755,25 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
 			return err;
 	}
 
+	/* process cftsets of each subsystem */
 	for_each_subsys(cgrp->root, ss) {
+		struct cftype_set *set;
+
 		if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
 			return err;
+
+		list_for_each_entry(set, &ss->cftsets, node) {
+			const struct cftype *cft;
+
+			for (cft = set->cfts; cft->name[0] != '\0'; cft++) {
+				err = cgroup_add_file(cgrp, ss, cft);
+				if (err)
+					pr_warning("cgroup_populate_dir: failed to create %s, err=%d\n",
+						   cft->name, err);
+			}
+		}
 	}
+
 	/* This cgroup is ready now */
 	for_each_subsys(cgrp->root, ss) {
 		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
@@ -4034,12 +4144,29 @@ again:
 	return 0;
 }
 
+static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
+{
+	INIT_LIST_HEAD(&ss->cftsets);
+
+	/*
+	 * base_cftset is embedded in subsys itself, no need to worry about
+	 * deregistration.
+	 */
+	if (ss->base_cftypes) {
+		ss->base_cftset.cfts = ss->base_cftypes;
+		list_add_tail(&ss->base_cftset.node, &ss->cftsets);
+	}
+}
+
 static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 {
 	struct cgroup_subsys_state *css;
 
 	printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
 
+	/* init base cftset */
+	cgroup_init_cftsets(ss);
+
 	/* Create the top cgroup state for this subsystem */
 	list_add(&ss->sibling, &rootnode.subsys_list);
 	ss->root = &rootnode;
@@ -4109,6 +4236,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 		return 0;
 	}
 
+	/* init base cftset */
+	cgroup_init_cftsets(ss);
+
 	/*
 	 * need to register a subsys id before anything else - for example,
 	 * init_cgroup_css needs it.
-- 
cgit v1.2.3


From 6e6ff25bd548a0c5bf5163e4f63e2793dcefbdcb Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 12:09:55 -0700
Subject: cgroup: merge cft_release_agent cftype array into the base files
 array

Now that cftype can express whether a file should only be on root,
cft_release_agent can be merged into the base files cftypes array.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
---
 kernel/cgroup.c | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index df8fb82ef35..66c1e3b5c0f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3732,13 +3732,13 @@ static struct cftype files[] = {
 		.read_u64 = cgroup_clone_children_read,
 		.write_u64 = cgroup_clone_children_write,
 	},
-};
-
-static struct cftype cft_release_agent = {
-	.name = "release_agent",
-	.read_seq_string = cgroup_release_agent_show,
-	.write_string = cgroup_release_agent_write,
-	.max_write_len = PATH_MAX,
+	{
+		.name = "release_agent",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.read_seq_string = cgroup_release_agent_show,
+		.write_string = cgroup_release_agent_write,
+		.max_write_len = PATH_MAX,
+	},
 };
 
 static int cgroup_populate_dir(struct cgroup *cgrp)
@@ -3750,11 +3750,6 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
 	if (err < 0)
 		return err;
 
-	if (cgrp == cgrp->top_cgroup) {
-		if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
-			return err;
-	}
-
 	/* process cftsets of each subsystem */
 	for_each_subsys(cgrp->root, ss) {
 		struct cftype_set *set;
-- 
cgit v1.2.3


From 676f7c8f84d15e94065841529016da5ab92e901b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 12:09:55 -0700
Subject: cgroup: relocate cftype and cgroup_subsys definitions in controllers

blk-cgroup, netprio_cgroup, cls_cgroup and tcp_memcontrol
unnecessarily define cftype array and cgroup_subsys structures at the
top of the file, which is unconventional and necessiates forward
declaration of methods.

This patch relocates those below the definitions of the methods and
removes the forward declarations.  Note that forward declaration of
tcp_files[] is added in tcp_memcontrol.c for tcp_init_cgroup().  This
will be removed soon by another patch.

This patch doesn't introduce any functional change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
---
 block/blk-cgroup.c        | 38 +++++++++++++------------------
 net/core/netprio_cgroup.c | 26 +++++++++------------
 net/ipv4/tcp_memcontrol.c | 57 ++++++++++++++++++++++-------------------------
 net/sched/cls_cgroup.c    | 27 +++++++++-------------
 4 files changed, 65 insertions(+), 83 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index ea84a23d5e6..60ef16d6d15 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -28,34 +28,12 @@ static LIST_HEAD(blkio_list);
 struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
 EXPORT_SYMBOL_GPL(blkio_root_cgroup);
 
-static struct cgroup_subsys_state *blkiocg_create(struct cgroup *);
-static int blkiocg_can_attach(struct cgroup *, struct cgroup_taskset *);
-static void blkiocg_attach(struct cgroup *, struct cgroup_taskset *);
-static void blkiocg_destroy(struct cgroup *);
-static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
-
 /* for encoding cft->private value on file */
 #define BLKIOFILE_PRIVATE(x, val)	(((x) << 16) | (val))
 /* What policy owns the file, proportional or throttle */
 #define BLKIOFILE_POLICY(val)		(((val) >> 16) & 0xffff)
 #define BLKIOFILE_ATTR(val)		((val) & 0xffff)
 
-struct cgroup_subsys blkio_subsys = {
-	.name = "blkio",
-	.create = blkiocg_create,
-	.can_attach = blkiocg_can_attach,
-	.attach = blkiocg_attach,
-	.destroy = blkiocg_destroy,
-	.populate = blkiocg_populate,
-#ifdef CONFIG_BLK_CGROUP
-	/* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */
-	.subsys_id = blkio_subsys_id,
-#endif
-	.use_id = 1,
-	.module = THIS_MODULE,
-};
-EXPORT_SYMBOL_GPL(blkio_subsys);
-
 static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
 					    struct blkio_policy_node *pn)
 {
@@ -1658,6 +1636,22 @@ static void blkiocg_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 	}
 }
 
+struct cgroup_subsys blkio_subsys = {
+	.name = "blkio",
+	.create = blkiocg_create,
+	.can_attach = blkiocg_can_attach,
+	.attach = blkiocg_attach,
+	.destroy = blkiocg_destroy,
+	.populate = blkiocg_populate,
+#ifdef CONFIG_BLK_CGROUP
+	/* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */
+	.subsys_id = blkio_subsys_id,
+#endif
+	.use_id = 1,
+	.module = THIS_MODULE,
+};
+EXPORT_SYMBOL_GPL(blkio_subsys);
+
 void blkio_policy_register(struct blkio_policy_type *blkiop)
 {
 	spin_lock(&blkio_list_lock);
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index ba6900f7390..dbfd4e7f8dc 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -23,21 +23,6 @@
 #include <net/sock.h>
 #include <net/netprio_cgroup.h>
 
-static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp);
-static void cgrp_destroy(struct cgroup *cgrp);
-static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp);
-
-struct cgroup_subsys net_prio_subsys = {
-	.name		= "net_prio",
-	.create		= cgrp_create,
-	.destroy	= cgrp_destroy,
-	.populate	= cgrp_populate,
-#ifdef CONFIG_NETPRIO_CGROUP
-	.subsys_id	= net_prio_subsys_id,
-#endif
-	.module		= THIS_MODULE
-};
-
 #define PRIOIDX_SZ 128
 
 static unsigned long prioidx_map[PRIOIDX_SZ];
@@ -264,6 +249,17 @@ static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
 	return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files));
 }
 
+struct cgroup_subsys net_prio_subsys = {
+	.name		= "net_prio",
+	.create		= cgrp_create,
+	.destroy	= cgrp_destroy,
+	.populate	= cgrp_populate,
+#ifdef CONFIG_NETPRIO_CGROUP
+	.subsys_id	= net_prio_subsys_id,
+#endif
+	.module		= THIS_MODULE
+};
+
 static int netprio_device_event(struct notifier_block *unused,
 				unsigned long event, void *ptr)
 {
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index e795272fbe9..4d3ee407d2d 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -6,36 +6,7 @@
 #include <linux/memcontrol.h>
 #include <linux/module.h>
 
-static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft);
-static int tcp_cgroup_write(struct cgroup *cont, struct cftype *cft,
-			    const char *buffer);
-static int tcp_cgroup_reset(struct cgroup *cont, unsigned int event);
-
-static struct cftype tcp_files[] = {
-	{
-		.name = "kmem.tcp.limit_in_bytes",
-		.write_string = tcp_cgroup_write,
-		.read_u64 = tcp_cgroup_read,
-		.private = RES_LIMIT,
-	},
-	{
-		.name = "kmem.tcp.usage_in_bytes",
-		.read_u64 = tcp_cgroup_read,
-		.private = RES_USAGE,
-	},
-	{
-		.name = "kmem.tcp.failcnt",
-		.private = RES_FAILCNT,
-		.trigger = tcp_cgroup_reset,
-		.read_u64 = tcp_cgroup_read,
-	},
-	{
-		.name = "kmem.tcp.max_usage_in_bytes",
-		.private = RES_MAX_USAGE,
-		.trigger = tcp_cgroup_reset,
-		.read_u64 = tcp_cgroup_read,
-	},
-};
+static struct cftype tcp_files[4];	/* XXX: will be removed soon */
 
 static inline struct tcp_memcontrol *tcp_from_cgproto(struct cg_proto *cg_proto)
 {
@@ -270,3 +241,29 @@ void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx)
 
 	tcp->tcp_prot_mem[idx] = val;
 }
+
+static struct cftype tcp_files[] = {
+	{
+		.name = "kmem.tcp.limit_in_bytes",
+		.write_string = tcp_cgroup_write,
+		.read_u64 = tcp_cgroup_read,
+		.private = RES_LIMIT,
+	},
+	{
+		.name = "kmem.tcp.usage_in_bytes",
+		.read_u64 = tcp_cgroup_read,
+		.private = RES_USAGE,
+	},
+	{
+		.name = "kmem.tcp.failcnt",
+		.private = RES_FAILCNT,
+		.trigger = tcp_cgroup_reset,
+		.read_u64 = tcp_cgroup_read,
+	},
+	{
+		.name = "kmem.tcp.max_usage_in_bytes",
+		.private = RES_MAX_USAGE,
+		.trigger = tcp_cgroup_reset,
+		.read_u64 = tcp_cgroup_read,
+	},
+};
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 1afaa284fcd..024df8a3227 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -22,22 +22,6 @@
 #include <net/sock.h>
 #include <net/cls_cgroup.h>
 
-static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp);
-static void cgrp_destroy(struct cgroup *cgrp);
-static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp);
-
-struct cgroup_subsys net_cls_subsys = {
-	.name		= "net_cls",
-	.create		= cgrp_create,
-	.destroy	= cgrp_destroy,
-	.populate	= cgrp_populate,
-#ifdef CONFIG_NET_CLS_CGROUP
-	.subsys_id	= net_cls_subsys_id,
-#endif
-	.module		= THIS_MODULE,
-};
-
-
 static inline struct cgroup_cls_state *cgrp_cls_state(struct cgroup *cgrp)
 {
 	return container_of(cgroup_subsys_state(cgrp, net_cls_subsys_id),
@@ -93,6 +77,17 @@ static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
 	return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files));
 }
 
+struct cgroup_subsys net_cls_subsys = {
+	.name		= "net_cls",
+	.create		= cgrp_create,
+	.destroy	= cgrp_destroy,
+	.populate	= cgrp_populate,
+#ifdef CONFIG_NET_CLS_CGROUP
+	.subsys_id	= net_cls_subsys_id,
+#endif
+	.module		= THIS_MODULE,
+};
+
 struct cls_cgroup_head {
 	u32			handle;
 	struct tcf_exts		exts;
-- 
cgit v1.2.3


From 4baf6e33251b37f111e21289f8ee71fe4cce236e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 12:09:55 -0700
Subject: cgroup: convert all non-memcg controllers to the new cftype interface

Convert debug, freezer, cpuset, cpu_cgroup, cpuacct, net_prio, blkio,
net_cls and device controllers to use the new cftype based interface.
Termination entry is added to cftype arrays and populate callbacks are
replaced with cgroup_subsys->base_cftypes initializations.

This is functionally identical transformation.  There shouldn't be any
visible behavior change.

memcg is rather special and will be converted separately.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <paul@paulmenage.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Vivek Goyal <vgoyal@redhat.com>
---
 block/blk-cgroup.c        |  9 ++-------
 kernel/cgroup.c           | 10 +++-------
 kernel/cgroup_freezer.c   | 11 +++--------
 kernel/cpuset.c           | 31 ++++++++++---------------------
 kernel/sched/core.c       | 16 ++++------------
 net/core/netprio_cgroup.c |  8 ++------
 net/sched/cls_cgroup.c    |  8 ++------
 security/device_cgroup.c  | 10 ++--------
 8 files changed, 28 insertions(+), 75 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 60ef16d6d15..126c341955d 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1515,14 +1515,9 @@ struct cftype blkio_files[] = {
 		.read_map = blkiocg_file_read_map,
 	},
 #endif
+	{ }	/* terminate */
 };
 
-static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
-{
-	return cgroup_add_files(cgroup, subsys, blkio_files,
-				ARRAY_SIZE(blkio_files));
-}
-
 static void blkiocg_destroy(struct cgroup *cgroup)
 {
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
@@ -1642,11 +1637,11 @@ struct cgroup_subsys blkio_subsys = {
 	.can_attach = blkiocg_can_attach,
 	.attach = blkiocg_attach,
 	.destroy = blkiocg_destroy,
-	.populate = blkiocg_populate,
 #ifdef CONFIG_BLK_CGROUP
 	/* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */
 	.subsys_id = blkio_subsys_id,
 #endif
+	.base_cftypes = blkio_files,
 	.use_id = 1,
 	.module = THIS_MODULE,
 };
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 66c1e3b5c0f..835ae29e4ea 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5349,19 +5349,15 @@ static struct cftype debug_files[] =  {
 		.name = "releasable",
 		.read_u64 = releasable_read,
 	},
-};
 
-static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-{
-	return cgroup_add_files(cont, ss, debug_files,
-				ARRAY_SIZE(debug_files));
-}
+	{ }	/* terminate */
+};
 
 struct cgroup_subsys debug_subsys = {
 	.name = "debug",
 	.create = debug_create,
 	.destroy = debug_destroy,
-	.populate = debug_populate,
 	.subsys_id = debug_subsys_id,
+	.base_cftypes = debug_files,
 };
 #endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index f86e93920b6..3649fc6b3ea 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -358,24 +358,19 @@ static int freezer_write(struct cgroup *cgroup,
 static struct cftype files[] = {
 	{
 		.name = "state",
+		.flags = CFTYPE_NOT_ON_ROOT,
 		.read_seq_string = freezer_read,
 		.write_string = freezer_write,
 	},
+	{ }	/* terminate */
 };
 
-static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
-{
-	if (!cgroup->parent)
-		return 0;
-	return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
-}
-
 struct cgroup_subsys freezer_subsys = {
 	.name		= "freezer",
 	.create		= freezer_create,
 	.destroy	= freezer_destroy,
-	.populate	= freezer_populate,
 	.subsys_id	= freezer_subsys_id,
 	.can_attach	= freezer_can_attach,
 	.fork		= freezer_fork,
+	.base_cftypes	= files,
 };
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b96ad75b7e6..2382683617a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1765,28 +1765,17 @@ static struct cftype files[] = {
 		.write_u64 = cpuset_write_u64,
 		.private = FILE_SPREAD_SLAB,
 	},
-};
-
-static struct cftype cft_memory_pressure_enabled = {
-	.name = "memory_pressure_enabled",
-	.read_u64 = cpuset_read_u64,
-	.write_u64 = cpuset_write_u64,
-	.private = FILE_MEMORY_PRESSURE_ENABLED,
-};
 
-static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-{
-	int err;
+	{
+		.name = "memory_pressure_enabled",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.read_u64 = cpuset_read_u64,
+		.write_u64 = cpuset_write_u64,
+		.private = FILE_MEMORY_PRESSURE_ENABLED,
+	},
 
-	err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
-	if (err)
-		return err;
-	/* memory_pressure_enabled is in root cpuset only */
-	if (!cont->parent)
-		err = cgroup_add_file(cont, ss,
-				      &cft_memory_pressure_enabled);
-	return err;
-}
+	{ }	/* terminate */
+};
 
 /*
  * post_clone() is called during cgroup_create() when the
@@ -1887,9 +1876,9 @@ struct cgroup_subsys cpuset_subsys = {
 	.destroy = cpuset_destroy,
 	.can_attach = cpuset_can_attach,
 	.attach = cpuset_attach,
-	.populate = cpuset_populate,
 	.post_clone = cpuset_post_clone,
 	.subsys_id = cpuset_subsys_id,
+	.base_cftypes = files,
 	.early_init = 1,
 };
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4603b9d8f30..afc6d7e7155 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7970,13 +7970,9 @@ static struct cftype cpu_files[] = {
 		.write_u64 = cpu_rt_period_write_uint,
 	},
 #endif
+	{ }	/* terminate */
 };
 
-static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-{
-	return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
-}
-
 struct cgroup_subsys cpu_cgroup_subsys = {
 	.name		= "cpu",
 	.create		= cpu_cgroup_create,
@@ -7984,8 +7980,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
 	.can_attach	= cpu_cgroup_can_attach,
 	.attach		= cpu_cgroup_attach,
 	.exit		= cpu_cgroup_exit,
-	.populate	= cpu_cgroup_populate,
 	.subsys_id	= cpu_cgroup_subsys_id,
+	.base_cftypes	= cpu_files,
 	.early_init	= 1,
 };
 
@@ -8170,13 +8166,9 @@ static struct cftype files[] = {
 		.name = "stat",
 		.read_map = cpuacct_stats_show,
 	},
+	{ }	/* terminate */
 };
 
-static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
-{
-	return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
-}
-
 /*
  * charge this task's execution time to its accounting group.
  *
@@ -8208,7 +8200,7 @@ struct cgroup_subsys cpuacct_subsys = {
 	.name = "cpuacct",
 	.create = cpuacct_create,
 	.destroy = cpuacct_destroy,
-	.populate = cpuacct_populate,
 	.subsys_id = cpuacct_subsys_id,
+	.base_cftypes = files,
 };
 #endif	/* CONFIG_CGROUP_CPUACCT */
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index dbfd4e7f8dc..b2a59866699 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -242,21 +242,17 @@ static struct cftype ss_files[] = {
 		.read_map = read_priomap,
 		.write_string = write_priomap,
 	},
+	{ }	/* terminate */
 };
 
-static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
-{
-	return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files));
-}
-
 struct cgroup_subsys net_prio_subsys = {
 	.name		= "net_prio",
 	.create		= cgrp_create,
 	.destroy	= cgrp_destroy,
-	.populate	= cgrp_populate,
 #ifdef CONFIG_NETPRIO_CGROUP
 	.subsys_id	= net_prio_subsys_id,
 #endif
+	.base_cftypes	= ss_files,
 	.module		= THIS_MODULE
 };
 
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 024df8a3227..7743ea8d1d3 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -70,21 +70,17 @@ static struct cftype ss_files[] = {
 		.read_u64 = read_classid,
 		.write_u64 = write_classid,
 	},
+	{ }	/* terminate */
 };
 
-static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
-{
-	return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files));
-}
-
 struct cgroup_subsys net_cls_subsys = {
 	.name		= "net_cls",
 	.create		= cgrp_create,
 	.destroy	= cgrp_destroy,
-	.populate	= cgrp_populate,
 #ifdef CONFIG_NET_CLS_CGROUP
 	.subsys_id	= net_cls_subsys_id,
 #endif
+	.base_cftypes	= ss_files,
 	.module		= THIS_MODULE,
 };
 
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index c43a3323fee..442204cc22d 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -447,22 +447,16 @@ static struct cftype dev_cgroup_files[] = {
 		.read_seq_string = devcgroup_seq_read,
 		.private = DEVCG_LIST,
 	},
+	{ }	/* terminate */
 };
 
-static int devcgroup_populate(struct cgroup_subsys *ss,
-				struct cgroup *cgroup)
-{
-	return cgroup_add_files(cgroup, ss, dev_cgroup_files,
-					ARRAY_SIZE(dev_cgroup_files));
-}
-
 struct cgroup_subsys devices_subsys = {
 	.name = "devices",
 	.can_attach = devcgroup_can_attach,
 	.create = devcgroup_create,
 	.destroy = devcgroup_destroy,
-	.populate = devcgroup_populate,
 	.subsys_id = devices_subsys_id,
+	.base_cftypes = dev_cgroup_files,
 };
 
 int __devcgroup_inode_permission(struct inode *inode, int mask)
-- 
cgit v1.2.3


From af36f906c0f4c2ffa0482ecdf856a33dc88ae8c5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 12:09:55 -0700
Subject: memcg: always create memsw files if CONFIG_CGROUP_MEM_RES_CTLR_SWAP

Instead of conditioning creation of memsw files on do_swap_account,
always create the files if compiled-in and fail read/write attempts
with -EOPNOTSUPP if !do_swap_account.

This is suggested by KAMEZAWA to simplify memcg file creation so that
it can use cgroup->subsys_cftypes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
---
 mm/memcontrol.c | 65 +++++++++++++++++++++++++++------------------------------
 1 file changed, 31 insertions(+), 34 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7d698df4a06..97a45b392f7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3879,14 +3879,21 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
 	return val << PAGE_SHIFT;
 }
 
-static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
+static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
+			       struct file *file, char __user *buf,
+			       size_t nbytes, loff_t *ppos)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	char str[64];
 	u64 val;
-	int type, name;
+	int type, name, len;
 
 	type = MEMFILE_TYPE(cft->private);
 	name = MEMFILE_ATTR(cft->private);
+
+	if (!do_swap_account && type == _MEMSWAP)
+		return -EOPNOTSUPP;
+
 	switch (type) {
 	case _MEM:
 		if (name == RES_USAGE)
@@ -3903,7 +3910,9 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
 	default:
 		BUG();
 	}
-	return val;
+
+	len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
+	return simple_read_from_buffer(buf, nbytes, ppos, str, len);
 }
 /*
  * The user of this function is...
@@ -3919,6 +3928,10 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 
 	type = MEMFILE_TYPE(cft->private);
 	name = MEMFILE_ATTR(cft->private);
+
+	if (!do_swap_account && type == _MEMSWAP)
+		return -EOPNOTSUPP;
+
 	switch (name) {
 	case RES_LIMIT:
 		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
@@ -3984,12 +3997,15 @@ out:
 
 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
 {
-	struct mem_cgroup *memcg;
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 	int type, name;
 
-	memcg = mem_cgroup_from_cont(cont);
 	type = MEMFILE_TYPE(event);
 	name = MEMFILE_ATTR(event);
+
+	if (!do_swap_account && type == _MEMSWAP)
+		return -EOPNOTSUPP;
+
 	switch (name) {
 	case RES_MAX_USAGE:
 		if (type == _MEM)
@@ -4655,7 +4671,7 @@ static struct cftype mem_cgroup_files[] = {
 	{
 		.name = "usage_in_bytes",
 		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
-		.read_u64 = mem_cgroup_read,
+		.read = mem_cgroup_read,
 		.register_event = mem_cgroup_usage_register_event,
 		.unregister_event = mem_cgroup_usage_unregister_event,
 	},
@@ -4663,25 +4679,25 @@ static struct cftype mem_cgroup_files[] = {
 		.name = "max_usage_in_bytes",
 		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
 		.trigger = mem_cgroup_reset,
-		.read_u64 = mem_cgroup_read,
+		.read = mem_cgroup_read,
 	},
 	{
 		.name = "limit_in_bytes",
 		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
 		.write_string = mem_cgroup_write,
-		.read_u64 = mem_cgroup_read,
+		.read = mem_cgroup_read,
 	},
 	{
 		.name = "soft_limit_in_bytes",
 		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
 		.write_string = mem_cgroup_write,
-		.read_u64 = mem_cgroup_read,
+		.read = mem_cgroup_read,
 	},
 	{
 		.name = "failcnt",
 		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
 		.trigger = mem_cgroup_reset,
-		.read_u64 = mem_cgroup_read,
+		.read = mem_cgroup_read,
 	},
 	{
 		.name = "stat",
@@ -4721,14 +4737,11 @@ static struct cftype mem_cgroup_files[] = {
 		.mode = S_IRUGO,
 	},
 #endif
-};
-
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
-static struct cftype memsw_cgroup_files[] = {
 	{
 		.name = "memsw.usage_in_bytes",
 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
-		.read_u64 = mem_cgroup_read,
+		.read = mem_cgroup_read,
 		.register_event = mem_cgroup_usage_register_event,
 		.unregister_event = mem_cgroup_usage_unregister_event,
 	},
@@ -4736,35 +4749,22 @@ static struct cftype memsw_cgroup_files[] = {
 		.name = "memsw.max_usage_in_bytes",
 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
 		.trigger = mem_cgroup_reset,
-		.read_u64 = mem_cgroup_read,
+		.read = mem_cgroup_read,
 	},
 	{
 		.name = "memsw.limit_in_bytes",
 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
 		.write_string = mem_cgroup_write,
-		.read_u64 = mem_cgroup_read,
+		.read = mem_cgroup_read,
 	},
 	{
 		.name = "memsw.failcnt",
 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
 		.trigger = mem_cgroup_reset,
-		.read_u64 = mem_cgroup_read,
+		.read = mem_cgroup_read,
 	},
-};
-
-static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
-{
-	if (!do_swap_account)
-		return 0;
-	return cgroup_add_files(cont, ss, memsw_cgroup_files,
-				ARRAY_SIZE(memsw_cgroup_files));
-};
-#else
-static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
-{
-	return 0;
-}
 #endif
+};
 
 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 {
@@ -5046,9 +5046,6 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
 	ret = cgroup_add_files(cont, ss, mem_cgroup_files,
 				ARRAY_SIZE(mem_cgroup_files));
 
-	if (!ret)
-		ret = register_memsw_files(cont, ss);
-
 	if (!ret)
 		ret = register_kmem_files(cont, ss);
 
-- 
cgit v1.2.3


From 6bc103498f5fe512928496fc7802d639cc2d1d20 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 12:09:55 -0700
Subject: cgroup: convert memcg controller to the new cftype interface

Convert memcg to use the new cftype based interface.  kmem support
abuses ->populate() for mem_cgroup_sockets_init() so it can't be
removed at the moment.

tcp_memcontrol is updated so that tcp_files[] is registered via a
__initcall.  This change also allows removing the forward declaration
of tcp_files[].  Removed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Glauber Costa <glommer@parallels.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Greg Thelen <gthelen@google.com>
---
 mm/memcontrol.c           | 12 +++---------
 net/ipv4/tcp_memcontrol.c | 16 ++++++++++------
 2 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 97a45b392f7..bef114258bb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4764,6 +4764,7 @@ static struct cftype mem_cgroup_files[] = {
 		.read = mem_cgroup_read,
 	},
 #endif
+	{ },	/* terminate */
 };
 
 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
@@ -5041,15 +5042,7 @@ static void mem_cgroup_destroy(struct cgroup *cont)
 static int mem_cgroup_populate(struct cgroup_subsys *ss,
 				struct cgroup *cont)
 {
-	int ret;
-
-	ret = cgroup_add_files(cont, ss, mem_cgroup_files,
-				ARRAY_SIZE(mem_cgroup_files));
-
-	if (!ret)
-		ret = register_kmem_files(cont, ss);
-
-	return ret;
+	return register_kmem_files(cont, ss);
 }
 
 #ifdef CONFIG_MMU
@@ -5639,6 +5632,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
 	.can_attach = mem_cgroup_can_attach,
 	.cancel_attach = mem_cgroup_cancel_attach,
 	.attach = mem_cgroup_move_task,
+	.base_cftypes = mem_cgroup_files,
 	.early_init = 0,
 	.use_id = 1,
 };
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index 4d3ee407d2d..8f1753defa5 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -6,8 +6,6 @@
 #include <linux/memcontrol.h>
 #include <linux/module.h>
 
-static struct cftype tcp_files[4];	/* XXX: will be removed soon */
-
 static inline struct tcp_memcontrol *tcp_from_cgproto(struct cg_proto *cg_proto)
 {
 	return container_of(cg_proto, struct tcp_memcontrol, cg_proto);
@@ -36,7 +34,7 @@ int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
 
 	cg_proto = tcp_prot.proto_cgroup(memcg);
 	if (!cg_proto)
-		goto create_files;
+		return 0;
 
 	tcp = tcp_from_cgproto(cg_proto);
 
@@ -59,9 +57,7 @@ int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
 	cg_proto->sockets_allocated = &tcp->tcp_sockets_allocated;
 	cg_proto->memcg = memcg;
 
-create_files:
-	return cgroup_add_files(cgrp, ss, tcp_files,
-				ARRAY_SIZE(tcp_files));
+	return 0;
 }
 EXPORT_SYMBOL(tcp_init_cgroup);
 
@@ -266,4 +262,12 @@ static struct cftype tcp_files[] = {
 		.trigger = tcp_cgroup_reset,
 		.read_u64 = tcp_cgroup_read,
 	},
+	{ }	/* terminate */
 };
+
+static int __init tcp_memcontrol_init(void)
+{
+	WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, tcp_files));
+	return 0;
+}
+__initcall(tcp_memcontrol_init);
-- 
cgit v1.2.3


From db0416b64977cb0f382175608286cc80c7573e38 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 12:09:55 -0700
Subject: cgroup: remove cgroup_add_file[s]()

No controller is using cgroup_add_files[s]().  Unexport them, and
convert cgroup_add_files() to handle NULL entry terminated array
instead of taking count explicitly and continue creation on failure
for internal use.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
---
 include/linux/cgroup.h | 16 ----------------
 kernel/cgroup.c        | 51 ++++++++++++++++++++------------------------------
 2 files changed, 20 insertions(+), 47 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index af6211c7a42..7a3d9675511 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -404,22 +404,6 @@ struct cgroup_scanner {
 	void *data;
 };
 
-/*
- * Add a new file to the given cgroup directory. Should only be
- * called by subsystems from within a populate() method
- */
-int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
-		       const struct cftype *cft);
-
-/*
- * Add a set of new files to the given cgroup directory. Should
- * only be called by subsystems from within a populate() method
- */
-int cgroup_add_files(struct cgroup *cgrp,
-			struct cgroup_subsys *subsys,
-			const struct cftype cft[],
-			int count);
-
 int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts);
 
 int cgroup_is_removed(const struct cgroup *cgrp);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 835ae29e4ea..fb71a930ec8 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2615,9 +2615,8 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
 	return mode;
 }
 
-int cgroup_add_file(struct cgroup *cgrp,
-		       struct cgroup_subsys *subsys,
-		       const struct cftype *cft)
+static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
+			   const struct cftype *cft)
 {
 	struct dentry *dir = cgrp->dentry;
 	struct dentry *dentry;
@@ -2649,22 +2648,23 @@ int cgroup_add_file(struct cgroup *cgrp,
 		error = PTR_ERR(dentry);
 	return error;
 }
-EXPORT_SYMBOL_GPL(cgroup_add_file);
 
-int cgroup_add_files(struct cgroup *cgrp,
-			struct cgroup_subsys *subsys,
-			const struct cftype cft[],
-			int count)
+static int cgroup_add_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
+			    const struct cftype cfts[])
 {
-	int i, err;
-	for (i = 0; i < count; i++) {
-		err = cgroup_add_file(cgrp, subsys, &cft[i]);
-		if (err)
-			return err;
+	const struct cftype *cft;
+	int err, ret = 0;
+
+	for (cft = cfts; cft->name[0] != '\0'; cft++) {
+		err = cgroup_add_file(cgrp, subsys, cft);
+		if (err) {
+			pr_warning("cgroup_add_files: failed to create %s, err=%d\n",
+				   cft->name, err);
+			ret = err;
+		}
 	}
-	return 0;
+	return ret;
 }
-EXPORT_SYMBOL_GPL(cgroup_add_files);
 
 static DEFINE_MUTEX(cgroup_cft_mutex);
 
@@ -2688,10 +2688,6 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
 {
 	LIST_HEAD(pending);
 	struct cgroup *cgrp, *n;
-	int count = 0;
-
-	while (cfts[count].name[0] != '\0')
-		count++;
 
 	/* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
 	if (cfts && ss->root != &rootnode) {
@@ -2713,7 +2709,7 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
 		mutex_lock(&inode->i_mutex);
 		mutex_lock(&cgroup_mutex);
 		if (!cgroup_is_removed(cgrp))
-			cgroup_add_files(cgrp, ss, cfts, count);
+			cgroup_add_files(cgrp, ss, cfts);
 		mutex_unlock(&cgroup_mutex);
 		mutex_unlock(&inode->i_mutex);
 
@@ -3739,6 +3735,7 @@ static struct cftype files[] = {
 		.write_string = cgroup_release_agent_write,
 		.max_write_len = PATH_MAX,
 	},
+	{ }	/* terminate */
 };
 
 static int cgroup_populate_dir(struct cgroup *cgrp)
@@ -3746,7 +3743,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
 	int err;
 	struct cgroup_subsys *ss;
 
-	err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
+	err = cgroup_add_files(cgrp, NULL, files);
 	if (err < 0)
 		return err;
 
@@ -3757,16 +3754,8 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
 		if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
 			return err;
 
-		list_for_each_entry(set, &ss->cftsets, node) {
-			const struct cftype *cft;
-
-			for (cft = set->cfts; cft->name[0] != '\0'; cft++) {
-				err = cgroup_add_file(cgrp, ss, cft);
-				if (err)
-					pr_warning("cgroup_populate_dir: failed to create %s, err=%d\n",
-						   cft->name, err);
-			}
-		}
+		list_for_each_entry(set, &ss->cftsets, node)
+			cgroup_add_files(cgrp, ss, set->cfts);
 	}
 
 	/* This cgroup is ready now */
-- 
cgit v1.2.3


From f6ea93723d0049ae5fbbb5292cb10c51d7a80801 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 12:09:55 -0700
Subject: cgroup: relocate __d_cgrp() and __d_cft()

Move the two macros upwards as they'll be used earlier in the file.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
---
 kernel/cgroup.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index fb71a930ec8..2ab29eb381b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -282,6 +282,16 @@ list_for_each_entry(_ss, &_root->subsys_list, sibling)
 #define for_each_active_root(_root) \
 list_for_each_entry(_root, &roots, root_list)
 
+static inline struct cgroup *__d_cgrp(struct dentry *dentry)
+{
+	return dentry->d_fsdata;
+}
+
+static inline struct cftype *__d_cft(struct dentry *dentry)
+{
+	return dentry->d_fsdata;
+}
+
 /* the list of cgroups eligible for automatic release. Protected by
  * release_list_lock */
 static LIST_HEAD(release_list);
@@ -1704,16 +1714,6 @@ static struct file_system_type cgroup_fs_type = {
 
 static struct kobject *cgroup_kobj;
 
-static inline struct cgroup *__d_cgrp(struct dentry *dentry)
-{
-	return dentry->d_fsdata;
-}
-
-static inline struct cftype *__d_cft(struct dentry *dentry)
-{
-	return dentry->d_fsdata;
-}
-
 /**
  * cgroup_path - generate the path of a cgroup
  * @cgrp: the cgroup in question
-- 
cgit v1.2.3


From 05ef1d7c4a5b6d09cadd2b8e9b3c395363d1a89c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 12:09:56 -0700
Subject: cgroup: introduce struct cfent

This patch adds cfent (cgroup file entry) which is the association
between a cgroup and a file.  This is in-cgroup representation of
files under a cgroup directory.  This simplifies walking walking
cgroup files and thus cgroup_clear_directory(), which is now
implemented in two parts - cgroup_rm_file() and a loop around it.

cgroup_rm_file() will be used to implement cftype removal and cfent is
scheduled to serve cgroup specific per-file data (e.g. for sysfs-like
"sever" semantics).

v2: - cfe was freed from cgroup_rm_file() which led to use-after-free
      if the file had openers at the time of removal.  Moved to
      cgroup_diput().

    - cgroup_clear_directory() triggered WARN_ON_ONCE() if d_subdirs
      wasn't empty after removing all files.  This triggered
      spuriously if some files were open during directory clearing.
      Removed.

v3: - In cgroup_diput(), WARN_ONCE(!list_empty(&cfe->node)) could be
      spuriously triggered for root cgroups because they don't go
      through cgroup_clear_directory() on unmount.  Don't trigger WARN
      for root cgroups.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Glauber Costa <glommer@parallels.com>
---
 include/linux/cgroup.h |   1 +
 kernel/cgroup.c        | 113 +++++++++++++++++++++++++++++++++----------------
 2 files changed, 78 insertions(+), 36 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 7a3d9675511..87b034ed0c3 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -175,6 +175,7 @@ struct cgroup {
 	 */
 	struct list_head sibling;	/* my parent's children */
 	struct list_head children;	/* my children */
+	struct list_head files;		/* my files */
 
 	struct cgroup *parent;		/* my parent */
 	struct dentry __rcu *dentry;	/* cgroup fs entry, RCU protected */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2ab29eb381b..7d5d1c927d9 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -147,6 +147,15 @@ struct cgroupfs_root {
  */
 static struct cgroupfs_root rootnode;
 
+/*
+ * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
+ */
+struct cfent {
+	struct list_head		node;
+	struct dentry			*dentry;
+	struct cftype			*type;
+};
+
 /*
  * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
  * cgroup_subsys->use_id != 0.
@@ -287,11 +296,16 @@ static inline struct cgroup *__d_cgrp(struct dentry *dentry)
 	return dentry->d_fsdata;
 }
 
-static inline struct cftype *__d_cft(struct dentry *dentry)
+static inline struct cfent *__d_cfe(struct dentry *dentry)
 {
 	return dentry->d_fsdata;
 }
 
+static inline struct cftype *__d_cft(struct dentry *dentry)
+{
+	return __d_cfe(dentry)->type;
+}
+
 /* the list of cgroups eligible for automatic release. Protected by
  * release_list_lock */
 static LIST_HEAD(release_list);
@@ -877,6 +891,14 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 		BUG_ON(!list_empty(&cgrp->pidlists));
 
 		kfree_rcu(cgrp, rcu_head);
+	} else {
+		struct cfent *cfe = __d_cfe(dentry);
+		struct cgroup *cgrp = dentry->d_parent->d_fsdata;
+
+		WARN_ONCE(!list_empty(&cfe->node) &&
+			  cgrp != &cgrp->root->top_cgroup,
+			  "cfe still linked for %s\n", cfe->type->name);
+		kfree(cfe);
 	}
 	iput(inode);
 }
@@ -895,34 +917,36 @@ static void remove_dir(struct dentry *d)
 	dput(parent);
 }
 
-static void cgroup_clear_directory(struct dentry *dentry)
-{
-	struct list_head *node;
-
-	BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
-	spin_lock(&dentry->d_lock);
-	node = dentry->d_subdirs.next;
-	while (node != &dentry->d_subdirs) {
-		struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
-
-		spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
-		list_del_init(node);
-		if (d->d_inode) {
-			/* This should never be called on a cgroup
-			 * directory with child cgroups */
-			BUG_ON(d->d_inode->i_mode & S_IFDIR);
-			dget_dlock(d);
-			spin_unlock(&d->d_lock);
-			spin_unlock(&dentry->d_lock);
-			d_delete(d);
-			simple_unlink(dentry->d_inode, d);
-			dput(d);
-			spin_lock(&dentry->d_lock);
-		} else
-			spin_unlock(&d->d_lock);
-		node = dentry->d_subdirs.next;
+static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
+{
+	struct cfent *cfe;
+
+	lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
+	lockdep_assert_held(&cgroup_mutex);
+
+	list_for_each_entry(cfe, &cgrp->files, node) {
+		struct dentry *d = cfe->dentry;
+
+		if (cft && cfe->type != cft)
+			continue;
+
+		dget(d);
+		d_delete(d);
+		simple_unlink(d->d_inode, d);
+		list_del_init(&cfe->node);
+		dput(d);
+
+		return 0;
 	}
-	spin_unlock(&dentry->d_lock);
+	return -ENOENT;
+}
+
+static void cgroup_clear_directory(struct dentry *dir)
+{
+	struct cgroup *cgrp = __d_cgrp(dir);
+
+	while (!list_empty(&cgrp->files))
+		cgroup_rm_file(cgrp, NULL);
 }
 
 /*
@@ -1352,6 +1376,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 {
 	INIT_LIST_HEAD(&cgrp->sibling);
 	INIT_LIST_HEAD(&cgrp->children);
+	INIT_LIST_HEAD(&cgrp->files);
 	INIT_LIST_HEAD(&cgrp->css_sets);
 	INIT_LIST_HEAD(&cgrp->release_list);
 	INIT_LIST_HEAD(&cgrp->pidlists);
@@ -2619,7 +2644,9 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 			   const struct cftype *cft)
 {
 	struct dentry *dir = cgrp->dentry;
+	struct cgroup *parent = __d_cgrp(dir);
 	struct dentry *dentry;
+	struct cfent *cfe;
 	int error;
 	umode_t mode;
 	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
@@ -2635,17 +2662,31 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 		strcat(name, ".");
 	}
 	strcat(name, cft->name);
+
 	BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
+
+	cfe = kzalloc(sizeof(*cfe), GFP_KERNEL);
+	if (!cfe)
+		return -ENOMEM;
+
 	dentry = lookup_one_len(name, dir, strlen(name));
-	if (!IS_ERR(dentry)) {
-		mode = cgroup_file_mode(cft);
-		error = cgroup_create_file(dentry, mode | S_IFREG,
-						cgrp->root->sb);
-		if (!error)
-			dentry->d_fsdata = (void *)cft;
-		dput(dentry);
-	} else
+	if (IS_ERR(dentry)) {
 		error = PTR_ERR(dentry);
+		goto out;
+	}
+
+	mode = cgroup_file_mode(cft);
+	error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
+	if (!error) {
+		cfe->type = (void *)cft;
+		cfe->dentry = dentry;
+		dentry->d_fsdata = cfe;
+		list_add_tail(&cfe->node, &parent->files);
+		cfe = NULL;
+	}
+	dput(dentry);
+out:
+	kfree(cfe);
 	return error;
 }
 
-- 
cgit v1.2.3


From 79578621b4847afdef48d19a28d00e3b188c37e1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 12:09:56 -0700
Subject: cgroup: implement cgroup_rm_cftypes()

Implement cgroup_rm_cftypes() which removes an array of cftypes from a
subsystem.  It can be called whether the target subsys is attached or
not.  cgroup core will remove the specified file from all existing
cgroups.

This will be used to improve sub-subsys modularity and will be helpful
for unified hierarchy.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
---
 include/linux/cgroup.h |  1 +
 kernel/cgroup.c        | 54 ++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 45 insertions(+), 10 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 87b034ed0c3..028478c6e0c 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -406,6 +406,7 @@ struct cgroup_scanner {
 };
 
 int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts);
+int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts);
 
 int cgroup_is_removed(const struct cgroup *cgrp);
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7d5d1c927d9..21bba772235 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2690,17 +2690,20 @@ out:
 	return error;
 }
 
-static int cgroup_add_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
-			    const struct cftype cfts[])
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
+			      const struct cftype cfts[], bool is_add)
 {
 	const struct cftype *cft;
 	int err, ret = 0;
 
 	for (cft = cfts; cft->name[0] != '\0'; cft++) {
-		err = cgroup_add_file(cgrp, subsys, cft);
+		if (is_add)
+			err = cgroup_add_file(cgrp, subsys, cft);
+		else
+			err = cgroup_rm_file(cgrp, cft);
 		if (err) {
-			pr_warning("cgroup_add_files: failed to create %s, err=%d\n",
-				   cft->name, err);
+			pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
+				   is_add ? "add" : "remove", cft->name, err);
 			ret = err;
 		}
 	}
@@ -2724,7 +2727,7 @@ static void cgroup_cfts_prepare(void)
 }
 
 static void cgroup_cfts_commit(struct cgroup_subsys *ss,
-			       const struct cftype *cfts)
+			       const struct cftype *cfts, bool is_add)
 	__releases(&cgroup_mutex) __releases(&cgroup_cft_mutex)
 {
 	LIST_HEAD(pending);
@@ -2750,7 +2753,7 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
 		mutex_lock(&inode->i_mutex);
 		mutex_lock(&cgroup_mutex);
 		if (!cgroup_is_removed(cgrp))
-			cgroup_add_files(cgrp, ss, cfts);
+			cgroup_addrm_files(cgrp, ss, cfts, is_add);
 		mutex_unlock(&cgroup_mutex);
 		mutex_unlock(&inode->i_mutex);
 
@@ -2786,12 +2789,43 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
 	cgroup_cfts_prepare();
 	set->cfts = cfts;
 	list_add_tail(&set->node, &ss->cftsets);
-	cgroup_cfts_commit(ss, cfts);
+	cgroup_cfts_commit(ss, cfts, true);
 
 	return 0;
 }
 EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
 
+/**
+ * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Unregister @cfts from @ss.  Files described by @cfts are removed from
+ * all existing cgroups to which @ss is attached and all future cgroups
+ * won't have them either.  This function can be called anytime whether @ss
+ * is attached or not.
+ *
+ * Returns 0 on successful unregistration, -ENOENT if @cfts is not
+ * registered with @ss.
+ */
+int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
+{
+	struct cftype_set *set;
+
+	cgroup_cfts_prepare();
+
+	list_for_each_entry(set, &ss->cftsets, node) {
+		if (set->cfts == cfts) {
+			list_del_init(&set->node);
+			cgroup_cfts_commit(ss, cfts, false);
+			return 0;
+		}
+	}
+
+	cgroup_cfts_commit(ss, NULL, false);
+	return -ENOENT;
+}
+
 /**
  * cgroup_task_count - count the number of tasks in a cgroup.
  * @cgrp: the cgroup in question
@@ -3784,7 +3818,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
 	int err;
 	struct cgroup_subsys *ss;
 
-	err = cgroup_add_files(cgrp, NULL, files);
+	err = cgroup_addrm_files(cgrp, NULL, files, true);
 	if (err < 0)
 		return err;
 
@@ -3796,7 +3830,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
 			return err;
 
 		list_for_each_entry(set, &ss->cftsets, node)
-			cgroup_add_files(cgrp, ss, set->cfts);
+			cgroup_addrm_files(cgrp, ss, set->cfts, true);
 	}
 
 	/* This cgroup is ready now */
-- 
cgit v1.2.3


From 28b4c27b8e6bb6d7ff2875281a8484f8898a87ef Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 12:09:56 -0700
Subject: cgroup: use negative bias on css->refcnt to block css_tryget()

When a cgroup is about to be removed, cgroup_clear_css_refs() is
called to check and ensure that there are no active css references.

This is currently achieved by dropping the refcnt to zero iff it has
only the base ref.  If all css refs could be dropped to zero, ref
clearing is successful and CSS_REMOVED is set on all css.  If not, the
base ref is restored.  While css ref is zero w/o CSS_REMOVED set, any
css_tryget() attempt on it busy loops so that they are atomic
w.r.t. the whole css ref clearing.

This does work but dropping and re-instating the base ref is somewhat
hairy and makes it difficult to add more logic to the put path as
there are two of them - the regular css_put() and the reversible base
ref clearing.

This patch updates css ref clearing such that blocking new
css_tryget() and putting the base ref are separate operations.
CSS_DEACT_BIAS, defined as INT_MIN, is added to css->refcnt and
css_tryget() busy loops while refcnt is negative.  After all css refs
are deactivated, if they were all one, ref clearing succeeded and
CSS_REMOVED is set and the base ref is put using the regular
css_put(); otherwise, CSS_DEACT_BIAS is subtracted from the refcnts
and the original postive values are restored.

css_refcnt() accessor which always returns the unbiased positive
reference counts is added and used to simplify refcnt usages.  While
at it, relocate and reformat comments in cgroup_has_css_refs().

This separates css->refcnt deactivation and putting the base ref,
which enables the next patch to make ref clearing optional.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
---
 include/linux/cgroup.h |  12 ++---
 kernel/cgroup.c        | 119 +++++++++++++++++++++++++++++--------------------
 2 files changed, 75 insertions(+), 56 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 028478c6e0c..be81fafae11 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -115,16 +115,12 @@ static inline bool css_is_removed(struct cgroup_subsys_state *css)
  * the css has been destroyed.
  */
 
+extern bool __css_tryget(struct cgroup_subsys_state *css);
 static inline bool css_tryget(struct cgroup_subsys_state *css)
 {
 	if (test_bit(CSS_ROOT, &css->flags))
 		return true;
-	while (!atomic_inc_not_zero(&css->refcnt)) {
-		if (test_bit(CSS_REMOVED, &css->flags))
-			return false;
-		cpu_relax();
-	}
-	return true;
+	return __css_tryget(css);
 }
 
 /*
@@ -132,11 +128,11 @@ static inline bool css_tryget(struct cgroup_subsys_state *css)
  * css_get() or css_tryget()
  */
 
-extern void __css_put(struct cgroup_subsys_state *css, int count);
+extern void __css_put(struct cgroup_subsys_state *css);
 static inline void css_put(struct cgroup_subsys_state *css)
 {
 	if (!test_bit(CSS_ROOT, &css->flags))
-		__css_put(css, 1);
+		__css_put(css);
 }
 
 /* bits in struct cgroup flags field */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 21bba772235..2eade518660 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -63,6 +63,9 @@
 
 #include <linux/atomic.h>
 
+/* css deactivation bias, makes css->refcnt negative to deny new trygets */
+#define CSS_DEACT_BIAS		INT_MIN
+
 /*
  * cgroup_mutex is the master lock.  Any modification to cgroup or its
  * hierarchy must be performed while holding it.
@@ -251,6 +254,14 @@ int cgroup_lock_is_held(void)
 
 EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
 
+/* the current nr of refs, always >= 0 whether @css is deactivated or not */
+static int css_refcnt(struct cgroup_subsys_state *css)
+{
+	int v = atomic_read(&css->refcnt);
+
+	return v >= 0 ? v : v - CSS_DEACT_BIAS;
+}
+
 /* convenient tests for these bits */
 inline int cgroup_is_removed(const struct cgroup *cgrp)
 {
@@ -4006,18 +4017,19 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	return cgroup_create(c_parent, dentry, mode | S_IFDIR);
 }
 
+/*
+ * Check the reference count on each subsystem. Since we already
+ * established that there are no tasks in the cgroup, if the css refcount
+ * is also 1, then there should be no outstanding references, so the
+ * subsystem is safe to destroy. We scan across all subsystems rather than
+ * using the per-hierarchy linked list of mounted subsystems since we can
+ * be called via check_for_release() with no synchronization other than
+ * RCU, and the subsystem linked list isn't RCU-safe.
+ */
 static int cgroup_has_css_refs(struct cgroup *cgrp)
 {
-	/* Check the reference count on each subsystem. Since we
-	 * already established that there are no tasks in the
-	 * cgroup, if the css refcount is also 1, then there should
-	 * be no outstanding references, so the subsystem is safe to
-	 * destroy. We scan across all subsystems rather than using
-	 * the per-hierarchy linked list of mounted subsystems since
-	 * we can be called via check_for_release() with no
-	 * synchronization other than RCU, and the subsystem linked
-	 * list isn't RCU-safe */
 	int i;
+
 	/*
 	 * We won't need to lock the subsys array, because the subsystems
 	 * we're concerned about aren't going anywhere since our cgroup root
@@ -4026,17 +4038,21 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
 		struct cgroup_subsys_state *css;
+
 		/* Skip subsystems not present or not in this hierarchy */
 		if (ss == NULL || ss->root != cgrp->root)
 			continue;
+
 		css = cgrp->subsys[ss->subsys_id];
-		/* When called from check_for_release() it's possible
+		/*
+		 * When called from check_for_release() it's possible
 		 * that by this point the cgroup has been removed
 		 * and the css deleted. But a false-positive doesn't
 		 * matter, since it can only happen if the cgroup
 		 * has been deleted and hence no longer needs the
-		 * release agent to be called anyway. */
-		if (css && (atomic_read(&css->refcnt) > 1))
+		 * release agent to be called anyway.
+		 */
+		if (css && css_refcnt(css) > 1)
 			return 1;
 	}
 	return 0;
@@ -4053,44 +4069,37 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp)
 	struct cgroup_subsys *ss;
 	unsigned long flags;
 	bool failed = false;
+
 	local_irq_save(flags);
+
+	/*
+	 * Block new css_tryget() by deactivating refcnt.  If all refcnts
+	 * were 1 at the moment of deactivation, we succeeded.
+	 */
 	for_each_subsys(cgrp->root, ss) {
 		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
-		int refcnt;
-		while (1) {
-			/* We can only remove a CSS with a refcnt==1 */
-			refcnt = atomic_read(&css->refcnt);
-			if (refcnt > 1) {
-				failed = true;
-				goto done;
-			}
-			BUG_ON(!refcnt);
-			/*
-			 * Drop the refcnt to 0 while we check other
-			 * subsystems. This will cause any racing
-			 * css_tryget() to spin until we set the
-			 * CSS_REMOVED bits or abort
-			 */
-			if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
-				break;
-			cpu_relax();
-		}
+
+		WARN_ON(atomic_read(&css->refcnt) < 0);
+		atomic_add(CSS_DEACT_BIAS, &css->refcnt);
+		failed |= css_refcnt(css) != 1;
 	}
- done:
+
+	/*
+	 * If succeeded, set REMOVED and put all the base refs; otherwise,
+	 * restore refcnts to positive values.  Either way, all in-progress
+	 * css_tryget() will be released.
+	 */
 	for_each_subsys(cgrp->root, ss) {
 		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
-		if (failed) {
-			/*
-			 * Restore old refcnt if we previously managed
-			 * to clear it from 1 to 0
-			 */
-			if (!atomic_read(&css->refcnt))
-				atomic_set(&css->refcnt, 1);
-		} else {
-			/* Commit the fact that the CSS is removed */
+
+		if (!failed) {
 			set_bit(CSS_REMOVED, &css->flags);
+			css_put(css);
+		} else {
+			atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
 		}
 	}
+
 	local_irq_restore(flags);
 	return !failed;
 }
@@ -4887,13 +4896,28 @@ static void check_for_release(struct cgroup *cgrp)
 }
 
 /* Caller must verify that the css is not for root cgroup */
-void __css_put(struct cgroup_subsys_state *css, int count)
+bool __css_tryget(struct cgroup_subsys_state *css)
+{
+	do {
+		int v = css_refcnt(css);
+
+		if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v)
+			return true;
+		cpu_relax();
+	} while (!test_bit(CSS_REMOVED, &css->flags));
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(__css_tryget);
+
+/* Caller must verify that the css is not for root cgroup */
+void __css_put(struct cgroup_subsys_state *css)
 {
 	struct cgroup *cgrp = css->cgroup;
-	int val;
+
 	rcu_read_lock();
-	val = atomic_sub_return(count, &css->refcnt);
-	if (val == 1) {
+	atomic_dec(&css->refcnt);
+	if (css_refcnt(css) == 1) {
 		if (notify_on_release(cgrp)) {
 			set_bit(CGRP_RELEASABLE, &cgrp->flags);
 			check_for_release(cgrp);
@@ -4901,7 +4925,6 @@ void __css_put(struct cgroup_subsys_state *css, int count)
 		cgroup_wakeup_rmdir_waiter(cgrp);
 	}
 	rcu_read_unlock();
-	WARN_ON_ONCE(val < 1);
 }
 EXPORT_SYMBOL_GPL(__css_put);
 
@@ -5020,7 +5043,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
 	 * on this or this is under rcu_read_lock(). Once css->id is allocated,
 	 * it's unchanged until freed.
 	 */
-	cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
+	cssid = rcu_dereference_check(css->id, css_refcnt(css));
 
 	if (cssid)
 		return cssid->id;
@@ -5032,7 +5055,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
 {
 	struct css_id *cssid;
 
-	cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
+	cssid = rcu_dereference_check(css->id, css_refcnt(css));
 
 	if (cssid)
 		return cssid->depth;
-- 
cgit v1.2.3


From 48ddbe194623ae089cc0576e60363f2d2e85662a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 12:09:56 -0700
Subject: cgroup: make css->refcnt clearing on cgroup removal optional

Currently, cgroup removal tries to drain all css references.  If there
are active css references, the removal logic waits and retries
->pre_detroy() until either all refs drop to zero or removal is
cancelled.

This semantics is unusual and adds non-trivial complexity to cgroup
core and IMHO is fundamentally misguided in that it couples internal
implementation details (references to internal data structure) with
externally visible operation (rmdir).  To userland, this is a behavior
peculiarity which is unnecessary and difficult to expect (css refs is
otherwise invisible from userland), and, to policy implementations,
this is an unnecessary restriction (e.g. blkcg wants to hold css refs
for caching purposes but can't as that becomes visible as rmdir hang).

Unfortunately, memcg currently depends on ->pre_destroy() retrials and
cgroup removal vetoing and can't be immmediately switched to the new
behavior.  This patch introduces the new behavior of not waiting for
css refs to drain and maintains the old behavior for subsystems which
have __DEPRECATED_clear_css_refs set.

Once, memcg is updated, we can drop the code paths for the old
behavior as proposed in the following patch.  Note that the following
patch is incorrect in that dput work item is in cgroup and may lose
some of dputs when multiples css's are released back-to-back, and
__css_put() triggers check_for_release() when refcnt reaches 0 instead
of 1; however, it shows what part can be removed.

  http://thread.gmane.org/gmane.linux.kernel.containers/22559/focus=75251

Note that, in not-too-distant future, cgroup core will start emitting
warning messages for subsys which require the old behavior, so please
get moving.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 include/linux/cgroup.h | 17 ++++++++++++
 kernel/cgroup.c        | 71 +++++++++++++++++++++++++++++++++++++++++++-------
 mm/memcontrol.c        |  1 +
 3 files changed, 80 insertions(+), 9 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index be81fafae11..565c8034e6c 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -16,6 +16,7 @@
 #include <linux/prio_heap.h>
 #include <linux/rwsem.h>
 #include <linux/idr.h>
+#include <linux/workqueue.h>
 
 #ifdef CONFIG_CGROUPS
 
@@ -76,12 +77,16 @@ struct cgroup_subsys_state {
 	unsigned long flags;
 	/* ID for this css, if possible */
 	struct css_id __rcu *id;
+
+	/* Used to put @cgroup->dentry on the last css_put() */
+	struct work_struct dput_work;
 };
 
 /* bits in struct cgroup_subsys_state flags field */
 enum {
 	CSS_ROOT, /* This CSS is the root of the subsystem */
 	CSS_REMOVED, /* This CSS is dead */
+	CSS_CLEAR_CSS_REFS,		/* @ss->__DEPRECATED_clear_css_refs */
 };
 
 /* Caller must verify that the css is not for root cgroup */
@@ -480,6 +485,18 @@ struct cgroup_subsys {
 	 * (not available in early_init time.)
 	 */
 	bool use_id;
+
+	/*
+	 * If %true, cgroup removal will try to clear css refs by retrying
+	 * ss->pre_destroy() until there's no css ref left.  This behavior
+	 * is strictly for backward compatibility and will be removed as
+	 * soon as the current user (memcg) is updated.
+	 *
+	 * If %false, ss->pre_destroy() can't fail and cgroup removal won't
+	 * wait for css refs to drop to zero before proceeding.
+	 */
+	bool __DEPRECATED_clear_css_refs;
+
 #define MAX_CGROUP_TYPE_NAMELEN 32
 	const char *name;
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2eade518660..2905977e0f3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -854,12 +854,17 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
 	struct cgroup_subsys *ss;
 	int ret = 0;
 
-	for_each_subsys(cgrp->root, ss)
-		if (ss->pre_destroy) {
-			ret = ss->pre_destroy(cgrp);
-			if (ret)
-				break;
+	for_each_subsys(cgrp->root, ss) {
+		if (!ss->pre_destroy)
+			continue;
+
+		ret = ss->pre_destroy(cgrp);
+		if (ret) {
+			/* ->pre_destroy() failure is being deprecated */
+			WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
+			break;
 		}
+	}
 
 	return ret;
 }
@@ -3859,6 +3864,14 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
 	return 0;
 }
 
+static void css_dput_fn(struct work_struct *work)
+{
+	struct cgroup_subsys_state *css =
+		container_of(work, struct cgroup_subsys_state, dput_work);
+
+	dput(css->cgroup->dentry);
+}
+
 static void init_cgroup_css(struct cgroup_subsys_state *css,
 			       struct cgroup_subsys *ss,
 			       struct cgroup *cgrp)
@@ -3871,6 +3884,16 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
 		set_bit(CSS_ROOT, &css->flags);
 	BUG_ON(cgrp->subsys[ss->subsys_id]);
 	cgrp->subsys[ss->subsys_id] = css;
+
+	/*
+	 * If !clear_css_refs, css holds an extra ref to @cgrp->dentry
+	 * which is put on the last css_put().  dput() requires process
+	 * context, which css_put() may be called without.  @css->dput_work
+	 * will be used to invoke dput() asynchronously from css_put().
+	 */
+	INIT_WORK(&css->dput_work, css_dput_fn);
+	if (ss->__DEPRECATED_clear_css_refs)
+		set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
 }
 
 static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
@@ -3973,6 +3996,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	if (err < 0)
 		goto err_remove;
 
+	/* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
+	for_each_subsys(root, ss)
+		if (!ss->__DEPRECATED_clear_css_refs)
+			dget(dentry);
+
 	/* The cgroup directory was pre-locked for us */
 	BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
 
@@ -4062,8 +4090,24 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
  * Atomically mark all (or else none) of the cgroup's CSS objects as
  * CSS_REMOVED. Return true on success, or false if the cgroup has
  * busy subsystems. Call with cgroup_mutex held
+ *
+ * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
+ * not, cgroup removal behaves differently.
+ *
+ * If clear is set, css refcnt for the subsystem should be zero before
+ * cgroup removal can be committed.  This is implemented by
+ * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
+ * called multiple times until all css refcnts reach zero and is allowed to
+ * veto removal on any invocation.  This behavior is deprecated and will be
+ * removed as soon as the existing user (memcg) is updated.
+ *
+ * If clear is not set, each css holds an extra reference to the cgroup's
+ * dentry and cgroup removal proceeds regardless of css refs.
+ * ->pre_destroy() will be called at least once and is not allowed to fail.
+ * On the last put of each css, whenever that may be, the extra dentry ref
+ * is put so that dentry destruction happens only after all css's are
+ * released.
  */
-
 static int cgroup_clear_css_refs(struct cgroup *cgrp)
 {
 	struct cgroup_subsys *ss;
@@ -4074,14 +4118,17 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp)
 
 	/*
 	 * Block new css_tryget() by deactivating refcnt.  If all refcnts
-	 * were 1 at the moment of deactivation, we succeeded.
+	 * for subsystems w/ clear_css_refs set were 1 at the moment of
+	 * deactivation, we succeeded.
 	 */
 	for_each_subsys(cgrp->root, ss) {
 		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
 
 		WARN_ON(atomic_read(&css->refcnt) < 0);
 		atomic_add(CSS_DEACT_BIAS, &css->refcnt);
-		failed |= css_refcnt(css) != 1;
+
+		if (ss->__DEPRECATED_clear_css_refs)
+			failed |= css_refcnt(css) != 1;
 	}
 
 	/*
@@ -4917,12 +4964,18 @@ void __css_put(struct cgroup_subsys_state *css)
 
 	rcu_read_lock();
 	atomic_dec(&css->refcnt);
-	if (css_refcnt(css) == 1) {
+	switch (css_refcnt(css)) {
+	case 1:
 		if (notify_on_release(cgrp)) {
 			set_bit(CGRP_RELEASABLE, &cgrp->flags);
 			check_for_release(cgrp);
 		}
 		cgroup_wakeup_rmdir_waiter(cgrp);
+		break;
+	case 0:
+		if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags))
+			schedule_work(&css->dput_work);
+		break;
 	}
 	rcu_read_unlock();
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bef114258bb..d28359cd6b5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5635,6 +5635,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
 	.base_cftypes = mem_cgroup_files,
 	.early_init = 0,
 	.use_id = 1,
+	.__DEPRECATED_clear_css_refs = true,
 };
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
-- 
cgit v1.2.3


From 1d62e43657c63a858560c98069706c705d20505d Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@parallels.com>
Date: Mon, 9 Apr 2012 19:36:33 -0300
Subject: cgroup: pass struct mem_cgroup instead of struct cgroup to socket
 memcg

The only reason cgroup was used, was to be consistent with the populate()
interface. Now that we're getting rid of it, not only we no longer need
it, but we also *can't* call it this way.

Since we will no longer rely on populate(), this will be called from
create(). During create, the association between struct mem_cgroup
and struct cgroup does not yet exist, since cgroup internals hasn't
yet initialized its bookkeeping. This means we would not be able
to draw the memcg pointer from the cgroup pointer in these
functions, which is highly undesirable.

Signed-off-by: Glauber Costa <glommer@parallels.com>
Acked-by: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
CC: Li Zefan <lizefan@huawei.com>
CC: Johannes Weiner <hannes@cmpxchg.org>
CC: Michal Hocko <mhocko@suse.cz>
---
 include/net/sock.h           | 12 ++++++------
 include/net/tcp_memcontrol.h |  4 ++--
 mm/memcontrol.c              | 24 +++++++++---------------
 net/core/sock.c              | 10 +++++-----
 net/ipv4/tcp_memcontrol.c    |  6 ++----
 5 files changed, 24 insertions(+), 32 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index a6ba1f8871f..b3ebe6b3e7d 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -70,16 +70,16 @@
 struct cgroup;
 struct cgroup_subsys;
 #ifdef CONFIG_NET
-int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss);
-void mem_cgroup_sockets_destroy(struct cgroup *cgrp);
+int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss);
+void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg);
 #else
 static inline
-int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss)
+int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 {
 	return 0;
 }
 static inline
-void mem_cgroup_sockets_destroy(struct cgroup *cgrp)
+void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
 {
 }
 #endif
@@ -900,9 +900,9 @@ struct proto {
 	 * This function has to setup any files the protocol want to
 	 * appear in the kmem cgroup filesystem.
 	 */
-	int			(*init_cgroup)(struct cgroup *cgrp,
+	int			(*init_cgroup)(struct mem_cgroup *memcg,
 					       struct cgroup_subsys *ss);
-	void			(*destroy_cgroup)(struct cgroup *cgrp);
+	void			(*destroy_cgroup)(struct mem_cgroup *memcg);
 	struct cg_proto		*(*proto_cgroup)(struct mem_cgroup *memcg);
 #endif
 };
diff --git a/include/net/tcp_memcontrol.h b/include/net/tcp_memcontrol.h
index 48410ff25c9..7df18bc43a9 100644
--- a/include/net/tcp_memcontrol.h
+++ b/include/net/tcp_memcontrol.h
@@ -12,8 +12,8 @@ struct tcp_memcontrol {
 };
 
 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg);
-int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss);
-void tcp_destroy_cgroup(struct cgroup *cgrp);
+int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss);
+void tcp_destroy_cgroup(struct mem_cgroup *memcg);
 unsigned long long tcp_max_memory(const struct mem_cgroup *memcg);
 void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx);
 #endif /* _TCP_MEMCG_H */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d28359cd6b5..785c3236707 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4640,29 +4640,22 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
 #endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
-static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
+static int register_kmem_files(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 {
-	/*
-	 * Part of this would be better living in a separate allocation
-	 * function, leaving us with just the cgroup tree population work.
-	 * We, however, depend on state such as network's proto_list that
-	 * is only initialized after cgroup creation. I found the less
-	 * cumbersome way to deal with it to defer it all to populate time
-	 */
-	return mem_cgroup_sockets_init(cont, ss);
+	return mem_cgroup_sockets_init(memcg, ss);
 };
 
-static void kmem_cgroup_destroy(struct cgroup *cont)
+static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
 {
-	mem_cgroup_sockets_destroy(cont);
+	mem_cgroup_sockets_destroy(memcg);
 }
 #else
-static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
+static int register_kmem_files(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 {
 	return 0;
 }
 
-static void kmem_cgroup_destroy(struct cgroup *cont)
+static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
 {
 }
 #endif
@@ -5034,7 +5027,7 @@ static void mem_cgroup_destroy(struct cgroup *cont)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 
-	kmem_cgroup_destroy(cont);
+	kmem_cgroup_destroy(memcg);
 
 	mem_cgroup_put(memcg);
 }
@@ -5042,7 +5035,8 @@ static void mem_cgroup_destroy(struct cgroup *cont)
 static int mem_cgroup_populate(struct cgroup_subsys *ss,
 				struct cgroup *cont)
 {
-	return register_kmem_files(cont, ss);
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	return register_kmem_files(memcg, ss);
 }
 
 #ifdef CONFIG_MMU
diff --git a/net/core/sock.c b/net/core/sock.c
index b2e14c07d92..878f7447cf6 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -140,7 +140,7 @@ static DEFINE_MUTEX(proto_list_mutex);
 static LIST_HEAD(proto_list);
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
-int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss)
+int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 {
 	struct proto *proto;
 	int ret = 0;
@@ -148,7 +148,7 @@ int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss)
 	mutex_lock(&proto_list_mutex);
 	list_for_each_entry(proto, &proto_list, node) {
 		if (proto->init_cgroup) {
-			ret = proto->init_cgroup(cgrp, ss);
+			ret = proto->init_cgroup(memcg, ss);
 			if (ret)
 				goto out;
 		}
@@ -159,19 +159,19 @@ int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss)
 out:
 	list_for_each_entry_continue_reverse(proto, &proto_list, node)
 		if (proto->destroy_cgroup)
-			proto->destroy_cgroup(cgrp);
+			proto->destroy_cgroup(memcg);
 	mutex_unlock(&proto_list_mutex);
 	return ret;
 }
 
-void mem_cgroup_sockets_destroy(struct cgroup *cgrp)
+void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
 {
 	struct proto *proto;
 
 	mutex_lock(&proto_list_mutex);
 	list_for_each_entry_reverse(proto, &proto_list, node)
 		if (proto->destroy_cgroup)
-			proto->destroy_cgroup(cgrp);
+			proto->destroy_cgroup(memcg);
 	mutex_unlock(&proto_list_mutex);
 }
 #endif
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index 8f1753defa5..151703791bb 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -18,7 +18,7 @@ static void memcg_tcp_enter_memory_pressure(struct sock *sk)
 }
 EXPORT_SYMBOL(memcg_tcp_enter_memory_pressure);
 
-int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
+int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 {
 	/*
 	 * The root cgroup does not use res_counters, but rather,
@@ -28,7 +28,6 @@ int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
 	struct res_counter *res_parent = NULL;
 	struct cg_proto *cg_proto, *parent_cg;
 	struct tcp_memcontrol *tcp;
-	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
 	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
 	struct net *net = current->nsproxy->net_ns;
 
@@ -61,9 +60,8 @@ int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
 }
 EXPORT_SYMBOL(tcp_init_cgroup);
 
-void tcp_destroy_cgroup(struct cgroup *cgrp)
+void tcp_destroy_cgroup(struct mem_cgroup *memcg)
 {
-	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
 	struct cg_proto *cg_proto;
 	struct tcp_memcontrol *tcp;
 	u64 val;
-- 
cgit v1.2.3


From cbe128e348e5994516304f94865ff90c40c1c5ae Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@parallels.com>
Date: Mon, 9 Apr 2012 19:36:34 -0300
Subject: cgroup: get rid of populate for memcg

The last man standing justifying the need for populate() is the
sock memcg initialization functions. Now that we are able to pass
a struct mem_cgroup instead of a struct cgroup to the socket
initialization, there is nothing that stops us from initializing
everything in create().

Signed-off-by: Glauber Costa <glommer@parallels.com>
Acked-by: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
CC: Li Zefan <lizefan@huawei.com>
CC: Johannes Weiner <hannes@cmpxchg.org>
CC: Michal Hocko <mhocko@suse.cz>
---
 mm/memcontrol.c | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 785c3236707..901bb03f2ae 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4640,7 +4640,7 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
 #endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
-static int register_kmem_files(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
+static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 {
 	return mem_cgroup_sockets_init(memcg, ss);
 };
@@ -4650,7 +4650,7 @@ static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
 	mem_cgroup_sockets_destroy(memcg);
 }
 #else
-static int register_kmem_files(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
+static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 {
 	return 0;
 }
@@ -5010,6 +5010,17 @@ mem_cgroup_create(struct cgroup *cont)
 	memcg->move_charge_at_immigrate = 0;
 	mutex_init(&memcg->thresholds_lock);
 	spin_lock_init(&memcg->move_lock);
+
+	error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
+	if (error) {
+		/*
+		 * We call put now because our (and parent's) refcnts
+		 * are already in place. mem_cgroup_put() will internally
+		 * call __mem_cgroup_free, so return directly
+		 */
+		mem_cgroup_put(memcg);
+		return ERR_PTR(error);
+	}
 	return &memcg->css;
 free_out:
 	__mem_cgroup_free(memcg);
@@ -5032,13 +5043,6 @@ static void mem_cgroup_destroy(struct cgroup *cont)
 	mem_cgroup_put(memcg);
 }
 
-static int mem_cgroup_populate(struct cgroup_subsys *ss,
-				struct cgroup *cont)
-{
-	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
-	return register_kmem_files(memcg, ss);
-}
-
 #ifdef CONFIG_MMU
 /* Handlers for move charge at task migration. */
 #define PRECHARGE_COUNT_AT_ONCE	256
@@ -5622,7 +5626,6 @@ struct cgroup_subsys mem_cgroup_subsys = {
 	.create = mem_cgroup_create,
 	.pre_destroy = mem_cgroup_pre_destroy,
 	.destroy = mem_cgroup_destroy,
-	.populate = mem_cgroup_populate,
 	.can_attach = mem_cgroup_can_attach,
 	.cancel_attach = mem_cgroup_cancel_attach,
 	.attach = mem_cgroup_move_task,
-- 
cgit v1.2.3


From 86f82d561864e902c70282b6f17cf590c0f34691 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 10 Apr 2012 10:16:36 -0700
Subject: cgroup: remove cgroup_subsys->populate()

With memcg converted, cgroup_subsys->populate() doesn't have any user
left.  Remove it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 1 -
 kernel/cgroup.c        | 3 ---
 2 files changed, 4 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 565c8034e6c..d3f5fba2c15 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -472,7 +472,6 @@ struct cgroup_subsys {
 	void (*fork)(struct task_struct *task);
 	void (*exit)(struct cgroup *cgrp, struct cgroup *old_cgrp,
 		     struct task_struct *task);
-	int (*populate)(struct cgroup_subsys *ss, struct cgroup *cgrp);
 	void (*post_clone)(struct cgroup *cgrp);
 	void (*bind)(struct cgroup *root);
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2905977e0f3..b2f203f25ec 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3842,9 +3842,6 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
 	for_each_subsys(cgrp->root, ss) {
 		struct cftype_set *set;
 
-		if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
-			return err;
-
 		list_for_each_entry(set, &ss->cftsets, node)
 			cgroup_addrm_files(cgrp, ss, set->cfts, true);
 	}
-- 
cgit v1.2.3


From c4c27fbdda4e8ba87806c415b6d15266b07bce4b Mon Sep 17 00:00:00 2001
From: Mike Galbraith <mgalbraith@suse.de>
Date: Sat, 21 Apr 2012 09:13:46 +0200
Subject: cgroups: disallow attaching kthreadd or PF_THREAD_BOUND threads

Allowing kthreadd to be moved to a non-root group makes no sense, it being
a global resource, and needlessly leads unsuspecting users toward trouble.

1. An RT workqueue worker thread spawned in a task group with no rt_runtime
allocated is not schedulable.  Simple user error, but harmful to the box.

2. A worker thread which acquires PF_THREAD_BOUND can never leave a cpuset,
rendering the cpuset immortal.

Save the user some unexpected trouble, just say no.

Signed-off-by: Mike Galbraith <mgalbraith@suse.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b2f203f25ec..ad8eae5bb80 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -60,6 +60,7 @@
 #include <linux/eventfd.h>
 #include <linux/poll.h>
 #include <linux/flex_array.h> /* used in cgroup_attach_proc */
+#include <linux/kthread.h>
 
 #include <linux/atomic.h>
 
@@ -2225,6 +2226,18 @@ retry_find_task:
 
 	if (threadgroup)
 		tsk = tsk->group_leader;
+
+	/*
+	 * Workqueue threads may acquire PF_THREAD_BOUND and become
+	 * trapped in a cpuset, or RT worker may be born in a cgroup
+	 * with no rt_runtime allocated.  Just say no.
+	 */
+	if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) {
+		ret = -EINVAL;
+		rcu_read_unlock();
+		goto out_unlock_cgroup;
+	}
+
 	get_task_struct(tsk);
 	rcu_read_unlock();
 
-- 
cgit v1.2.3


From 4d8438f044d8aaac6fbba98316ba484dabea397d Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 25 Apr 2012 01:11:35 +0200
Subject: res_counter: Merge res_counter_charge and res_counter_charge_nofail

These two functions do almost the same thing and duplicate some code.
Merge their implementation into a single common function.
res_counter_charge_locked() takes one more parameter but it doesn't seem
to be used outside res_counter.c yet anyway.

There is no (intended) change in the behaviour.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Glauber Costa <glommer@parallels.com>
Acked-by: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Li Zefan <lizefan@huawei.com>
---
 Documentation/cgroups/resource_counter.txt |  4 +-
 include/linux/res_counter.h                |  2 +-
 kernel/res_counter.c                       | 73 ++++++++++++++----------------
 3 files changed, 37 insertions(+), 42 deletions(-)

diff --git a/Documentation/cgroups/resource_counter.txt b/Documentation/cgroups/resource_counter.txt
index 95b24d766ea..f3c4ec3626a 100644
--- a/Documentation/cgroups/resource_counter.txt
+++ b/Documentation/cgroups/resource_counter.txt
@@ -77,11 +77,11 @@ to work with it.
 	where the charging failed.
 
  d. int res_counter_charge_locked
-			(struct res_counter *rc, unsigned long val)
+			(struct res_counter *rc, unsigned long val, bool force)
 
 	The same as res_counter_charge(), but it must not acquire/release the
 	res_counter->lock internally (it must be called with res_counter->lock
-	held).
+	held). The force parameter indicates whether we can bypass the limit.
 
  e. void res_counter_uncharge[_locked]
 			(struct res_counter *rc, unsigned long val)
diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
index da81af086ea..fb201896a8b 100644
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -116,7 +116,7 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent);
  */
 
 int __must_check res_counter_charge_locked(struct res_counter *counter,
-		unsigned long val);
+					   unsigned long val, bool force);
 int __must_check res_counter_charge(struct res_counter *counter,
 		unsigned long val, struct res_counter **limit_fail_at);
 int __must_check res_counter_charge_nofail(struct res_counter *counter,
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index d508363858b..07a29923aba 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -22,75 +22,70 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent)
 	counter->parent = parent;
 }
 
-int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
+int res_counter_charge_locked(struct res_counter *counter, unsigned long val,
+			      bool force)
 {
+	int ret = 0;
+
 	if (counter->usage + val > counter->limit) {
 		counter->failcnt++;
-		return -ENOMEM;
+		ret = -ENOMEM;
+		if (!force)
+			return ret;
 	}
 
 	counter->usage += val;
-	if (counter->usage > counter->max_usage)
+	if (!force && counter->usage > counter->max_usage)
 		counter->max_usage = counter->usage;
-	return 0;
+	return ret;
 }
 
-int res_counter_charge(struct res_counter *counter, unsigned long val,
-			struct res_counter **limit_fail_at)
+static int __res_counter_charge(struct res_counter *counter, unsigned long val,
+				struct res_counter **limit_fail_at, bool force)
 {
-	int ret;
+	int ret, r;
 	unsigned long flags;
 	struct res_counter *c, *u;
 
+	r = ret = 0;
 	*limit_fail_at = NULL;
 	local_irq_save(flags);
 	for (c = counter; c != NULL; c = c->parent) {
 		spin_lock(&c->lock);
-		ret = res_counter_charge_locked(c, val);
+		r = res_counter_charge_locked(c, val, force);
 		spin_unlock(&c->lock);
-		if (ret < 0) {
+		if (r < 0 && !ret) {
+			ret = r;
 			*limit_fail_at = c;
-			goto undo;
+			if (!force)
+				break;
 		}
 	}
-	ret = 0;
-	goto done;
-undo:
-	for (u = counter; u != c; u = u->parent) {
-		spin_lock(&u->lock);
-		res_counter_uncharge_locked(u, val);
-		spin_unlock(&u->lock);
+
+	if (ret < 0 && !force) {
+		for (u = counter; u != c; u = u->parent) {
+			spin_lock(&u->lock);
+			res_counter_uncharge_locked(u, val);
+			spin_unlock(&u->lock);
+		}
 	}
-done:
 	local_irq_restore(flags);
+
 	return ret;
 }
 
+int res_counter_charge(struct res_counter *counter, unsigned long val,
+			struct res_counter **limit_fail_at)
+{
+	return __res_counter_charge(counter, val, limit_fail_at, false);
+}
+
 int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
 			      struct res_counter **limit_fail_at)
 {
-	int ret, r;
-	unsigned long flags;
-	struct res_counter *c;
-
-	r = ret = 0;
-	*limit_fail_at = NULL;
-	local_irq_save(flags);
-	for (c = counter; c != NULL; c = c->parent) {
-		spin_lock(&c->lock);
-		r = res_counter_charge_locked(c, val);
-		if (r)
-			c->usage += val;
-		spin_unlock(&c->lock);
-		if (r < 0 && ret == 0) {
-			*limit_fail_at = c;
-			ret = r;
-		}
-	}
-	local_irq_restore(flags);
-
-	return ret;
+	return __res_counter_charge(counter, val, limit_fail_at, true);
 }
+
 void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
 {
 	if (WARN_ON(counter->usage < val))
-- 
cgit v1.2.3


From 0d4dde1ac9a5af74ac76c6ab90557d1ae7b8f5d8 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 25 Apr 2012 01:11:36 +0200
Subject: res_counter: Account max_usage when calling
 res_counter_charge_nofail()

Updating max_usage is something one would expect when we reach
a new maximum usage value even when we do this by forcing through
the limit with res_counter_charge_nofail().

(Whether we want to account failcnt when we force through the limit
is another debate).

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Glauber Costa <glommer@parallels.com>
Acked-by: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Li Zefan <lizefan@huawei.com>
---
 kernel/res_counter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 07a29923aba..bebe2b170d4 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -35,7 +35,7 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val,
 	}
 
 	counter->usage += val;
-	if (!force && counter->usage > counter->max_usage)
+	if (counter->usage > counter->max_usage)
 		counter->max_usage = counter->usage;
 	return ret;
 }
-- 
cgit v1.2.3