From 96aa1b419d47286db446f292cf898bb1a8b27f24 Mon Sep 17 00:00:00 2001 From: Ciju Rajan K Date: Mon, 23 Aug 2010 10:56:30 +0200 Subject: blkio: Fix return code for mkdir calls If the cgroup hierarchy for blkio control groups is deeper than two levels, the kernel should not allow the creation of further levels. The mkdir system call does not expect EINVAL as a return value. This patch replaces EINVAL with the more appropriate EPERM. Signed-off-by: Ciju Rajan K Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index a6809645d21..2fef1ef931a 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -966,7 +966,7 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup) /* Currently we do not support hierarchy deeper than two level (0,1) */ if (parent != cgroup->top_cgroup) - return ERR_PTR(-EINVAL); + return ERR_PTR(-EPERM); blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); if (!blkcg) -- cgit v1.2.3 From b6508c1618e7aab085f191efb41b7b019a94ea38 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 23 Aug 2010 12:23:33 +0200 Subject: cfq-iosched: Do not idle if slice_idle=0 Do not idle on either the cfq queue or the service tree if slice_idle=0; the user does not want any queue or service tree idling. Currently, even if slice_idle=0, we wait for the request to finish before expiring the queue, and that can lead to lower queue depths. Acked-by: Jeff Moyer Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index eb4086f7dfe..8830569542c 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1839,6 +1839,9 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(!service_tree); BUG_ON(!service_tree->count); + if (!cfqd->cfq_slice_idle) + return false; + /* We never do for idle class queues. */ if (prio == IDLE_WORKLOAD) return false; @@ -1879,7 +1882,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) /* * idle is disabled, either manually or by past process history */ - if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq)) + if (!cfq_should_idle(cfqd, cfqq)) return; /* -- cgit v1.2.3 From 02b35081fc98f681411586d3acf9eaad8b8f6e07 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 23 Aug 2010 12:23:53 +0200 Subject: cfq-iosched: Do group share accounting in IOPS when slice_idle=0 o Implement another CFQ mode where we charge a group in terms of the number of requests dispatched instead of measuring time. Measuring in terms of time is not possible when we are driving deeper queue depths and there are requests from multiple cfq queues in the request queue. o This mode currently gets activated if one sets slice_idle=0 and the associated disk supports NCQ. Again, the idea is that on an NCQ disk with idling disabled most of the queues will dispatch 1 or more requests, then cfq queue expiry happens, and we don't have a way to measure time. So start providing fairness in terms of IOPS. o Currently IOPS mode works only with cfq group scheduling. CFQ follows different scheduling algorithms for queue and group scheduling. These IOPS stats are used only for group scheduling, hence in non-group mode nothing should change. 
o For CFQ group scheduling one can disable slice idling so that we don't idle on the queue and can drive deeper request queue depths (achieving better throughput); at the same time group idle is enabled, so one should still get service differentiation among groups. Signed-off-by: Vivek Goyal Acked-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 8830569542c..3fc6be110c1 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -378,6 +378,21 @@ CFQ_CFQQ_FNS(wait_busy); &cfqg->service_trees[i][j]: NULL) \ +static inline bool iops_mode(struct cfq_data *cfqd) +{ + /* + * If we are not idling on queues and it is a NCQ drive, parallel + * execution of requests is on and measuring time is not possible + * in most of the cases until and unless we drive shallower queue + * depths and that becomes a performance bottleneck. In such cases + * switch to start providing fairness in terms of number of IOs. + */ + if (!cfqd->cfq_slice_idle && cfqd->hw_tag) + return true; + else + return false; +} + static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) { if (cfq_class_idle(cfqq)) @@ -906,7 +921,6 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) slice_used = cfqq->allocated_slice; } - cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u", slice_used); return slice_used; } @@ -914,19 +928,21 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, struct cfq_queue *cfqq) { struct cfq_rb_root *st = &cfqd->grp_service_tree; - unsigned int used_sl, charge_sl; + unsigned int used_sl, charge; int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) - cfqg->service_tree_idle.count; BUG_ON(nr_sync < 0); - used_sl = charge_sl = cfq_cfqq_slice_usage(cfqq); + used_sl = charge = cfq_cfqq_slice_usage(cfqq); - if (!cfq_cfqq_sync(cfqq) && !nr_sync) - charge_sl = cfqq->allocated_slice; + if (iops_mode(cfqd)) + charge = cfqq->slice_dispatch; + else if (!cfq_cfqq_sync(cfqq) && !nr_sync) + charge = cfqq->allocated_slice; /* Can't update vdisktime while group is on service tree */ cfq_rb_erase(&cfqg->rb_node, st); - cfqg->vdisktime += cfq_scale_slice(charge_sl, cfqg); + cfqg->vdisktime += cfq_scale_slice(charge, cfqg); __cfq_group_service_tree_add(st, cfqg); /* This group is being expired. Save the context */ @@ -940,6 +956,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, st->min_vdisktime); + cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u", + used_sl, cfqq->slice_dispatch, charge, iops_mode(cfqd)); cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl); cfq_blkiocg_set_start_empty_time(&cfqg->blkg); } -- cgit v1.2.3 From 80bdf0c78fff075dfa21576273c8b0c7db22bdfe Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 23 Aug 2010 12:24:26 +0200 Subject: cfq-iosched: Implement tunable group_idle o Implement a new tunable group_idle, which allows idling on the group instead of on a cfq queue. Hence one can set slice_idle = 0 and not idle on the individual queues but still idle on the group. This way, on fast storage we can get fairness between groups while at the same time overall throughput improves. 
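As a practical aside, the slice_idle and group_idle knobs discussed above are exposed through the cfq iosched sysfs directory. The following minimal userspace sketch is an illustration only — the device name "sdb" and the exact sysfs paths are assumptions about a typical cfq-managed disk, not part of these patches — and shows how the combination described in the previous paragraph (queue idling off, group idling left on) would be configured:

#include <stdio.h>

/* Write a single value to a cfq tunable; returns 0 on success. */
static int set_tunable(const char *path, const char *value)
{
    FILE *f = fopen(path, "w");

    if (!f) {
        perror(path);
        return -1;
    }
    fprintf(f, "%s\n", value);
    return fclose(f);
}

int main(void)
{
    /* Hypothetical device: replace "sdb" with the disk under test. */
    const char *base = "/sys/block/sdb/queue/iosched";
    char path[128];

    /* Disable per-queue idling; on an NCQ disk this also switches
     * group accounting to IOPS per the patch above. */
    snprintf(path, sizeof(path), "%s/slice_idle", base);
    set_tunable(path, "0");

    /* Keep group idling (value in milliseconds as shown by sysfs) so
     * service differentiation between groups is preserved. */
    snprintf(path, sizeof(path), "%s/group_idle", base);
    set_tunable(path, "8");

    return 0;
}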
Signed-off-by: Vivek Goyal Acked-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 65 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 56 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 3fc6be110c1..85e48192754 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -30,6 +30,7 @@ static const int cfq_slice_sync = HZ / 10; static int cfq_slice_async = HZ / 25; static const int cfq_slice_async_rq = 2; static int cfq_slice_idle = HZ / 125; +static int cfq_group_idle = HZ / 125; static const int cfq_target_latency = HZ * 3/10; /* 300 ms */ static const int cfq_hist_divisor = 4; @@ -198,6 +199,8 @@ struct cfq_group { struct hlist_node cfqd_node; atomic_t ref; #endif + /* number of requests that are on the dispatch list or inside driver */ + int dispatched; }; /* @@ -271,6 +274,7 @@ struct cfq_data { unsigned int cfq_slice[2]; unsigned int cfq_slice_async_rq; unsigned int cfq_slice_idle; + unsigned int cfq_group_idle; unsigned int cfq_latency; unsigned int cfq_group_isolation; @@ -1884,7 +1888,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) { struct cfq_queue *cfqq = cfqd->active_queue; struct cfq_io_context *cic; - unsigned long sl; + unsigned long sl, group_idle = 0; /* * SSD device without seek penalty, disable idling. But only do so @@ -1900,8 +1904,13 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) /* * idle is disabled, either manually or by past process history */ - if (!cfq_should_idle(cfqd, cfqq)) - return; + if (!cfq_should_idle(cfqd, cfqq)) { + /* no queue idling. Check for group idling */ + if (cfqd->cfq_group_idle) + group_idle = cfqd->cfq_group_idle; + else + return; + } /* * still active requests from this queue, don't idle @@ -1928,13 +1937,21 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) return; } + /* There are other queues in the group, don't do group idle */ + if (group_idle && cfqq->cfqg->nr_cfqq > 1) + return; + cfq_mark_cfqq_wait_request(cfqq); - sl = cfqd->cfq_slice_idle; + if (group_idle) + sl = cfqd->cfq_group_idle; + else + sl = cfqd->cfq_slice_idle; mod_timer(&cfqd->idle_slice_timer, jiffies + sl); cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg); - cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); + cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl, + group_idle ? 1 : 0); } /* @@ -1950,6 +1967,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq); cfq_remove_request(rq); cfqq->dispatched++; + (RQ_CFQG(rq))->dispatched++; elv_dispatch_sort(q, rq); cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; @@ -2219,7 +2237,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) cfqq = NULL; goto keep_queue; } else - goto expire; + goto check_group_idle; } /* @@ -2247,8 +2265,23 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) * flight or is idling for a new request, allow either of these * conditions to happen (or time out) before selecting a new queue. */ - if (timer_pending(&cfqd->idle_slice_timer) || - (cfqq->dispatched && cfq_should_idle(cfqd, cfqq))) { + if (timer_pending(&cfqd->idle_slice_timer)) { + cfqq = NULL; + goto keep_queue; + } + + if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { + cfqq = NULL; + goto keep_queue; + } + + /* + * If group idle is enabled and there are requests dispatched from + * this group, wait for requests to complete. 
+ */ +check_group_idle: + if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 + && cfqq->cfqg->dispatched) { cfqq = NULL; goto keep_queue; } @@ -3396,6 +3429,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) WARN_ON(!cfqq->dispatched); cfqd->rq_in_driver--; cfqq->dispatched--; + (RQ_CFQG(rq))->dispatched--; cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg, rq_start_time_ns(rq), rq_io_start_time_ns(rq), rq_data_dir(rq), rq_is_sync(rq)); @@ -3425,7 +3459,10 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) * the queue. */ if (cfq_should_wait_busy(cfqd, cfqq)) { - cfqq->slice_end = jiffies + cfqd->cfq_slice_idle; + unsigned long extend_sl = cfqd->cfq_slice_idle; + if (!cfqd->cfq_slice_idle) + extend_sl = cfqd->cfq_group_idle; + cfqq->slice_end = jiffies + extend_sl; cfq_mark_cfqq_wait_busy(cfqq); cfq_log_cfqq(cfqd, cfqq, "will busy wait"); } @@ -3871,6 +3908,7 @@ static void *cfq_init_queue(struct request_queue *q) cfqd->cfq_slice[1] = cfq_slice_sync; cfqd->cfq_slice_async_rq = cfq_slice_async_rq; cfqd->cfq_slice_idle = cfq_slice_idle; + cfqd->cfq_group_idle = cfq_group_idle; cfqd->cfq_latency = 1; cfqd->cfq_group_isolation = 0; cfqd->hw_tag = -1; @@ -3943,6 +3981,7 @@ SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1); SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0); SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0); SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1); +SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1); SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); @@ -3975,6 +4014,7 @@ STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0); STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0); STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1); +STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1); STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1); STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, @@ -3996,6 +4036,7 @@ static struct elv_fs_entry cfq_attrs[] = { CFQ_ATTR(slice_async), CFQ_ATTR(slice_async_rq), CFQ_ATTR(slice_idle), + CFQ_ATTR(group_idle), CFQ_ATTR(low_latency), CFQ_ATTR(group_isolation), __ATTR_NULL @@ -4049,6 +4090,12 @@ static int __init cfq_init(void) if (!cfq_slice_idle) cfq_slice_idle = 1; +#ifdef CONFIG_CFQ_GROUP_IOSCHED + if (!cfq_group_idle) + cfq_group_idle = 1; +#else + cfq_group_idle = 0; +#endif if (cfq_slab_setup()) return -ENOMEM; -- cgit v1.2.3 From c4e7893ebc3a5c507b53f59b9de448db20849944 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 23 Aug 2010 12:25:03 +0200 Subject: cfq-iosched: blktrace print per slice sector stats o Divyesh had gotten rid of this code in the past. I want to re-introduce it back as it helps me a lot during debugging. 
Reviewed-by: Jeff Moyer Reviewed-by: Divyesh Shah Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 85e48192754..f65c6f01c47 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -148,6 +148,8 @@ struct cfq_queue { struct cfq_queue *new_cfqq; struct cfq_group *cfqg; struct cfq_group *orig_cfqg; + /* Number of sectors dispatched from queue in single dispatch round */ + unsigned long nr_sectors; }; /* @@ -960,8 +962,9 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, st->min_vdisktime); - cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u", - used_sl, cfqq->slice_dispatch, charge, iops_mode(cfqd)); + cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u" + " sect=%u", used_sl, cfqq->slice_dispatch, charge, + iops_mode(cfqd), cfqq->nr_sectors); cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl); cfq_blkiocg_set_start_empty_time(&cfqg->blkg); } @@ -1609,6 +1612,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, cfqq->allocated_slice = 0; cfqq->slice_end = 0; cfqq->slice_dispatch = 0; + cfqq->nr_sectors = 0; cfq_clear_cfqq_wait_request(cfqq); cfq_clear_cfqq_must_dispatch(cfqq); @@ -1971,6 +1975,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) elv_dispatch_sort(q, rq); cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; + cfqq->nr_sectors += blk_rq_sectors(rq); cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq), rq_data_dir(rq), rq_is_sync(rq)); } -- cgit v1.2.3 From c87ffbb812cf6150097a5095b031f4013a8f78a5 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Mon, 23 Aug 2010 12:30:29 +0200 Subject: block: put dev->kobj in blk_register_queue fail path The kernel needs to do a kobject_put() on dev->kobj if elv_register_queue() fails. Signed-off-by: Xiaotian Feng Cc: "Martin K. Petersen" Cc: Stephen Hemminger Cc: Nikanth Karthikesan Cc: David Teigland Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 001ab18078f..0749b89c688 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -511,6 +511,7 @@ int blk_register_queue(struct gendisk *disk) kobject_uevent(&q->kobj, KOBJ_REMOVE); kobject_del(&q->kobj); blk_trace_remove_sysfs(disk_to_dev(disk)); + kobject_put(&dev->kobj); return ret; } -- cgit v1.2.3 From 5e00d1b5b4c10fb839afd5ce61db8e24339454b0 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 12 Aug 2010 14:31:06 +0200 Subject: BLOCK: fix bio.bi_rw handling The result of the bi_rw tests is no longer a bool after commit 74450be1, but the results of such tests are still stored in bools. The value no longer fits there for some compilers (gcc 4.5 here), so either use the !! idiom to get real bools, or use an unsigned long where the result is assigned. 
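The hazard behind this fix is easy to demonstrate outside the kernel. The sketch below is purely illustrative (HIGH_FLAG is a made-up flag bit, not a real REQ_* value); it shows the kind of silent truncation a narrowing assignment can cause when a flag lives in a high bit, and why the !! form, or a wider destination type, is robust:

#include <stdio.h>

/* Illustrative flag above bit 31, as flags can be once they share a
 * 64-bit field with other state. */
#define HIGH_FLAG (1ULL << 36)

int main(void)
{
    unsigned long long rw = HIGH_FLAG;

    /* Narrowing assignment: the set bit is above bit 31, so the
     * stored value becomes 0 and the test is effectively lost. */
    unsigned int truncated = rw & HIGH_FLAG;

    /* !! collapses the test to 0/1 before the assignment, so no
     * information is lost regardless of the destination type. */
    unsigned int normalized = !!(rw & HIGH_FLAG);

    printf("truncated=%u normalized=%u\n", truncated, normalized);
    return 0;
}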
Signed-off-by: Jiri Slaby Cc: Christoph Hellwig Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/blk-core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index ee1a1e7e63c..32a1c123dfb 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1198,9 +1198,9 @@ static int __make_request(struct request_queue *q, struct bio *bio) int el_ret; unsigned int bytes = bio->bi_size; const unsigned short prio = bio_prio(bio); - const bool sync = (bio->bi_rw & REQ_SYNC); - const bool unplug = (bio->bi_rw & REQ_UNPLUG); - const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK; + const bool sync = !!(bio->bi_rw & REQ_SYNC); + const bool unplug = !!(bio->bi_rw & REQ_UNPLUG); + const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK; int rw_flags; if ((bio->bi_rw & REQ_HARDBARRIER) && -- cgit v1.2.3 From 5dd531a03ad721b41911ddb32e6e0481404e7aaf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 23 Aug 2010 13:52:19 +0200 Subject: block: add function call to switch the IO scheduler from a driver Currently drivers must do an elevator_exit() + elevator_init() to switch IO schedulers. There are a few problems with this: - Since commit 1abec4fdbb142e3ccb6ce99832fae42129134a96, elevator_init() requires a zeroed-out q->elevator pointer. The two existing in-kernel users don't do that. - It will only work at initialization time, since using the above two-stage construct does not properly quiesce the queue. So add elevator_change(), which takes care of this, and convert the elv_iosched_store() sysfs interface to use this helper as well. Reported-by: Peter Oberparleiter Reported-by: Kevin Vigor Signed-off-by: Jens Axboe --- block/elevator.c | 44 +++++++++++++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index ec585c9554d..205b09a5bd9 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -1009,18 +1009,19 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) { struct elevator_queue *old_elevator, *e; void *data; + int err; /* * Allocate new elevator */ e = elevator_alloc(q, new_e); if (!e) - return 0; + return -ENOMEM; data = elevator_init_queue(q, e); if (!data) { kobject_put(&e->kobj); - return 0; + return -ENOMEM; } /* @@ -1043,7 +1044,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) __elv_unregister_queue(old_elevator); - if (elv_register_queue(q)) + err = elv_register_queue(q); + if (err) goto fail_register; /* @@ -1056,7 +1058,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name); - return 1; + return 0; fail_register: /* @@ -1071,17 +1073,19 @@ fail_register: queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); spin_unlock_irq(q->queue_lock); - return 0; + return err; } -ssize_t elv_iosched_store(struct request_queue *q, const char *name, - size_t count) +/* + * Switch this queue to the given IO scheduler. 
+ */ +int elevator_change(struct request_queue *q, const char *name) { char elevator_name[ELV_NAME_MAX]; struct elevator_type *e; if (!q->elevator) - return count; + return -ENXIO; strlcpy(elevator_name, name, sizeof(elevator_name)); e = elevator_get(strstrip(elevator_name)); @@ -1092,13 +1096,27 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name, if (!strcmp(elevator_name, q->elevator->elevator_type->elevator_name)) { elevator_put(e); - return count; + return 0; } - if (!elevator_switch(q, e)) - printk(KERN_ERR "elevator: switch to %s failed\n", - elevator_name); - return count; + return elevator_switch(q, e); +} +EXPORT_SYMBOL(elevator_change); + +ssize_t elv_iosched_store(struct request_queue *q, const char *name, + size_t count) +{ + int ret; + + if (!q->elevator) + return count; + + ret = elevator_change(q, name); + if (!ret) + return count; + + printk(KERN_ERR "elevator: switch to %s failed\n", name); + return ret; } ssize_t elv_iosched_show(struct request_queue *q, char *name) -- cgit v1.2.3 From be14eb619108fa8b7120eb2c42d66d5f623ae10e Mon Sep 17 00:00:00 2001 From: Brian King Date: Fri, 10 Sep 2010 09:03:21 +0200 Subject: block: Range check cpu in blk_cpu_to_group While testing CPU DLPAR, the following problem was discovered. We were DLPAR removing the first CPU, which in this case was logical CPUs 0-3. CPUs 0-2 were already marked offline and we were in the process of offlining CPU 3. After marking the CPU inactive and offline in cpu_disable, but before the cpu was completely idle (cpu_die), we ended up in __make_request on CPU 3. There we looked at the topology map to see which CPU to complete the I/O on and found no CPUs in the cpu_sibling_map. This resulted in the block layer setting the completion cpu to be NR_CPUS, which then caused an oops when we tried to complete the I/O. Fix this by sanity checking the value we return from blk_cpu_to_group to be a valid cpu value. 
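To make the sentinel convention explicit, here is a small userspace analogue — not kernel code; NR_CPUS and the plain bitmask stand in for the real cpumask helpers and topology maps — of the check the patch below adds: a "first CPU" lookup on an empty sibling map returns the mask size, and the caller must fall back to the requesting CPU rather than use that out-of-range value:

#include <stdio.h>

#define NR_CPUS 8

/* Stand-in for cpumask_first(): index of the first set bit, or
 * NR_CPUS when the mask is empty. */
static int first_cpu(unsigned int mask)
{
    int cpu;

    for (cpu = 0; cpu < NR_CPUS; cpu++)
        if (mask & (1U << cpu))
            return cpu;
    return NR_CPUS;
}

/* Mirrors the fixed blk_cpu_to_group(): never hand back NR_CPUS. */
static int cpu_to_group(unsigned int sibling_mask, int cpu)
{
    int group = first_cpu(sibling_mask);

    if (group < NR_CPUS)
        return group;
    return cpu;    /* topology map empty: fall back to this cpu */
}

int main(void)
{
    /* CPU 3 is being offlined and its sibling map is already clear. */
    printf("completion group for cpu 3: %d\n", cpu_to_group(0, 3));
    return 0;
}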
Signed-off-by: Brian King Signed-off-by: Jens Axboe --- block/blk.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk.h b/block/blk.h index 6e7dc87141e..d6b911ac002 100644 --- a/block/blk.h +++ b/block/blk.h @@ -142,14 +142,18 @@ static inline int queue_congestion_off_threshold(struct request_queue *q) static inline int blk_cpu_to_group(int cpu) { + int group = NR_CPUS; #ifdef CONFIG_SCHED_MC const struct cpumask *mask = cpu_coregroup_mask(cpu); - return cpumask_first(mask); + group = cpumask_first(mask); #elif defined(CONFIG_SCHED_SMT) - return cpumask_first(topology_thread_cpumask(cpu)); + group = cpumask_first(topology_thread_cpumask(cpu)); #else return cpu; #endif + if (likely(group < NR_CPUS)) + return group; + return cpu; } /* -- cgit v1.2.3 From a45dc2d2b8d1afa57c91dcfac224e50ffcd3f805 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Mon, 13 Sep 2010 21:32:19 +0200 Subject: block: fix blk_rq_map_kern bio direction flag This bug was introduced in 7b6d91daee5cac6402186ff224c3af39d79f4a0e "block: unify flags for struct bio and struct request" Cc: Boaz Harrosh Signed-off-by: Benny Halevy Signed-off-by: Jens Axboe --- block/blk-map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-map.c b/block/blk-map.c index c65d7593f7f..ade0a08c909 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -307,7 +307,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, return PTR_ERR(bio); if (rq_data_dir(rq) == WRITE) - bio->bi_rw |= (1 << REQ_WRITE); + bio->bi_rw |= REQ_WRITE; if (do_copy) rq->cmd_flags |= REQ_COPY_USER; -- cgit v1.2.3 From 180be2a0420a0d680285e9352c0db85016b9ed2a Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Tue, 14 Sep 2010 08:47:11 +0200 Subject: cfq-iosched: fix a kernel OOPs when usb key is inserted Mike reported a kernel crash when a usb key hotplug is performed while all kernel threads are not in the root cgroup and are running in one of the child cgroups of the blkio controller. BUG: unable to handle kernel NULL pointer dereference at 0000002c IP: [] cfq_get_queue+0x232/0x412 *pde = 00000000 Oops: 0000 [#1] PREEMPT last sysfs file: /sys/devices/pci0000:00/0000:00:1d.7/usb2/2-1/2-1:1.0/host3/scsi_host/host3/uevent [..] Pid: 30039, comm: scsi_scan_3 Not tainted 2.6.35.2-fg.roam #1 Volvi2 /Aspire 4315 EIP: 0060:[] EFLAGS: 00010086 CPU: 0 EIP is at cfq_get_queue+0x232/0x412 EAX: f705f9c0 EBX: e977abac ECX: 00000000 EDX: 00000000 ESI: f00da400 EDI: f00da4ec EBP: e977a800 ESP: dff8fd00 DS: 007b ES: 007b FS: 0000 GS: 0000 SS: 0068 Process scsi_scan_3 (pid: 30039, ti=dff8e000 task=f6b6c9a0 task.ti=dff8e000) Stack: 00000000 00000000 00000001 01ff0000 f00da508 00000000 f00da524 f00da540 <0> e7994940 dd631750 f705f9c0 e977a820 e977ac44 f00da4d0 00000001 f6b6c9a0 <0> 00000010 00008010 0000000b 00000000 00000001 e977a800 dd76fac0 00000246 Call Trace: [] ? cfq_set_request+0x228/0x34c [] ? cfq_set_request+0x0/0x34c [] ? elv_set_request+0xf/0x1c [] ? get_request+0x1ad/0x22f [] ? get_request_wait+0x1f/0x11a [] ? kvasprintf+0x33/0x3b [] ? scsi_execute+0x1d/0x103 [] ? scsi_execute_req+0x58/0x83 [] ? scsi_probe_and_add_lun+0x188/0x7c2 [] ? attribute_container_add_device+0x15/0xfa [] ? kobject_get+0xf/0x13 [] ? get_device+0x10/0x14 [] ? scsi_alloc_target+0x217/0x24d [] ? __scsi_scan_target+0x95/0x480 [] ? dequeue_entity+0x14/0x1fe [] ? update_curr+0x165/0x1ab [] ? update_curr+0x165/0x1ab [] ? scsi_scan_channel+0x4a/0x76 [] ? scsi_scan_host_selected+0x77/0xad [] ? 
do_scan_async+0x0/0x11a [] ? do_scsi_scan_host+0x51/0x56 [] ? do_scan_async+0x0/0x11a [] ? do_scan_async+0xe/0x11a [] ? do_scan_async+0x0/0x11a [] ? kthread+0x5e/0x63 [] ? kthread+0x0/0x63 [] ? kernel_thread_helper+0x6/0x10 Code: 44 24 1c 54 83 44 24 18 54 83 fa 03 75 94 8b 06 c7 86 64 02 00 00 01 00 00 00 83 e0 03 09 f0 89 06 8b 44 24 28 8b 90 58 01 00 00 <8b> 42 2c 85 c0 75 03 8b 42 08 8d 54 24 48 52 8d 4c 24 50 51 68 EIP: [] cfq_get_queue+0x232/0x412 SS:ESP 0068:dff8fd00 CR2: 000000000000002c ---[ end trace 9a88306573f69b12 ]--- The problem here is that we don't have bdi->dev information available when a thread does some IO. Hence when dev_name() tries to access bdi->dev, it crashes. This problem does not happen if kernel threads are in the root group, as the root group is statically allocated at device initialization time and we don't hit this piece of code. Fix it by delaying the filling in of the device's major and minor number information in blk_group. Initially a blk_group is created with 0 as device information and this information is filled in later once some more IO comes in from the same group. Reported-by: Mike Kazantsev Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index f65c6f01c47..9eba291eb6f 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1019,10 +1019,20 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) */ atomic_set(&cfqg->ref, 1); - /* Add group onto cgroup list */ - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); - cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, + /* + * Add group onto cgroup list. It might happen that bdi->dev is + * not initiliazed yet. Initialize this new group without major + * and minor info and this info will be filled in once a new thread + * comes for IO. See code above. + */ + if (bdi->dev) { + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, MKDEV(major, minor)); + } else + cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, + 0); + cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); /* Add group on cfqd list */ -- cgit v1.2.3 From f281fb5fe54e15a7ab802945e42f8e24fceb56b2 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Sat, 25 Sep 2010 12:42:55 +0200 Subject: block: prevent merges of discard and write requests Add logic to prevent two I/O requests from being merged if only one of them is a discard. Ditto secure discard. Without this fix, it is possible for write requests to transform into discard requests. For example: Submit bio 1 to discard 8 sectors from sector n Submit bio 2 to write 8 sectors from sector n + 16 Submit bio 3 to write 8 sectors from sector n + 8 Bio 1 becomes request 1. Bio 2 becomes request 2. Bio 3 is merged with request 2, and then subsequently request 2 is merged with request 1, resulting in just one I/O request which discards all 24 sectors. 
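The rule the patch below enforces can be stated as a simple predicate over the two requests' flags. The following sketch is an illustration only — the flag values are placeholders, not the kernel's REQ_* definitions — and shows that a write and a discard are never considered mergeable, while two writes still are:

#include <stdbool.h>
#include <stdio.h>

/* Placeholder flag bits for illustration; not the kernel's values. */
#define REQ_DISCARD (1U << 0)
#define REQ_SECURE  (1U << 1)

/* Requests may merge only if both or neither are discards, and both
 * or neither are secure discards. */
static bool flags_allow_merge(unsigned int a, unsigned int b)
{
    if ((a & REQ_DISCARD) != (b & REQ_DISCARD))
        return false;
    if ((a & REQ_SECURE) != (b & REQ_SECURE))
        return false;
    return true;
}

int main(void)
{
    unsigned int discard = REQ_DISCARD;
    unsigned int write = 0;

    printf("discard + write mergeable: %d\n",
           flags_allow_merge(discard, write));  /* prints 0 */
    printf("write + write mergeable: %d\n",
           flags_allow_merge(write, write));    /* prints 1 */
    return 0;
}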
Signed-off-by: Adrian Hunter (Moved the checks above the position checks /Jens) Signed-off-by: Jens Axboe --- block/blk-merge.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index 3b0cd424967..eafc94f68d7 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -361,6 +361,18 @@ static int attempt_merge(struct request_queue *q, struct request *req, if (!rq_mergeable(req) || !rq_mergeable(next)) return 0; + /* + * Don't merge file system requests and discard requests + */ + if ((req->cmd_flags & REQ_DISCARD) != (next->cmd_flags & REQ_DISCARD)) + return 0; + + /* + * Don't merge discard requests and secure discard requests + */ + if ((req->cmd_flags & REQ_SECURE) != (next->cmd_flags & REQ_SECURE)) + return 0; + /* * not contiguous */ -- cgit v1.2.3 From 430c62fb2948d964cf8dc7f3e2f69623c04ef62f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Oct 2010 09:35:16 +0200 Subject: elevator: fix oops on early call to elevator_change() 2.6.36 introduces an API for drivers to switch the IO scheduler instead of manually calling the elevator exit and init functions. This API was added since q->elevator must be cleared in between those two calls. And since we already have this functionality directly from use by the sysfs interface to switch schedulers online, it was prudent to reuse it internally too. But this API needs the queue to be in a fully initialized state before it is called, or it will attempt to unregister elevator kobjects before they have been added. This results in an oops like this: BUG: unable to handle kernel NULL pointer dereference at 0000000000000051 IP: [] sysfs_create_dir+0x2e/0xc0 PGD 47ddfc067 PUD 47c6a1067 PMD 0 Oops: 0000 [#1] PREEMPT SMP last sysfs file: /sys/devices/pci0000:00/0000:00:02.0/0000:04:00.1/irq CPU 2 Modules linked in: t(+) loop hid_apple usbhid ahci ehci_hcd uhci_hcd libahci usbcore nls_base igb Pid: 7319, comm: modprobe Not tainted 2.6.36-rc6+ #132 QSSC-S4R/QSSC-S4R RIP: 0010:[] [] sysfs_create_dir+0x2e/0xc0 RSP: 0018:ffff88027da25d08 EFLAGS: 00010246 RAX: ffff88047c68c528 RBX: 00000000fffffffe RCX: 0000000000000000 RDX: 000000000000002f RSI: 000000000000002f RDI: ffff88047e196c88 RBP: ffff88027da25d38 R08: 0000000000000000 R09: d84156c5635688c0 R10: d84156c5635688c0 R11: 0000000000000000 R12: ffff88047e196c88 R13: 0000000000000000 R14: 0000000000000000 R15: ffff88047c68c528 FS: 00007fcb0b26f6e0(0000) GS:ffff880287400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000051 CR3: 000000047e76e000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process modprobe (pid: 7319, threadinfo ffff88027da24000, task ffff88027d377090) Stack: ffff88027da25d58 ffff88047c68c528 00000000fffffffe ffff88047e196c88 <0> ffff88047c68c528 ffff88047e05bd90 ffff88027da25d78 ffffffff8123fb77 <0> ffff88047e05bd90 0000000000000000 ffff88047e196c88 ffff88047c68c528 Call Trace: [] kobject_add_internal+0xe7/0x1f0 [] kobject_add_varg+0x38/0x60 [] kobject_add+0x69/0x90 [] ? sysfs_remove_dir+0x20/0xa0 [] ? sub_preempt_count+0x9d/0xe0 [] ? _raw_spin_unlock+0x30/0x50 [] ? sysfs_remove_dir+0x20/0xa0 [] ? sysfs_remove_dir+0x34/0xa0 [] elv_register_queue+0x34/0xa0 [] elevator_change+0xfd/0x250 [] ? t_init+0x0/0x361 [t] [] ? 
t_init+0x0/0x361 [t] [] t_init+0xa8/0x361 [t] [] do_one_initcall+0x3e/0x170 [] sys_init_module+0xbd/0x220 [] system_call_fastpath+0x16/0x1b Code: e5 41 56 41 55 41 54 49 89 fc 53 48 83 ec 10 48 85 ff 74 52 48 8b 47 18 49 c7 c5 00 46 61 81 48 85 c0 74 04 4c 8b 68 30 45 31 f6 <41> 80 7d 51 00 74 0e 49 8b 44 24 28 4c 89 e7 ff 50 20 49 89 c6 RIP [] sysfs_create_dir+0x2e/0xc0 RSP CR2: 0000000000000051 ---[ end trace a6541d3bf07945df ]--- Fix this by adding a registered bit to the elevator queue, which is set when the sysfs kobjects have been registered. Signed-off-by: Jens Axboe --- block/elevator.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 205b09a5bd9..4e11559aa2b 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -938,6 +938,7 @@ int elv_register_queue(struct request_queue *q) } } kobject_uevent(&e->kobj, KOBJ_ADD); + e->registered = 1; } return error; } @@ -947,6 +948,7 @@ static void __elv_unregister_queue(struct elevator_queue *e) { kobject_uevent(&e->kobj, KOBJ_REMOVE); kobject_del(&e->kobj); + e->registered = 0; } void elv_unregister_queue(struct request_queue *q) @@ -1042,11 +1044,13 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) spin_unlock_irq(q->queue_lock); - __elv_unregister_queue(old_elevator); + if (old_elevator->registered) { + __elv_unregister_queue(old_elevator); - err = elv_register_queue(q); - if (err) - goto fail_register; + err = elv_register_queue(q); + if (err) + goto fail_register; + } /* * finally exit old elevator and turn off BYPASS. -- cgit v1.2.3
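Taken together, the two elevator patches above define how a driver is expected to pick an IO scheduler: call elevator_change() on its request queue rather than open-coding elevator_exit()/elevator_init(). The fragment below is a hypothetical driver sketch, not code from this series — example_setup_disk(), struct example_dev, and the choice of "deadline" are assumptions for illustration — showing the intended call once the queue exists; with the registered-bit fix above, the call is also safe before the elevator sysfs objects have been added.

/* Hypothetical driver code, for illustration only. */
static int example_setup_disk(struct example_dev *dev)
{
    int err;

    dev->queue = blk_init_queue(example_request_fn, &dev->lock);
    if (!dev->queue)
        return -ENOMEM;

    /* Ask for a specific scheduler via the new helper; a non-zero
     * return now carries the real error instead of being lost. */
    err = elevator_change(dev->queue, "deadline");
    if (err)
        printk(KERN_WARNING "example: elevator switch failed: %d\n", err);

    return err;
}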