From 96aa1b419d47286db446f292cf898bb1a8b27f24 Mon Sep 17 00:00:00 2001 From: Ciju Rajan K Date: Mon, 23 Aug 2010 10:56:30 +0200 Subject: blkio: Fix return code for mkdir calls If the cgroup hierarchy for blkio control groups is deeper than two levels, the kernel should not allow the creation of further levels. The mkdir system call does not expect EINVAL as a return value. This patch replaces EINVAL with the more appropriate EPERM. Signed-off-by: Ciju Rajan K Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index a6809645d21..2fef1ef931a 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -966,7 +966,7 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup) /* Currently we do not support hierarchy deeper than two level (0,1) */ if (parent != cgroup->top_cgroup) - return ERR_PTR(-EINVAL); + return ERR_PTR(-EPERM); blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); if (!blkcg) -- cgit v1.2.3 From b6508c1618e7aab085f191efb41b7b019a94ea38 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 23 Aug 2010 12:23:33 +0200 Subject: cfq-iosched: Do not idle if slice_idle=0 Do not idle on either the cfq queue or the service tree if slice_idle=0; the user does not want any queue or service tree idling. Currently, even if slice_idle=0, we wait for the request to finish before expiring the queue, and that can lead to lower queue depths. Acked-by: Jeff Moyer Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index eb4086f7dfe..8830569542c 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1839,6 +1839,9 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(!service_tree); BUG_ON(!service_tree->count); + if (!cfqd->cfq_slice_idle) + return false; + /* We never do for idle class queues. */ if (prio == IDLE_WORKLOAD) return false; @@ -1879,7 +1882,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) /* * idle is disabled, either manually or by past process history */ - if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq)) + if (!cfq_should_idle(cfqd, cfqq)) return; /* -- cgit v1.2.3 From 02b35081fc98f681411586d3acf9eaad8b8f6e07 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 23 Aug 2010 12:23:53 +0200 Subject: cfq-iosched: Do group share accounting in IOPS when slice_idle=0 o Implement another CFQ mode where we charge a group in terms of the number of requests dispatched instead of measuring time. Measuring in terms of time is not possible when we are driving deeper queue depths and there are requests from multiple cfq queues in the request queue. o This mode currently gets activated if one sets slice_idle=0 and the associated disk supports NCQ. Again, the idea is that on an NCQ disk with idling disabled most of the queues will dispatch 1 or more requests, then cfq queue expiry happens, and we don't have a way to measure time. So start providing fairness in terms of IOPS. o Currently IOPS mode works only with cfq group scheduling. CFQ follows different scheduling algorithms for queue and group scheduling. These IOPS stats are used only for group scheduling, hence in non-group mode nothing should change. 
o For CFQ group scheduling one can disable slice idling so that we don't idle on the queue and can drive deeper request queue depths (achieving better throughput); at the same time group idle is enabled, so one should still get service differentiation among groups. Signed-off-by: Vivek Goyal Acked-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 8830569542c..3fc6be110c1 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -378,6 +378,21 @@ CFQ_CFQQ_FNS(wait_busy); &cfqg->service_trees[i][j]: NULL) \ +static inline bool iops_mode(struct cfq_data *cfqd) +{ + /* + * If we are not idling on queues and it is a NCQ drive, parallel + * execution of requests is on and measuring time is not possible + * in most of the cases until and unless we drive shallower queue + * depths and that becomes a performance bottleneck. In such cases + * switch to start providing fairness in terms of number of IOs. + */ + if (!cfqd->cfq_slice_idle && cfqd->hw_tag) + return true; + else + return false; +} + static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) { if (cfq_class_idle(cfqq)) @@ -906,7 +921,6 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) slice_used = cfqq->allocated_slice; } - cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u", slice_used); return slice_used; } @@ -914,19 +928,21 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, struct cfq_queue *cfqq) { struct cfq_rb_root *st = &cfqd->grp_service_tree; - unsigned int used_sl, charge_sl; + unsigned int used_sl, charge; int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) - cfqg->service_tree_idle.count; BUG_ON(nr_sync < 0); - used_sl = charge_sl = cfq_cfqq_slice_usage(cfqq); + used_sl = charge = cfq_cfqq_slice_usage(cfqq); - if (!cfq_cfqq_sync(cfqq) && !nr_sync) - charge_sl = cfqq->allocated_slice; + if (iops_mode(cfqd)) + charge = cfqq->slice_dispatch; + else if (!cfq_cfqq_sync(cfqq) && !nr_sync) + charge = cfqq->allocated_slice; /* Can't update vdisktime while group is on service tree */ cfq_rb_erase(&cfqg->rb_node, st); - cfqg->vdisktime += cfq_scale_slice(charge_sl, cfqg); + cfqg->vdisktime += cfq_scale_slice(charge, cfqg); __cfq_group_service_tree_add(st, cfqg); /* This group is being expired. Save the context */ @@ -940,6 +956,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, st->min_vdisktime); + cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u", + used_sl, cfqq->slice_dispatch, charge, iops_mode(cfqd)); cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl); cfq_blkiocg_set_start_empty_time(&cfqg->blkg); } -- cgit v1.2.3 From 80bdf0c78fff075dfa21576273c8b0c7db22bdfe Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 23 Aug 2010 12:24:26 +0200 Subject: cfq-iosched: Implement tunable group_idle o Implement a new tunable group_idle, which allows idling on the group instead of on a cfq queue. Hence one can set slice_idle = 0 and not idle on the individual queues but still idle on the group. This way, on fast storage we can get fairness between groups while at the same time overall throughput improves. 
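As a practical aside, the slice_idle and group_idle knobs discussed above are exposed through the cfq iosched sysfs directory. The following minimal userspace sketch is an illustration only — the device name "sdb" and the exact sysfs paths are assumptions about a typical cfq-managed disk, not part of these patches — and shows how the combination described in the previous paragraph (queue idling off, group idling left on) would be configured:

#include <stdio.h>

/* Write a single value to a cfq tunable; returns 0 on success. */
static int set_tunable(const char *path, const char *value)
{
    FILE *f = fopen(path, "w");

    if (!f) {
        perror(path);
        return -1;
    }
    fprintf(f, "%s\n", value);
    return fclose(f);
}

int main(void)
{
    /* Hypothetical device: replace "sdb" with the disk under test. */
    const char *base = "/sys/block/sdb/queue/iosched";
    char path[128];

    /* Disable per-queue idling; on an NCQ disk this also switches
     * group accounting to IOPS per the patch above. */
    snprintf(path, sizeof(path), "%s/slice_idle", base);
    set_tunable(path, "0");

    /* Keep group idling (value in milliseconds as shown by sysfs) so
     * service differentiation between groups is preserved. */
    snprintf(path, sizeof(path), "%s/group_idle", base);
    set_tunable(path, "8");

    return 0;
}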
Signed-off-by: Vivek Goyal Acked-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 65 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 56 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 3fc6be110c1..85e48192754 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -30,6 +30,7 @@ static const int cfq_slice_sync = HZ / 10; static int cfq_slice_async = HZ / 25; static const int cfq_slice_async_rq = 2; static int cfq_slice_idle = HZ / 125; +static int cfq_group_idle = HZ / 125; static const int cfq_target_latency = HZ * 3/10; /* 300 ms */ static const int cfq_hist_divisor = 4; @@ -198,6 +199,8 @@ struct cfq_group { struct hlist_node cfqd_node; atomic_t ref; #endif + /* number of requests that are on the dispatch list or inside driver */ + int dispatched; }; /* @@ -271,6 +274,7 @@ struct cfq_data { unsigned int cfq_slice[2]; unsigned int cfq_slice_async_rq; unsigned int cfq_slice_idle; + unsigned int cfq_group_idle; unsigned int cfq_latency; unsigned int cfq_group_isolation; @@ -1884,7 +1888,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) { struct cfq_queue *cfqq = cfqd->active_queue; struct cfq_io_context *cic; - unsigned long sl; + unsigned long sl, group_idle = 0; /* * SSD device without seek penalty, disable idling. But only do so @@ -1900,8 +1904,13 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) /* * idle is disabled, either manually or by past process history */ - if (!cfq_should_idle(cfqd, cfqq)) - return; + if (!cfq_should_idle(cfqd, cfqq)) { + /* no queue idling. Check for group idling */ + if (cfqd->cfq_group_idle) + group_idle = cfqd->cfq_group_idle; + else + return; + } /* * still active requests from this queue, don't idle @@ -1928,13 +1937,21 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) return; } + /* There are other queues in the group, don't do group idle */ + if (group_idle && cfqq->cfqg->nr_cfqq > 1) + return; + cfq_mark_cfqq_wait_request(cfqq); - sl = cfqd->cfq_slice_idle; + if (group_idle) + sl = cfqd->cfq_group_idle; + else + sl = cfqd->cfq_slice_idle; mod_timer(&cfqd->idle_slice_timer, jiffies + sl); cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg); - cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); + cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl, + group_idle ? 1 : 0); } /* @@ -1950,6 +1967,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq); cfq_remove_request(rq); cfqq->dispatched++; + (RQ_CFQG(rq))->dispatched++; elv_dispatch_sort(q, rq); cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; @@ -2219,7 +2237,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) cfqq = NULL; goto keep_queue; } else - goto expire; + goto check_group_idle; } /* @@ -2247,8 +2265,23 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) * flight or is idling for a new request, allow either of these * conditions to happen (or time out) before selecting a new queue. */ - if (timer_pending(&cfqd->idle_slice_timer) || - (cfqq->dispatched && cfq_should_idle(cfqd, cfqq))) { + if (timer_pending(&cfqd->idle_slice_timer)) { + cfqq = NULL; + goto keep_queue; + } + + if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { + cfqq = NULL; + goto keep_queue; + } + + /* + * If group idle is enabled and there are requests dispatched from + * this group, wait for requests to complete. 
+ */ +check_group_idle: + if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 + && cfqq->cfqg->dispatched) { cfqq = NULL; goto keep_queue; } @@ -3396,6 +3429,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) WARN_ON(!cfqq->dispatched); cfqd->rq_in_driver--; cfqq->dispatched--; + (RQ_CFQG(rq))->dispatched--; cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg, rq_start_time_ns(rq), rq_io_start_time_ns(rq), rq_data_dir(rq), rq_is_sync(rq)); @@ -3425,7 +3459,10 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) * the queue. */ if (cfq_should_wait_busy(cfqd, cfqq)) { - cfqq->slice_end = jiffies + cfqd->cfq_slice_idle; + unsigned long extend_sl = cfqd->cfq_slice_idle; + if (!cfqd->cfq_slice_idle) + extend_sl = cfqd->cfq_group_idle; + cfqq->slice_end = jiffies + extend_sl; cfq_mark_cfqq_wait_busy(cfqq); cfq_log_cfqq(cfqd, cfqq, "will busy wait"); } @@ -3871,6 +3908,7 @@ static void *cfq_init_queue(struct request_queue *q) cfqd->cfq_slice[1] = cfq_slice_sync; cfqd->cfq_slice_async_rq = cfq_slice_async_rq; cfqd->cfq_slice_idle = cfq_slice_idle; + cfqd->cfq_group_idle = cfq_group_idle; cfqd->cfq_latency = 1; cfqd->cfq_group_isolation = 0; cfqd->hw_tag = -1; @@ -3943,6 +3981,7 @@ SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1); SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0); SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0); SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1); +SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1); SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); @@ -3975,6 +4014,7 @@ STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0); STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0); STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1); +STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1); STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1); STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, @@ -3996,6 +4036,7 @@ static struct elv_fs_entry cfq_attrs[] = { CFQ_ATTR(slice_async), CFQ_ATTR(slice_async_rq), CFQ_ATTR(slice_idle), + CFQ_ATTR(group_idle), CFQ_ATTR(low_latency), CFQ_ATTR(group_isolation), __ATTR_NULL @@ -4049,6 +4090,12 @@ static int __init cfq_init(void) if (!cfq_slice_idle) cfq_slice_idle = 1; +#ifdef CONFIG_CFQ_GROUP_IOSCHED + if (!cfq_group_idle) + cfq_group_idle = 1; +#else + cfq_group_idle = 0; +#endif if (cfq_slab_setup()) return -ENOMEM; -- cgit v1.2.3 From c4e7893ebc3a5c507b53f59b9de448db20849944 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 23 Aug 2010 12:25:03 +0200 Subject: cfq-iosched: blktrace print per slice sector stats o Divyesh had gotten rid of this code in the past. I want to re-introduce it back as it helps me a lot during debugging. 
Reviewed-by: Jeff Moyer Reviewed-by: Divyesh Shah Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 85e48192754..f65c6f01c47 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -148,6 +148,8 @@ struct cfq_queue { struct cfq_queue *new_cfqq; struct cfq_group *cfqg; struct cfq_group *orig_cfqg; + /* Number of sectors dispatched from queue in single dispatch round */ + unsigned long nr_sectors; }; /* @@ -960,8 +962,9 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, st->min_vdisktime); - cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u", - used_sl, cfqq->slice_dispatch, charge, iops_mode(cfqd)); + cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u" + " sect=%u", used_sl, cfqq->slice_dispatch, charge, + iops_mode(cfqd), cfqq->nr_sectors); cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl); cfq_blkiocg_set_start_empty_time(&cfqg->blkg); } @@ -1609,6 +1612,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, cfqq->allocated_slice = 0; cfqq->slice_end = 0; cfqq->slice_dispatch = 0; + cfqq->nr_sectors = 0; cfq_clear_cfqq_wait_request(cfqq); cfq_clear_cfqq_must_dispatch(cfqq); @@ -1971,6 +1975,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) elv_dispatch_sort(q, rq); cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; + cfqq->nr_sectors += blk_rq_sectors(rq); cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq), rq_data_dir(rq), rq_is_sync(rq)); } -- cgit v1.2.3 From c87ffbb812cf6150097a5095b031f4013a8f78a5 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Mon, 23 Aug 2010 12:30:29 +0200 Subject: block: put dev->kobj in blk_register_queue fail path The kernel needs to do a kobject_put() on dev->kobj if elv_register_queue() fails. Signed-off-by: Xiaotian Feng Cc: "Martin K. Petersen" Cc: Stephen Hemminger Cc: Nikanth Karthikesan Cc: David Teigland Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 001ab18078f..0749b89c688 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -511,6 +511,7 @@ int blk_register_queue(struct gendisk *disk) kobject_uevent(&q->kobj, KOBJ_REMOVE); kobject_del(&q->kobj); blk_trace_remove_sysfs(disk_to_dev(disk)); + kobject_put(&dev->kobj); return ret; } -- cgit v1.2.3 From 5e00d1b5b4c10fb839afd5ce61db8e24339454b0 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 12 Aug 2010 14:31:06 +0200 Subject: BLOCK: fix bio.bi_rw handling The result of the bi_rw tests is no longer a bool after commit 74450be1, but the results of such tests are still stored in bools. The value no longer fits there for some compilers (gcc 4.5 here), so either use the !! idiom to get real bools, or use an unsigned long where the result is assigned. 
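The hazard behind this fix is easy to demonstrate outside the kernel. The sketch below is purely illustrative (HIGH_FLAG is a made-up flag bit, not a real REQ_* value); it shows the kind of silent truncation a narrowing assignment can cause when a flag lives in a high bit, and why the !! form, or a wider destination type, is robust:

#include <stdio.h>

/* Illustrative flag above bit 31, as flags can be once they share a
 * 64-bit field with other state. */
#define HIGH_FLAG (1ULL << 36)

int main(void)
{
    unsigned long long rw = HIGH_FLAG;

    /* Narrowing assignment: the set bit is above bit 31, so the
     * stored value becomes 0 and the test is effectively lost. */
    unsigned int truncated = rw & HIGH_FLAG;

    /* !! collapses the test to 0/1 before the assignment, so no
     * information is lost regardless of the destination type. */
    unsigned int normalized = !!(rw & HIGH_FLAG);

    printf("truncated=%u normalized=%u\n", truncated, normalized);
    return 0;
}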
Signed-off-by: Jiri Slaby Cc: Christoph Hellwig Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/blk-core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index ee1a1e7e63c..32a1c123dfb 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1198,9 +1198,9 @@ static int __make_request(struct request_queue *q, struct bio *bio) int el_ret; unsigned int bytes = bio->bi_size; const unsigned short prio = bio_prio(bio); - const bool sync = (bio->bi_rw & REQ_SYNC); - const bool unplug = (bio->bi_rw & REQ_UNPLUG); - const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK; + const bool sync = !!(bio->bi_rw & REQ_SYNC); + const bool unplug = !!(bio->bi_rw & REQ_UNPLUG); + const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK; int rw_flags; if ((bio->bi_rw & REQ_HARDBARRIER) && -- cgit v1.2.3 From 5dd531a03ad721b41911ddb32e6e0481404e7aaf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 23 Aug 2010 13:52:19 +0200 Subject: block: add function call to switch the IO scheduler from a driver Currently drivers must do an elevator_exit() + elevator_init() to switch IO schedulers. There are a few problems with this: - Since commit 1abec4fdbb142e3ccb6ce99832fae42129134a96, elevator_init() requires a zeroed-out q->elevator pointer. The two existing in-kernel users don't do that. - It will only work at initialization time, since using the above two-stage construct does not properly quiesce the queue. So add elevator_change(), which takes care of this, and convert the elv_iosched_store() sysfs interface to use this helper as well. Reported-by: Peter Oberparleiter Reported-by: Kevin Vigor Signed-off-by: Jens Axboe --- block/elevator.c | 44 +++++++++++++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index ec585c9554d..205b09a5bd9 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -1009,18 +1009,19 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) { struct elevator_queue *old_elevator, *e; void *data; + int err; /* * Allocate new elevator */ e = elevator_alloc(q, new_e); if (!e) - return 0; + return -ENOMEM; data = elevator_init_queue(q, e); if (!data) { kobject_put(&e->kobj); - return 0; + return -ENOMEM; } /* @@ -1043,7 +1044,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) __elv_unregister_queue(old_elevator); - if (elv_register_queue(q)) + err = elv_register_queue(q); + if (err) goto fail_register; /* @@ -1056,7 +1058,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name); - return 1; + return 0; fail_register: /* @@ -1071,17 +1073,19 @@ fail_register: queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); spin_unlock_irq(q->queue_lock); - return 0; + return err; } -ssize_t elv_iosched_store(struct request_queue *q, const char *name, - size_t count) +/* + * Switch this queue to the given IO scheduler. 
+ */ +int elevator_change(struct request_queue *q, const char *name) { char elevator_name[ELV_NAME_MAX]; struct elevator_type *e; if (!q->elevator) - return count; + return -ENXIO; strlcpy(elevator_name, name, sizeof(elevator_name)); e = elevator_get(strstrip(elevator_name)); @@ -1092,13 +1096,27 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name, if (!strcmp(elevator_name, q->elevator->elevator_type->elevator_name)) { elevator_put(e); - return count; + return 0; } - if (!elevator_switch(q, e)) - printk(KERN_ERR "elevator: switch to %s failed\n", - elevator_name); - return count; + return elevator_switch(q, e); +} +EXPORT_SYMBOL(elevator_change); + +ssize_t elv_iosched_store(struct request_queue *q, const char *name, + size_t count) +{ + int ret; + + if (!q->elevator) + return count; + + ret = elevator_change(q, name); + if (!ret) + return count; + + printk(KERN_ERR "elevator: switch to %s failed\n", name); + return ret; } ssize_t elv_iosched_show(struct request_queue *q, char *name) -- cgit v1.2.3 From be14eb619108fa8b7120eb2c42d66d5f623ae10e Mon Sep 17 00:00:00 2001 From: Brian King Date: Fri, 10 Sep 2010 09:03:21 +0200 Subject: block: Range check cpu in blk_cpu_to_group While testing CPU DLPAR, the following problem was discovered. We were DLPAR removing the first CPU, which in this case was logical CPUs 0-3. CPUs 0-2 were already marked offline and we were in the process of offlining CPU 3. After marking the CPU inactive and offline in cpu_disable, but before the cpu was completely idle (cpu_die), we ended up in __make_request on CPU 3. There we looked at the topology map to see which CPU to complete the I/O on and found no CPUs in the cpu_sibling_map. This resulted in the block layer setting the completion cpu to be NR_CPUS, which then caused an oops when we tried to complete the I/O. Fix this by sanity checking the value we return from blk_cpu_to_group to be a valid cpu value. 
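To make the sentinel convention explicit, here is a small userspace analogue — not kernel code; NR_CPUS and the plain bitmask stand in for the real cpumask helpers and topology maps — of the check the patch below adds: a "first CPU" lookup on an empty sibling map returns the mask size, and the caller must fall back to the requesting CPU rather than use that out-of-range value:

#include <stdio.h>

#define NR_CPUS 8

/* Stand-in for cpumask_first(): index of the first set bit, or
 * NR_CPUS when the mask is empty. */
static int first_cpu(unsigned int mask)
{
    int cpu;

    for (cpu = 0; cpu < NR_CPUS; cpu++)
        if (mask & (1U << cpu))
            return cpu;
    return NR_CPUS;
}

/* Mirrors the fixed blk_cpu_to_group(): never hand back NR_CPUS. */
static int cpu_to_group(unsigned int sibling_mask, int cpu)
{
    int group = first_cpu(sibling_mask);

    if (group < NR_CPUS)
        return group;
    return cpu;    /* topology map empty: fall back to this cpu */
}

int main(void)
{
    /* CPU 3 is being offlined and its sibling map is already clear. */
    printf("completion group for cpu 3: %d\n", cpu_to_group(0, 3));
    return 0;
}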
Signed-off-by: Brian King Signed-off-by: Jens Axboe --- block/blk.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk.h b/block/blk.h index 6e7dc87141e..d6b911ac002 100644 --- a/block/blk.h +++ b/block/blk.h @@ -142,14 +142,18 @@ static inline int queue_congestion_off_threshold(struct request_queue *q) static inline int blk_cpu_to_group(int cpu) { + int group = NR_CPUS; #ifdef CONFIG_SCHED_MC const struct cpumask *mask = cpu_coregroup_mask(cpu); - return cpumask_first(mask); + group = cpumask_first(mask); #elif defined(CONFIG_SCHED_SMT) - return cpumask_first(topology_thread_cpumask(cpu)); + group = cpumask_first(topology_thread_cpumask(cpu)); #else return cpu; #endif + if (likely(group < NR_CPUS)) + return group; + return cpu; } /* -- cgit v1.2.3 From a45dc2d2b8d1afa57c91dcfac224e50ffcd3f805 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Mon, 13 Sep 2010 21:32:19 +0200 Subject: block: fix blk_rq_map_kern bio direction flag This bug was introduced in 7b6d91daee5cac6402186ff224c3af39d79f4a0e "block: unify flags for struct bio and struct request" Cc: Boaz Harrosh Signed-off-by: Benny Halevy Signed-off-by: Jens Axboe --- block/blk-map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-map.c b/block/blk-map.c index c65d7593f7f..ade0a08c909 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -307,7 +307,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, return PTR_ERR(bio); if (rq_data_dir(rq) == WRITE) - bio->bi_rw |= (1 << REQ_WRITE); + bio->bi_rw |= REQ_WRITE; if (do_copy) rq->cmd_flags |= REQ_COPY_USER; -- cgit v1.2.3 From 180be2a0420a0d680285e9352c0db85016b9ed2a Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Tue, 14 Sep 2010 08:47:11 +0200 Subject: cfq-iosched: fix a kernel OOPs when usb key is inserted Mike reported a kernel crash when a usb key hotplug is performed while all kernel threads are not in the root cgroup and are running in one of the child cgroups of the blkio controller. BUG: unable to handle kernel NULL pointer dereference at 0000002c IP: [] cfq_get_queue+0x232/0x412 *pde = 00000000 Oops: 0000 [#1] PREEMPT last sysfs file: /sys/devices/pci0000:00/0000:00:1d.7/usb2/2-1/2-1:1.0/host3/scsi_host/host3/uevent [..] Pid: 30039, comm: scsi_scan_3 Not tainted 2.6.35.2-fg.roam #1 Volvi2 /Aspire 4315 EIP: 0060:[] EFLAGS: 00010086 CPU: 0 EIP is at cfq_get_queue+0x232/0x412 EAX: f705f9c0 EBX: e977abac ECX: 00000000 EDX: 00000000 ESI: f00da400 EDI: f00da4ec EBP: e977a800 ESP: dff8fd00 DS: 007b ES: 007b FS: 0000 GS: 0000 SS: 0068 Process scsi_scan_3 (pid: 30039, ti=dff8e000 task=f6b6c9a0 task.ti=dff8e000) Stack: 00000000 00000000 00000001 01ff0000 f00da508 00000000 f00da524 f00da540 <0> e7994940 dd631750 f705f9c0 e977a820 e977ac44 f00da4d0 00000001 f6b6c9a0 <0> 00000010 00008010 0000000b 00000000 00000001 e977a800 dd76fac0 00000246 Call Trace: [] ? cfq_set_request+0x228/0x34c [] ? cfq_set_request+0x0/0x34c [] ? elv_set_request+0xf/0x1c [] ? get_request+0x1ad/0x22f [] ? get_request_wait+0x1f/0x11a [] ? kvasprintf+0x33/0x3b [] ? scsi_execute+0x1d/0x103 [] ? scsi_execute_req+0x58/0x83 [] ? scsi_probe_and_add_lun+0x188/0x7c2 [] ? attribute_container_add_device+0x15/0xfa [] ? kobject_get+0xf/0x13 [] ? get_device+0x10/0x14 [] ? scsi_alloc_target+0x217/0x24d [] ? __scsi_scan_target+0x95/0x480 [] ? dequeue_entity+0x14/0x1fe [] ? update_curr+0x165/0x1ab [] ? update_curr+0x165/0x1ab [] ? scsi_scan_channel+0x4a/0x76 [] ? scsi_scan_host_selected+0x77/0xad [] ? 
do_scan_async+0x0/0x11a [] ? do_scsi_scan_host+0x51/0x56 [] ? do_scan_async+0x0/0x11a [] ? do_scan_async+0xe/0x11a [] ? do_scan_async+0x0/0x11a [] ? kthread+0x5e/0x63 [] ? kthread+0x0/0x63 [] ? kernel_thread_helper+0x6/0x10 Code: 44 24 1c 54 83 44 24 18 54 83 fa 03 75 94 8b 06 c7 86 64 02 00 00 01 00 00 00 83 e0 03 09 f0 89 06 8b 44 24 28 8b 90 58 01 00 00 <8b> 42 2c 85 c0 75 03 8b 42 08 8d 54 24 48 52 8d 4c 24 50 51 68 EIP: [] cfq_get_queue+0x232/0x412 SS:ESP 0068:dff8fd00 CR2: 000000000000002c ---[ end trace 9a88306573f69b12 ]--- The problem here is that we don't have bdi->dev information available when a thread does some IO. Hence when dev_name() tries to access bdi->dev, it crashes. This problem does not happen if kernel threads are in the root group, as the root group is statically allocated at device initialization time and we don't hit this piece of code. Fix it by delaying the filling in of the device's major and minor number information in blk_group. Initially a blk_group is created with 0 as device information and this information is filled in later once some more IO comes in from the same group. Reported-by: Mike Kazantsev Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index f65c6f01c47..9eba291eb6f 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1019,10 +1019,20 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) */ atomic_set(&cfqg->ref, 1); - /* Add group onto cgroup list */ - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); - cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, + /* + * Add group onto cgroup list. It might happen that bdi->dev is + * not initiliazed yet. Initialize this new group without major + * and minor info and this info will be filled in once a new thread + * comes for IO. See code above. + */ + if (bdi->dev) { + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, MKDEV(major, minor)); + } else + cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, + 0); + cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); /* Add group on cfqd list */ -- cgit v1.2.3 From f281fb5fe54e15a7ab802945e42f8e24fceb56b2 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Sat, 25 Sep 2010 12:42:55 +0200 Subject: block: prevent merges of discard and write requests Add logic to prevent two I/O requests from being merged if only one of them is a discard. Ditto secure discard. Without this fix, it is possible for write requests to transform into discard requests. For example: Submit bio 1 to discard 8 sectors from sector n Submit bio 2 to write 8 sectors from sector n + 16 Submit bio 3 to write 8 sectors from sector n + 8 Bio 1 becomes request 1. Bio 2 becomes request 2. Bio 3 is merged with request 2, and then subsequently request 2 is merged with request 1, resulting in just one I/O request which discards all 24 sectors. 
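The rule the patch below enforces can be stated as a simple predicate over the two requests' flags. The following sketch is an illustration only — the flag values are placeholders, not the kernel's REQ_* definitions — and shows that a write and a discard are never considered mergeable, while two writes still are:

#include <stdbool.h>
#include <stdio.h>

/* Placeholder flag bits for illustration; not the kernel's values. */
#define REQ_DISCARD (1U << 0)
#define REQ_SECURE  (1U << 1)

/* Requests may merge only if both or neither are discards, and both
 * or neither are secure discards. */
static bool flags_allow_merge(unsigned int a, unsigned int b)
{
    if ((a & REQ_DISCARD) != (b & REQ_DISCARD))
        return false;
    if ((a & REQ_SECURE) != (b & REQ_SECURE))
        return false;
    return true;
}

int main(void)
{
    unsigned int discard = REQ_DISCARD;
    unsigned int write = 0;

    printf("discard + write mergeable: %d\n",
           flags_allow_merge(discard, write));  /* prints 0 */
    printf("write + write mergeable: %d\n",
           flags_allow_merge(write, write));    /* prints 1 */
    return 0;
}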
Signed-off-by: Adrian Hunter (Moved the checks above the position checks /Jens) Signed-off-by: Jens Axboe --- block/blk-merge.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index 3b0cd424967..eafc94f68d7 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -361,6 +361,18 @@ static int attempt_merge(struct request_queue *q, struct request *req, if (!rq_mergeable(req) || !rq_mergeable(next)) return 0; + /* + * Don't merge file system requests and discard requests + */ + if ((req->cmd_flags & REQ_DISCARD) != (next->cmd_flags & REQ_DISCARD)) + return 0; + + /* + * Don't merge discard requests and secure discard requests + */ + if ((req->cmd_flags & REQ_SECURE) != (next->cmd_flags & REQ_SECURE)) + return 0; + /* * not contiguous */ -- cgit v1.2.3 From 430c62fb2948d964cf8dc7f3e2f69623c04ef62f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Oct 2010 09:35:16 +0200 Subject: elevator: fix oops on early call to elevator_change() 2.6.36 introduces an API for drivers to switch the IO scheduler instead of manually calling the elevator exit and init functions. This API was added since q->elevator must be cleared in between those two calls. And since we already have this functionality directly from use by the sysfs interface to switch schedulers online, it was prudent to reuse it internally too. But this API needs the queue to be in a fully initialized state before it is called, or it will attempt to unregister elevator kobjects before they have been added. This results in an oops like this: BUG: unable to handle kernel NULL pointer dereference at 0000000000000051 IP: [] sysfs_create_dir+0x2e/0xc0 PGD 47ddfc067 PUD 47c6a1067 PMD 0 Oops: 0000 [#1] PREEMPT SMP last sysfs file: /sys/devices/pci0000:00/0000:00:02.0/0000:04:00.1/irq CPU 2 Modules linked in: t(+) loop hid_apple usbhid ahci ehci_hcd uhci_hcd libahci usbcore nls_base igb Pid: 7319, comm: modprobe Not tainted 2.6.36-rc6+ #132 QSSC-S4R/QSSC-S4R RIP: 0010:[] [] sysfs_create_dir+0x2e/0xc0 RSP: 0018:ffff88027da25d08 EFLAGS: 00010246 RAX: ffff88047c68c528 RBX: 00000000fffffffe RCX: 0000000000000000 RDX: 000000000000002f RSI: 000000000000002f RDI: ffff88047e196c88 RBP: ffff88027da25d38 R08: 0000000000000000 R09: d84156c5635688c0 R10: d84156c5635688c0 R11: 0000000000000000 R12: ffff88047e196c88 R13: 0000000000000000 R14: 0000000000000000 R15: ffff88047c68c528 FS: 00007fcb0b26f6e0(0000) GS:ffff880287400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000051 CR3: 000000047e76e000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process modprobe (pid: 7319, threadinfo ffff88027da24000, task ffff88027d377090) Stack: ffff88027da25d58 ffff88047c68c528 00000000fffffffe ffff88047e196c88 <0> ffff88047c68c528 ffff88047e05bd90 ffff88027da25d78 ffffffff8123fb77 <0> ffff88047e05bd90 0000000000000000 ffff88047e196c88 ffff88047c68c528 Call Trace: [] kobject_add_internal+0xe7/0x1f0 [] kobject_add_varg+0x38/0x60 [] kobject_add+0x69/0x90 [] ? sysfs_remove_dir+0x20/0xa0 [] ? sub_preempt_count+0x9d/0xe0 [] ? _raw_spin_unlock+0x30/0x50 [] ? sysfs_remove_dir+0x20/0xa0 [] ? sysfs_remove_dir+0x34/0xa0 [] elv_register_queue+0x34/0xa0 [] elevator_change+0xfd/0x250 [] ? t_init+0x0/0x361 [t] [] ? 
t_init+0x0/0x361 [t] [] t_init+0xa8/0x361 [t] [] do_one_initcall+0x3e/0x170 [] sys_init_module+0xbd/0x220 [] system_call_fastpath+0x16/0x1b Code: e5 41 56 41 55 41 54 49 89 fc 53 48 83 ec 10 48 85 ff 74 52 48 8b 47 18 49 c7 c5 00 46 61 81 48 85 c0 74 04 4c 8b 68 30 45 31 f6 <41> 80 7d 51 00 74 0e 49 8b 44 24 28 4c 89 e7 ff 50 20 49 89 c6 RIP [] sysfs_create_dir+0x2e/0xc0 RSP CR2: 0000000000000051 ---[ end trace a6541d3bf07945df ]--- Fix this by adding a registered bit to the elevator queue, which is set when the sysfs kobjects have been registered. Signed-off-by: Jens Axboe --- block/elevator.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 205b09a5bd9..4e11559aa2b 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -938,6 +938,7 @@ int elv_register_queue(struct request_queue *q) } } kobject_uevent(&e->kobj, KOBJ_ADD); + e->registered = 1; } return error; } @@ -947,6 +948,7 @@ static void __elv_unregister_queue(struct elevator_queue *e) { kobject_uevent(&e->kobj, KOBJ_REMOVE); kobject_del(&e->kobj); + e->registered = 0; } void elv_unregister_queue(struct request_queue *q) @@ -1042,11 +1044,13 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) spin_unlock_irq(q->queue_lock); - __elv_unregister_queue(old_elevator); + if (old_elevator->registered) { + __elv_unregister_queue(old_elevator); - err = elv_register_queue(q); - if (err) - goto fail_register; + err = elv_register_queue(q); + if (err) + goto fail_register; + } /* * finally exit old elevator and turn off BYPASS. -- cgit v1.2.3
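Taken together, the two elevator patches above define how a driver is expected to pick an IO scheduler: call elevator_change() on its request queue rather than open-coding elevator_exit()/elevator_init(). The fragment below is a hypothetical driver sketch, not code from this series — example_setup_disk(), struct example_dev, and the choice of "deadline" are assumptions for illustration — showing the intended call once the queue exists; with the registered-bit fix above, the call is also safe before the elevator sysfs objects have been added.

/* Hypothetical driver code, for illustration only. */
static int example_setup_disk(struct example_dev *dev)
{
    int err;

    dev->queue = blk_init_queue(example_request_fn, &dev->lock);
    if (!dev->queue)
        return -ENOMEM;

    /* Ask for a specific scheduler via the new helper; a non-zero
     * return now carries the real error instead of being lost. */
    err = elevator_change(dev->queue, "deadline");
    if (err)
        printk(KERN_WARNING "example: elevator switch failed: %d\n", err);

    return err;
}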