From bd900d4580107c899d43b262fbbd995f11097a43 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Apr 2011 22:06:57 +0200 Subject: block: kill blk_flush_plug_list() export With all drivers and file systems converted, we only have in-core use of this function. So remove the export. Reporteed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 5fa3dd2705c6..580eee5743e5 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2787,7 +2787,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) local_irq_restore(flags); } -EXPORT_SYMBOL(blk_flush_plug_list); void blk_finish_plug(struct blk_plug *plug) { -- cgit v1.2.3 From 5f45c69589b7d2953584e6cd0b31e35dbe960ad0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 19 Apr 2011 09:10:35 +0200 Subject: cfq-iosched: read_lock() does not always imply rcu_read_lock() For some configurations of CONFIG_PREEMPT that is not true. So get rid of __call_for_each_cic() and always uses the explicitly rcu_read_lock() protected call_for_each_cic() instead. This fixes a potential bug related to IO scheduler removal or online switching. Thanks to Paul McKenney for clarifying this. Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 46b0a1d1d925..5b52011e3a40 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2582,28 +2582,20 @@ static void cfq_put_queue(struct cfq_queue *cfqq) } /* - * Must always be called with the rcu_read_lock() held + * Call func for each cic attached to this ioc. */ static void -__call_for_each_cic(struct io_context *ioc, - void (*func)(struct io_context *, struct cfq_io_context *)) +call_for_each_cic(struct io_context *ioc, + void (*func)(struct io_context *, struct cfq_io_context *)) { struct cfq_io_context *cic; struct hlist_node *n; + rcu_read_lock(); + hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list) func(ioc, cic); -} -/* - * Call func for each cic attached to this ioc. - */ -static void -call_for_each_cic(struct io_context *ioc, - void (*func)(struct io_context *, struct cfq_io_context *)) -{ - rcu_read_lock(); - __call_for_each_cic(ioc, func); rcu_read_unlock(); } @@ -2664,7 +2656,7 @@ static void cfq_free_io_context(struct io_context *ioc) * should be ok to iterate over the known list, we will see all cic's * since no new ones are added. */ - __call_for_each_cic(ioc, cic_free_func); + call_for_each_cic(ioc, cic_free_func); } static void cfq_put_cooperator(struct cfq_queue *cfqq) -- cgit v1.2.3 From c21e6beba8835d09bb80e34961430b13e60381c5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 19 Apr 2011 13:32:46 +0200 Subject: block: get rid of QUEUE_FLAG_REENTER We are currently using this flag to check whether it's safe to call into ->request_fn(). If it is set, we punt to kblockd. But we get a lot of false positives and excessive punts to kblockd, which hurts performance. The only real abuser of this infrastructure is SCSI. So export the async queue run and convert SCSI over to use that. There's room for improvement in that SCSI need not always use the async call, but this fixes our performance issue and they can fix that up in due time. Signed-off-by: Jens Axboe --- block/blk-core.c | 11 ++--------- block/blk.h | 1 - 2 files changed, 2 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 580eee5743e5..40725b9091f1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -303,15 +303,7 @@ void __blk_run_queue(struct request_queue *q) if (unlikely(blk_queue_stopped(q))) return; - /* - * Only recurse once to avoid overrunning the stack, let the unplug - * handling reinvoke the handler shortly if we already got there. - */ - if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { - q->request_fn(q); - queue_flag_clear(QUEUE_FLAG_REENTER, q); - } else - queue_delayed_work(kblockd_workqueue, &q->delay_work, 0); + q->request_fn(q); } EXPORT_SYMBOL(__blk_run_queue); @@ -328,6 +320,7 @@ void blk_run_queue_async(struct request_queue *q) if (likely(!blk_queue_stopped(q))) queue_delayed_work(kblockd_workqueue, &q->delay_work, 0); } +EXPORT_SYMBOL(blk_run_queue_async); /** * blk_run_queue - run a single device queue diff --git a/block/blk.h b/block/blk.h index c9df8fc3c999..61263463e38e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -22,7 +22,6 @@ void blk_rq_timed_out_timer(unsigned long data); void blk_delete_timer(struct request *); void blk_add_timer(struct request *); void __generic_unplug_device(struct request_queue *); -void blk_run_queue_async(struct request_queue *q); /* * Internal atomic flags for request handling -- cgit v1.2.3 From d350e6b6e819df0a383ff34465720bfaa0f91c79 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 19 Apr 2011 13:34:14 +0200 Subject: block: remove stale kerneldoc member from __blk_run_queue() We don't pass in a 'force_kblockd' anymore, get rid of the stsale comment. Reported-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 40725b9091f1..a2e58eeb3549 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -292,7 +292,6 @@ EXPORT_SYMBOL(blk_sync_queue); /** * __blk_run_queue - run a single device queue * @q: The queue to run - * @force_kblockd: Don't run @q->request_fn directly. Use kblockd. * * Description: * See @blk_run_queue. This variant must be called with the queue lock -- cgit v1.2.3 From ed5302d3c25006a9edc7a7fbea97a30483f89ef7 Mon Sep 17 00:00:00 2001 From: Liu Yuan Date: Tue, 19 Apr 2011 13:47:58 +0200 Subject: block, blk-sysfs: Fix an err return path in blk_register_queue() We do not call blk_trace_remove_sysfs() in err return path if kobject_add() fails. This path fixes it. Cc: stable@kernel.org Signed-off-by: Liu Yuan Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 6d735122bc59..5d696adbc4dc 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -508,8 +508,10 @@ int blk_register_queue(struct gendisk *disk) return ret; ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue"); - if (ret < 0) + if (ret < 0) { + blk_trace_remove_sysfs(dev); return ret; + } kobject_uevent(&q->kobj, KOBJ_ADD); -- cgit v1.2.3 From 60735b6362f29b52b5635a2dfa9ab5ad39948345 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Tue, 19 Apr 2011 13:50:40 +0200 Subject: block: Remove the extra check in queue_requests_store In queue_requests_store, the code looks like if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { blk_set_queue_full(q, BLK_RW_SYNC); } else if (rl->count[BLK_RW_SYNC]+1 <= q->nr_requests) { blk_clear_queue_full(q, BLK_RW_SYNC); wake_up(&rl->wait[BLK_RW_SYNC]); } If we don't satify the situation of "if", we can get that rl->count[BLK_RW_SYNC} < q->nr_quests. It is the same as rl->count[BLK_RW_SYNC]+1 <= q->nr_requests. All the "else" should satisfy the "else if" check so it isn't needed actually. Signed-off-by: Tao Ma Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 5d696adbc4dc..bd236313f35d 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -66,14 +66,14 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { blk_set_queue_full(q, BLK_RW_SYNC); - } else if (rl->count[BLK_RW_SYNC]+1 <= q->nr_requests) { + } else { blk_clear_queue_full(q, BLK_RW_SYNC); wake_up(&rl->wait[BLK_RW_SYNC]); } if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { blk_set_queue_full(q, BLK_RW_ASYNC); - } else if (rl->count[BLK_RW_ASYNC]+1 <= q->nr_requests) { + } else { blk_clear_queue_full(q, BLK_RW_ASYNC); wake_up(&rl->wait[BLK_RW_ASYNC]); } -- cgit v1.2.3 From 3aa72873ffdcc2f7919743efbbefc351ec73f5cb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 21 Apr 2011 19:28:35 +0200 Subject: elevator: check for ELEVATOR_INSERT_SORT_MERGE in !elvpriv case too The sort insert is the one that goes to the IO scheduler. With the SORT_MERGE addition, we could bypass IO scheduler setup but still ask the IO scheduler to insert the request. This would cause an oops on switching IO schedulers through the sysfs interface, unless the disk just happened to be idle while it occured. Signed-off-by: Jens Axboe --- block/elevator.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 6f6abc08bb56..45ca1e34f582 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -671,7 +671,8 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where) q->boundary_rq = rq; } } else if (!(rq->cmd_flags & REQ_ELVPRIV) && - where == ELEVATOR_INSERT_SORT) + (where == ELEVATOR_INSERT_SORT || + where == ELEVATOR_INSERT_SORT_MERGE)) where = ELEVATOR_INSERT_BACK; switch (where) { -- cgit v1.2.3 From 7c88a168da8003fd4d8fb6ae103c4ecf29cb1130 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 21 Apr 2011 19:43:58 +0200 Subject: block: don't propagate unlisted DISK_EVENTs to userland DISK_EVENT_MEDIA_CHANGE is used for both userland visible event and internal event for revalidation of removeable devices. Some legacy drivers don't implement proper event detection and continuously generate events under certain circumstances. For example, ide-cd generates media changed continuously if there's no media in the drive, which can lead to infinite loop of events jumping back and forth between the driver and userland event handler. This patch updates disk event infrastructure such that it never propagates events not listed in disk->events to userland. Those events are processed the same for internal purposes but uevent generation is suppressed. This also ensures that userland only gets events which are advertised in the @events sysfs node lowering risk of confusion. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index b364bd038a18..2dd988723d73 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1588,9 +1588,13 @@ static void disk_events_workfn(struct work_struct *work) spin_unlock_irq(&ev->lock); - /* tell userland about new events */ + /* + * Tell userland about new events. Only the events listed in + * @disk->events are reported. Unlisted events are processed the + * same internally but never get reported to userland. + */ for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) - if (events & (1 << i)) + if (events & disk->events & (1 << i)) envp[nr_events++] = disk_uevents[i]; if (nr_events) -- cgit v1.2.3 From 70087dc38cc77ca8f46059564c00338777734762 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 16 May 2011 15:24:08 +0200 Subject: blk-throttle: Use task_subsys_state() to determine a task's blkio_cgroup Currentlly we first map the task to cgroup and then cgroup to blkio_cgroup. There is a more direct way to get to blkio_cgroup from task using task_subsys_state(). Use that. The real reason for the fix is that it also avoids a race in generic cgroup code. During remount/umount rebind_subsystems() is called and it can do following with and rcu protection. cgrp->subsys[i] = NULL; That means if somebody got hold of cgroup under rcu and then it tried to do cgroup->subsys[] to get to blkio_cgroup, it would get NULL which is wrong. I was running into this race condition with ltp running on a upstream derived kernel and that lead to crash. So ideally we should also fix cgroup generic code to wait for rcu grace period before setting pointer to NULL. Li Zefan is not very keen on introducing synchronize_wait() as he thinks it will slow down moun/remount/umount operations. So for the time being atleast fix the kernel crash by taking a more direct route to blkio_cgroup. One tester had reported a crash while running LTP on a derived kernel and with this fix crash is no more seen while the test has been running for over 6 days. Signed-off-by: Vivek Goyal Reviewed-by: Li Zefan Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 7 +++++++ block/blk-cgroup.h | 3 +++ block/blk-throttle.c | 9 ++++----- block/cfq-iosched.c | 11 +++++------ 4 files changed, 19 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index f0605ab2a761..471fdcc5df85 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -114,6 +114,13 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) } EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); +struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk) +{ + return container_of(task_subsys_state(tsk, blkio_subsys_id), + struct blkio_cgroup, css); +} +EXPORT_SYMBOL_GPL(task_blkio_cgroup); + static inline void blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight) { diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 10919fae2d3a..c774930cc206 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -291,6 +291,7 @@ static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {} #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) extern struct blkio_cgroup blkio_root_cgroup; extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); +extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk); extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, struct blkio_group *blkg, void *key, dev_t dev, enum blkio_policy_id plid); @@ -314,6 +315,8 @@ void blkiocg_update_io_remove_stats(struct blkio_group *blkg, struct cgroup; static inline struct blkio_cgroup * cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } +static inline struct blkio_cgroup * +task_blkio_cgroup(struct task_struct *tsk) { return NULL; } static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, struct blkio_group *blkg, void *key, dev_t dev, diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 0475a22a420d..252a81a306f7 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -160,9 +160,8 @@ static void throtl_put_tg(struct throtl_grp *tg) } static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, - struct cgroup *cgroup) + struct blkio_cgroup *blkcg) { - struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); struct throtl_grp *tg = NULL; void *key = td; struct backing_dev_info *bdi = &td->queue->backing_dev_info; @@ -229,12 +228,12 @@ done: static struct throtl_grp * throtl_get_tg(struct throtl_data *td) { - struct cgroup *cgroup; struct throtl_grp *tg = NULL; + struct blkio_cgroup *blkcg; rcu_read_lock(); - cgroup = task_cgroup(current, blkio_subsys_id); - tg = throtl_find_alloc_tg(td, cgroup); + blkcg = task_blkio_cgroup(current); + tg = throtl_find_alloc_tg(td, blkcg); if (!tg) tg = &td->root_tg; rcu_read_unlock(); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 5b52011e3a40..ab7a9e6a9b1c 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1014,10 +1014,9 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, cfqg->needs_update = true; } -static struct cfq_group * -cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) +static struct cfq_group * cfq_find_alloc_cfqg(struct cfq_data *cfqd, + struct blkio_cgroup *blkcg, int create) { - struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); struct cfq_group *cfqg = NULL; void *key = cfqd; int i, j; @@ -1079,12 +1078,12 @@ done: */ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) { - struct cgroup *cgroup; + struct blkio_cgroup *blkcg; struct cfq_group *cfqg = NULL; rcu_read_lock(); - cgroup = task_cgroup(current, blkio_subsys_id); - cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create); + blkcg = task_blkio_cgroup(current); + cfqg = cfq_find_alloc_cfqg(cfqd, blkcg, create); if (!cfqg && create) cfqg = &cfqd->root_group; rcu_read_unlock(); -- cgit v1.2.3 From 3ec717b7ca4ee1d75d77e4f6286430d8f01d1dbd Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 18 May 2011 11:22:43 +0200 Subject: block: don't delay blk_run_queue_async Let's check a scenario: 1. blk_delay_queue(q, SCSI_QUEUE_DELAY); 2. blk_run_queue_async(); the second one will became a noop, because q->delay_work already has WORK_STRUCT_PENDING_BIT set, so the delayed work will still run after SCSI_QUEUE_DELAY. But blk_run_queue_async actually hopes the delayed work runs immediately. Fix this by doing a cancel on potentially pending delayed work before queuing an immediate run of the workqueue. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index a2e58eeb3549..3fe00a14822a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -316,8 +316,10 @@ EXPORT_SYMBOL(__blk_run_queue); */ void blk_run_queue_async(struct request_queue *q) { - if (likely(!blk_queue_stopped(q))) + if (likely(!blk_queue_stopped(q))) { + __cancel_delayed_work(&q->delay_work); queue_delayed_work(kblockd_workqueue, &q->delay_work, 0); + } } EXPORT_SYMBOL(blk_run_queue_async); -- cgit v1.2.3