aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2023-01-13 12:01:24 +1100
committerStephen Rothwell <sfr@canb.auug.org.au>2023-01-13 12:01:24 +1100
commit5c2bd0d0b5c5d3ec3be75028b9df7d7132079294 (patch)
tree2752208037b97dc40f9f28224351a09efe0318f9
parentca3aeb5da7a9cbe7324dd8d7065e1977550cd648 (diff)
parent73f41663622d4689b3aa325c18fd604d4f4b8c58 (diff)
Merge branch 'for-next' of git://git.kernel.dk/linux-block.git
-rw-r--r--Documentation/ABI/stable/sysfs-block3
-rw-r--r--MAINTAINERS3
-rw-r--r--block/bfq-cgroup.c93
-rw-r--r--block/bfq-iosched.c587
-rw-r--r--block/bfq-iosched.h142
-rw-r--r--block/bfq-wf2q.c2
-rw-r--r--block/bio.c2
-rw-r--r--block/blk-core.c3
-rw-r--r--block/blk-map.c8
-rw-r--r--block/blk-settings.c9
-rw-r--r--block/blk-sysfs.c21
-rw-r--r--drivers/block/null_blk/main.c3
-rw-r--r--drivers/nvme/host/apple.c2
-rw-r--r--drivers/nvme/host/ioctl.c110
-rw-r--r--drivers/nvme/host/pci.c12
-rw-r--r--include/linux/blkdev.h4
-rw-r--r--include/linux/io_uring_types.h15
-rw-r--r--include/linux/uio.h1
-rw-r--r--include/uapi/linux/io_uring.h2
-rw-r--r--io_uring/fdinfo.c12
-rw-r--r--io_uring/io-wq.c7
-rw-r--r--io_uring/io_uring.c290
-rw-r--r--io_uring/io_uring.h53
-rw-r--r--io_uring/msg_ring.c23
-rw-r--r--io_uring/net.c17
-rw-r--r--io_uring/poll.c48
-rw-r--r--io_uring/rw.c11
-rw-r--r--lib/iov_iter.c15
28 files changed, 986 insertions, 512 deletions
diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
index cd14ecb3c9a5..ac1e519272aa 100644
--- a/Documentation/ABI/stable/sysfs-block
+++ b/Documentation/ABI/stable/sysfs-block
@@ -432,7 +432,8 @@ Contact: linux-block@vger.kernel.org
Description:
[RW] This is the maximum number of kilobytes that the block
layer will allow for a filesystem request. Must be smaller than
- or equal to the maximum size allowed by the hardware.
+ or equal to the maximum size allowed by the hardware. Write 0
+ to use default kernel settings.
What: /sys/block/<disk>/queue/max_segment_size
diff --git a/MAINTAINERS b/MAINTAINERS
index 35fad4249c25..861766319ede 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14772,7 +14772,8 @@ T: git://git.infradead.org/nvme.git
F: Documentation/nvme/
F: drivers/nvme/host/
F: drivers/nvme/common/
-F: include/linux/nvme*
+F: include/linux/nvme.h
+F: include/linux/nvme-*.h
F: include/uapi/linux/nvme_ioctl.h
NVM EXPRESS FABRICS AUTHENTICATION
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 1b2829e99dad..a6e8da5f5cfd 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -708,12 +708,52 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfq_activate_bfqq(bfqd, bfqq);
}
- if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
+ if (!bfqd->in_service_queue && !bfqd->tot_rq_in_driver)
bfq_schedule_dispatch(bfqd);
/* release extra ref taken above, bfqq may happen to be freed now */
bfq_put_queue(bfqq);
}
+static void bfq_sync_bfqq_move(struct bfq_data *bfqd,
+ struct bfq_queue *sync_bfqq,
+ struct bfq_io_cq *bic,
+ struct bfq_group *bfqg,
+ unsigned int act_idx)
+{
+ struct bfq_queue *bfqq;
+
+ if (!sync_bfqq->new_bfqq && !bfq_bfqq_coop(sync_bfqq)) {
+ /* We are the only user of this bfqq, just move it */
+ if (sync_bfqq->entity.sched_data != &bfqg->sched_data)
+ bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
+ return;
+ }
+
+ /*
+ * The queue was merged to a different queue. Check
+ * that the merge chain still belongs to the same
+ * cgroup.
+ */
+ for (bfqq = sync_bfqq; bfqq; bfqq = bfqq->new_bfqq)
+ if (bfqq->entity.sched_data != &bfqg->sched_data)
+ break;
+ if (bfqq) {
+ /*
+ * Some queue changed cgroup so the merge is not valid
+ * anymore. We cannot easily just cancel the merge (by
+ * clearing new_bfqq) as there may be other processes
+ * using this queue and holding refs to all queues
+ * below sync_bfqq->new_bfqq. Similarly if the merge
+ * already happened, we need to detach from bfqq now
+ * so that we cannot merge bio to a request from the
+ * old cgroup.
+ */
+ bfq_put_cooperator(sync_bfqq);
+ bfq_release_process_ref(bfqd, sync_bfqq);
+ bic_set_bfqq(bic, NULL, true, act_idx);
+ }
+}
+
/**
* __bfq_bic_change_cgroup - move @bic to @bfqg.
* @bfqd: the queue descriptor.
@@ -728,53 +768,20 @@ static void __bfq_bic_change_cgroup(struct bfq_data *bfqd,
struct bfq_io_cq *bic,
struct bfq_group *bfqg)
{
- struct bfq_queue *async_bfqq = bic_to_bfqq(bic, false);
- struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, true);
- struct bfq_entity *entity;
+ unsigned int act_idx;
- if (async_bfqq) {
- entity = &async_bfqq->entity;
+ for (act_idx = 0; act_idx < bfqd->num_actuators; act_idx++) {
+ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, false, act_idx);
+ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, true, act_idx);
- if (entity->sched_data != &bfqg->sched_data) {
- bic_set_bfqq(bic, NULL, false);
+ if (async_bfqq &&
+ async_bfqq->entity.sched_data != &bfqg->sched_data) {
+ bic_set_bfqq(bic, NULL, false, act_idx);
bfq_release_process_ref(bfqd, async_bfqq);
}
- }
- if (sync_bfqq) {
- if (!sync_bfqq->new_bfqq && !bfq_bfqq_coop(sync_bfqq)) {
- /* We are the only user of this bfqq, just move it */
- if (sync_bfqq->entity.sched_data != &bfqg->sched_data)
- bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
- } else {
- struct bfq_queue *bfqq;
-
- /*
- * The queue was merged to a different queue. Check
- * that the merge chain still belongs to the same
- * cgroup.
- */
- for (bfqq = sync_bfqq; bfqq; bfqq = bfqq->new_bfqq)
- if (bfqq->entity.sched_data !=
- &bfqg->sched_data)
- break;
- if (bfqq) {
- /*
- * Some queue changed cgroup so the merge is
- * not valid anymore. We cannot easily just
- * cancel the merge (by clearing new_bfqq) as
- * there may be other processes using this
- * queue and holding refs to all queues below
- * sync_bfqq->new_bfqq. Similarly if the merge
- * already happened, we need to detach from
- * bfqq now so that we cannot merge bio to a
- * request from the old cgroup.
- */
- bfq_put_cooperator(sync_bfqq);
- bfq_release_process_ref(bfqd, sync_bfqq);
- bic_set_bfqq(bic, NULL, true);
- }
- }
+ if (sync_bfqq)
+ bfq_sync_bfqq_move(bfqd, sync_bfqq, bic, bfqg, act_idx);
}
}
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index ccf2204477a5..815b884d6c5a 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -377,20 +377,23 @@ static const unsigned long bfq_late_stable_merging = 600;
#define RQ_BIC(rq) ((struct bfq_io_cq *)((rq)->elv.priv[0]))
#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
-struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
+struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync,
+ unsigned int actuator_idx)
{
- return bic->bfqq[is_sync];
+ if (is_sync)
+ return bic->bfqq[1][actuator_idx];
+
+ return bic->bfqq[0][actuator_idx];
}
static void bfq_put_stable_ref(struct bfq_queue *bfqq);
-void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync)
+void bic_set_bfqq(struct bfq_io_cq *bic,
+ struct bfq_queue *bfqq,
+ bool is_sync,
+ unsigned int actuator_idx)
{
- struct bfq_queue *old_bfqq = bic->bfqq[is_sync];
-
- /* Clear bic pointer if bfqq is detached from this bic */
- if (old_bfqq && old_bfqq->bic == bic)
- old_bfqq->bic = NULL;
+ struct bfq_queue *old_bfqq = bic->bfqq[is_sync][actuator_idx];
/*
* If bfqq != NULL, then a non-stable queue merge between
@@ -405,9 +408,18 @@ void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync)
* we cancel the stable merge if
* bic->stable_merge_bfqq == bfqq.
*/
- bic->bfqq[is_sync] = bfqq;
+ struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[actuator_idx];
- if (bfqq && bic->stable_merge_bfqq == bfqq) {
+ /* Clear bic pointer if bfqq is detached from this bic */
+ if (old_bfqq && old_bfqq->bic == bic)
+ old_bfqq->bic = NULL;
+
+ if (is_sync)
+ bic->bfqq[1][actuator_idx] = bfqq;
+ else
+ bic->bfqq[0][actuator_idx] = bfqq;
+
+ if (bfqq && bfqq_data->stable_merge_bfqq == bfqq) {
/*
* Actually, these same instructions are executed also
* in bfq_setup_cooperator, in case of abort or actual
@@ -416,9 +428,9 @@ void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync)
* did so, we would nest even more complexity in this
* function.
*/
- bfq_put_stable_ref(bic->stable_merge_bfqq);
+ bfq_put_stable_ref(bfqq_data->stable_merge_bfqq);
- bic->stable_merge_bfqq = NULL;
+ bfqq_data->stable_merge_bfqq = NULL;
}
}
@@ -678,9 +690,9 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
{
struct bfq_data *bfqd = data->q->elevator->elevator_data;
struct bfq_io_cq *bic = bfq_bic_lookup(data->q);
- struct bfq_queue *bfqq = bic ? bic_to_bfqq(bic, op_is_sync(opf)) : NULL;
int depth;
unsigned limit = data->q->nr_requests;
+ unsigned int act_idx;
/* Sync reads have full depth available */
if (op_is_sync(opf) && !op_is_write(opf)) {
@@ -690,14 +702,21 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
limit = (limit * depth) >> bfqd->full_depth_shift;
}
- /*
- * Does queue (or any parent entity) exceed number of requests that
- * should be available to it? Heavily limit depth so that it cannot
- * consume more available requests and thus starve other entities.
- */
- if (bfqq && bfqq_request_over_limit(bfqq, limit))
- depth = 1;
+ for (act_idx = 0; bic && act_idx < bfqd->num_actuators; act_idx++) {
+ struct bfq_queue *bfqq =
+ bic_to_bfqq(bic, op_is_sync(opf), act_idx);
+ /*
+ * Does queue (or any parent entity) exceed number of
+ * requests that should be available to it? Heavily
+ * limit depth so that it cannot consume more
+ * available requests and thus starve other entities.
+ */
+ if (bfqq && bfqq_request_over_limit(bfqq, limit)) {
+ depth = 1;
+ break;
+ }
+ }
bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u",
__func__, bfqd->wr_busy_queues, op_is_sync(opf), depth);
if (depth)
@@ -1118,36 +1137,39 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
{
unsigned int old_wr_coeff = 1;
bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq);
+ unsigned int a_idx = bfqq->actuator_idx;
+ struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx];
- if (bic->saved_has_short_ttime)
+ if (bfqq_data->saved_has_short_ttime)
bfq_mark_bfqq_has_short_ttime(bfqq);
else
bfq_clear_bfqq_has_short_ttime(bfqq);
- if (bic->saved_IO_bound)
+ if (bfqq_data->saved_IO_bound)
bfq_mark_bfqq_IO_bound(bfqq);
else
bfq_clear_bfqq_IO_bound(bfqq);
- bfqq->last_serv_time_ns = bic->saved_last_serv_time_ns;
- bfqq->inject_limit = bic->saved_inject_limit;
- bfqq->decrease_time_jif = bic->saved_decrease_time_jif;
+ bfqq->last_serv_time_ns = bfqq_data->saved_last_serv_time_ns;
+ bfqq->inject_limit = bfqq_data->saved_inject_limit;
+ bfqq->decrease_time_jif = bfqq_data->saved_decrease_time_jif;
- bfqq->entity.new_weight = bic->saved_weight;
- bfqq->ttime = bic->saved_ttime;
- bfqq->io_start_time = bic->saved_io_start_time;
- bfqq->tot_idle_time = bic->saved_tot_idle_time;
+ bfqq->entity.new_weight = bfqq_data->saved_weight;
+ bfqq->ttime = bfqq_data->saved_ttime;
+ bfqq->io_start_time = bfqq_data->saved_io_start_time;
+ bfqq->tot_idle_time = bfqq_data->saved_tot_idle_time;
/*
* Restore weight coefficient only if low_latency is on
*/
if (bfqd->low_latency) {
old_wr_coeff = bfqq->wr_coeff;
- bfqq->wr_coeff = bic->saved_wr_coeff;
+ bfqq->wr_coeff = bfqq_data->saved_wr_coeff;
}
- bfqq->service_from_wr = bic->saved_service_from_wr;
- bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
- bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish;
- bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time;
+ bfqq->service_from_wr = bfqq_data->saved_service_from_wr;
+ bfqq->wr_start_at_switch_to_srt =
+ bfqq_data->saved_wr_start_at_switch_to_srt;
+ bfqq->last_wr_start_finish = bfqq_data->saved_last_wr_start_finish;
+ bfqq->wr_cur_max_time = bfqq_data->saved_wr_cur_max_time;
if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) ||
time_is_before_jiffies(bfqq->last_wr_start_finish +
@@ -1766,6 +1788,33 @@ static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq,
return bfqq_weight > in_serv_weight;
}
+/*
+ * Get the index of the actuator that will serve bio.
+ */
+static unsigned int bfq_actuator_index(struct bfq_data *bfqd, struct bio *bio)
+{
+ unsigned int i;
+ sector_t end;
+
+ /* no search needed if one or zero ranges present */
+ if (bfqd->num_actuators == 1)
+ return 0;
+
+ /* bio_end_sector(bio) gives the sector after the last one */
+ end = bio_end_sector(bio) - 1;
+
+ for (i = 0; i < bfqd->num_actuators; i++) {
+ if (end >= bfqd->sector[i] &&
+ end < bfqd->sector[i] + bfqd->nr_sectors[i])
+ return i;
+ }
+
+ WARN_ONCE(true,
+ "bfq_actuator_index: bio sector out of ranges: end=%llu\n",
+ end);
+ return 0;
+}
+
static bool bfq_better_to_idle(struct bfq_queue *bfqq);
static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
@@ -1785,7 +1834,9 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
arrived_in_time = ktime_get_ns() <=
bfqq->ttime.last_end_request +
bfqd->bfq_slice_idle * 3;
-
+ unsigned int act_idx = bfq_actuator_index(bfqd, rq->bio);
+ bool bfqq_non_merged_or_stably_merged =
+ bfqq->bic || RQ_BIC(rq)->bfqq_data[act_idx].stably_merged;
/*
* bfqq deserves to be weight-raised if:
@@ -1819,9 +1870,8 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
*/
wr_or_deserves_wr = bfqd->low_latency &&
(bfqq->wr_coeff > 1 ||
- (bfq_bfqq_sync(bfqq) &&
- (bfqq->bic || RQ_BIC(rq)->stably_merged) &&
- (*interactive || soft_rt)));
+ (bfq_bfqq_sync(bfqq) && bfqq_non_merged_or_stably_merged &&
+ (*interactive || soft_rt)));
/*
* Using the last flag, update budget and check whether bfqq
@@ -2098,7 +2148,7 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
* We reset waker detection logic also if too much time has passed
* since the first detection. If wakeups are rare, pointless idling
* doesn't hurt throughput that much. The condition below makes sure
- * we do not uselessly idle blocking waker in more than 1/64 cases.
+ * we do not uselessly idle blocking waker in more than 1/64 cases.
*/
if (bfqd->last_completed_rq_bfqq !=
bfqq->tentative_waker_bfqq ||
@@ -2209,9 +2259,9 @@ static void bfq_add_request(struct request *rq)
* elapsed.
*/
if (bfqq == bfqd->in_service_queue &&
- (bfqd->rq_in_driver == 0 ||
+ (bfqd->tot_rq_in_driver == 0 ||
(bfqq->last_serv_time_ns > 0 &&
- bfqd->rqs_injected && bfqd->rq_in_driver > 0)) &&
+ bfqd->rqs_injected && bfqd->tot_rq_in_driver > 0)) &&
time_is_before_eq_jiffies(bfqq->decrease_time_jif +
msecs_to_jiffies(10))) {
bfqd->last_empty_occupied_ns = ktime_get_ns();
@@ -2235,7 +2285,7 @@ static void bfq_add_request(struct request *rq)
* will be set in case injection is performed
* on bfqq before rq is completed).
*/
- if (bfqd->rq_in_driver == 0)
+ if (bfqd->tot_rq_in_driver == 0)
bfqd->rqs_injected = false;
}
}
@@ -2418,7 +2468,8 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio,
*/
bfq_bic_update_cgroup(bic, bio);
- bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf));
+ bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf),
+ bfq_actuator_index(bfqd, bio));
} else {
bfqd->bio_bfqq = NULL;
}
@@ -2584,24 +2635,29 @@ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
void bfq_end_wr_async_queues(struct bfq_data *bfqd,
struct bfq_group *bfqg)
{
- int i, j;
+ int i, j, k;
- for (i = 0; i < 2; i++)
- for (j = 0; j < IOPRIO_NR_LEVELS; j++)
- if (bfqg->async_bfqq[i][j])
- bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]);
- if (bfqg->async_idle_bfqq)
- bfq_bfqq_end_wr(bfqg->async_idle_bfqq);
+ for (k = 0; k < bfqd->num_actuators; k++) {
+ for (i = 0; i < 2; i++)
+ for (j = 0; j < IOPRIO_NR_LEVELS; j++)
+ if (bfqg->async_bfqq[i][j][k])
+ bfq_bfqq_end_wr(bfqg->async_bfqq[i][j][k]);
+ if (bfqg->async_idle_bfqq[k])
+ bfq_bfqq_end_wr(bfqg->async_idle_bfqq[k]);
+ }
}
static void bfq_end_wr(struct bfq_data *bfqd)
{
struct bfq_queue *bfqq;
+ int i;
spin_lock_irq(&bfqd->lock);
- list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
- bfq_bfqq_end_wr(bfqq);
+ for (i = 0; i < bfqd->num_actuators; i++) {
+ list_for_each_entry(bfqq, &bfqd->active_list[i], bfqq_list)
+ bfq_bfqq_end_wr(bfqq);
+ }
list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
bfq_bfqq_end_wr(bfqq);
bfq_end_wr_async(bfqd);
@@ -2794,6 +2850,35 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
struct bfq_queue *bfqq);
+static struct bfq_queue *
+bfq_setup_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ struct bfq_queue *stable_merge_bfqq,
+ struct bfq_iocq_bfqq_data *bfqq_data)
+{
+ int proc_ref = min(bfqq_process_refs(bfqq),
+ bfqq_process_refs(stable_merge_bfqq));
+ struct bfq_queue *new_bfqq;
+
+ if (idling_boosts_thr_without_issues(bfqd, bfqq) ||
+ proc_ref == 0)
+ return NULL;
+
+ /* next function will take at least one ref */
+ new_bfqq = bfq_setup_merge(bfqq, stable_merge_bfqq);
+
+ if (new_bfqq) {
+ bfqq_data->stably_merged = true;
+ if (new_bfqq->bic) {
+ unsigned int new_a_idx = new_bfqq->actuator_idx;
+ struct bfq_iocq_bfqq_data *new_bfqq_data =
+ &new_bfqq->bic->bfqq_data[new_a_idx];
+
+ new_bfqq_data->stably_merged = true;
+ }
+ }
+ return new_bfqq;
+}
+
/*
* Attempt to schedule a merge of bfqq with the currently in-service
* queue or with a close queue among the scheduled queues. Return
@@ -2819,6 +2904,8 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
void *io_struct, bool request, struct bfq_io_cq *bic)
{
struct bfq_queue *in_service_bfqq, *new_bfqq;
+ unsigned int a_idx = bfqq->actuator_idx;
+ struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx];
/* if a merge has already been setup, then proceed with that first */
if (bfqq->new_bfqq)
@@ -2840,37 +2927,23 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
* stable merging) also if bic is associated with a
* sync queue, but this bfqq is async
*/
- if (bfq_bfqq_sync(bfqq) && bic->stable_merge_bfqq &&
+ if (bfq_bfqq_sync(bfqq) && bfqq_data->stable_merge_bfqq &&
!bfq_bfqq_just_created(bfqq) &&
time_is_before_jiffies(bfqq->split_time +
msecs_to_jiffies(bfq_late_stable_merging)) &&
time_is_before_jiffies(bfqq->creation_time +
msecs_to_jiffies(bfq_late_stable_merging))) {
struct bfq_queue *stable_merge_bfqq =
- bic->stable_merge_bfqq;
- int proc_ref = min(bfqq_process_refs(bfqq),
- bfqq_process_refs(stable_merge_bfqq));
+ bfqq_data->stable_merge_bfqq;
/* deschedule stable merge, because done or aborted here */
bfq_put_stable_ref(stable_merge_bfqq);
- bic->stable_merge_bfqq = NULL;
-
- if (!idling_boosts_thr_without_issues(bfqd, bfqq) &&
- proc_ref > 0) {
- /* next function will take at least one ref */
- struct bfq_queue *new_bfqq =
- bfq_setup_merge(bfqq, stable_merge_bfqq);
-
- if (new_bfqq) {
- bic->stably_merged = true;
- if (new_bfqq->bic)
- new_bfqq->bic->stably_merged =
- true;
- }
- return new_bfqq;
- } else
- return NULL;
+ bfqq_data->stable_merge_bfqq = NULL;
+
+ return bfq_setup_stable_merge(bfqd, bfqq,
+ stable_merge_bfqq,
+ bfqq_data);
}
}
@@ -2965,6 +3038,8 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
{
struct bfq_io_cq *bic = bfqq->bic;
+ unsigned int a_idx = bfqq->actuator_idx;
+ struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx];
/*
* If !bfqq->bic, the queue is already shared or its requests
@@ -2974,18 +3049,21 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
if (!bic)
return;
- bic->saved_last_serv_time_ns = bfqq->last_serv_time_ns;
- bic->saved_inject_limit = bfqq->inject_limit;
- bic->saved_decrease_time_jif = bfqq->decrease_time_jif;
-
- bic->saved_weight = bfqq->entity.orig_weight;
- bic->saved_ttime = bfqq->ttime;
- bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq);
- bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
- bic->saved_io_start_time = bfqq->io_start_time;
- bic->saved_tot_idle_time = bfqq->tot_idle_time;
- bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
- bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
+ bfqq_data->saved_last_serv_time_ns = bfqq->last_serv_time_ns;
+ bfqq_data->saved_inject_limit = bfqq->inject_limit;
+ bfqq_data->saved_decrease_time_jif = bfqq->decrease_time_jif;
+
+ bfqq_data->saved_weight = bfqq->entity.orig_weight;
+ bfqq_data->saved_ttime = bfqq->ttime;
+ bfqq_data->saved_has_short_ttime =
+ bfq_bfqq_has_short_ttime(bfqq);
+ bfqq_data->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
+ bfqq_data->saved_io_start_time = bfqq->io_start_time;
+ bfqq_data->saved_tot_idle_time = bfqq->tot_idle_time;
+ bfqq_data->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
+ bfqq_data->was_in_burst_list =
+ !hlist_unhashed(&bfqq->burst_list_node);
+
if (unlikely(bfq_bfqq_just_created(bfqq) &&
!bfq_bfqq_in_large_burst(bfqq) &&
bfqq->bfqd->low_latency)) {
@@ -2998,17 +3076,21 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
* to bfqq, so that to avoid that bfqq unjustly fails
* to enjoy weight raising if split soon.
*/
- bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff;
- bic->saved_wr_start_at_switch_to_srt = bfq_smallest_from_now();
- bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd);
- bic->saved_last_wr_start_finish = jiffies;
+ bfqq_data->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff;
+ bfqq_data->saved_wr_start_at_switch_to_srt =
+ bfq_smallest_from_now();
+ bfqq_data->saved_wr_cur_max_time =
+ bfq_wr_duration(bfqq->bfqd);
+ bfqq_data->saved_last_wr_start_finish = jiffies;
} else {
- bic->saved_wr_coeff = bfqq->wr_coeff;
- bic->saved_wr_start_at_switch_to_srt =
+ bfqq_data->saved_wr_coeff = bfqq->wr_coeff;
+ bfqq_data->saved_wr_start_at_switch_to_srt =
bfqq->wr_start_at_switch_to_srt;
- bic->saved_service_from_wr = bfqq->service_from_wr;
- bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
- bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
+ bfqq_data->saved_service_from_wr =
+ bfqq->service_from_wr;
+ bfqq_data->saved_last_wr_start_finish =
+ bfqq->last_wr_start_finish;
+ bfqq_data->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
}
}
@@ -3114,7 +3196,7 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
/*
* Merge queues (that is, let bic redirect its requests to new_bfqq)
*/
- bic_set_bfqq(bic, new_bfqq, true);
+ bic_set_bfqq(bic, new_bfqq, true, bfqq->actuator_idx);
bfq_mark_bfqq_coop(new_bfqq);
/*
* new_bfqq now belongs to at least two bics (it is a shared queue):
@@ -3532,13 +3614,13 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
* - start a new observation interval with this dispatch
*/
if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC &&
- bfqd->rq_in_driver == 0)
+ bfqd->tot_rq_in_driver == 0)
goto update_rate_and_reset;
/* Update sampling information */
bfqd->peak_rate_samples++;
- if ((bfqd->rq_in_driver > 0 ||
+ if ((bfqd->tot_rq_in_driver > 0 ||
now_ns - bfqd->last_completion < BFQ_MIN_TT)
&& !BFQ_RQ_SEEKY(bfqd, bfqd->last_position, rq))
bfqd->sequential_samples++;
@@ -3803,10 +3885,8 @@ static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd,
return false;
return (bfqq->wr_coeff > 1 &&
- (bfqd->wr_busy_queues <
- tot_busy_queues ||
- bfqd->rq_in_driver >=
- bfqq->dispatched + 4)) ||
+ (bfqd->wr_busy_queues < tot_busy_queues ||
+ bfqd->tot_rq_in_driver >= bfqq->dispatched + 4)) ||
bfq_asymmetric_scenario(bfqd, bfqq) ||
tot_busy_queues == 1;
}
@@ -4577,6 +4657,8 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd)
{
struct bfq_queue *bfqq, *in_serv_bfqq = bfqd->in_service_queue;
unsigned int limit = in_serv_bfqq->inject_limit;
+ int i;
+
/*
* If
* - bfqq is not weight-raised and therefore does not carry
@@ -4608,7 +4690,7 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd)
)
limit = 1;
- if (bfqd->rq_in_driver >= limit)
+ if (bfqd->tot_rq_in_driver >= limit)
return NULL;
/*
@@ -4623,11 +4705,12 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd)
* (and re-added only if it gets new requests, but then it
* is assigned again enough budget for its new backlog).
*/
- list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
- if (!RB_EMPTY_ROOT(&bfqq->sort_list) &&
- (in_serv_always_inject || bfqq->wr_coeff > 1) &&
- bfq_serv_to_charge(bfqq->next_rq, bfqq) <=
- bfq_bfqq_budget_left(bfqq)) {
+ for (i = 0; i < bfqd->num_actuators; i++) {
+ list_for_each_entry(bfqq, &bfqd->active_list[i], bfqq_list)
+ if (!RB_EMPTY_ROOT(&bfqq->sort_list) &&
+ (in_serv_always_inject || bfqq->wr_coeff > 1) &&
+ bfq_serv_to_charge(bfqq->next_rq, bfqq) <=
+ bfq_bfqq_budget_left(bfqq)) {
/*
* Allow for only one large in-flight request
* on non-rotational devices, for the
@@ -4652,22 +4735,77 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd)
else
limit = in_serv_bfqq->inject_limit;
- if (bfqd->rq_in_driver < limit) {
+ if (bfqd->tot_rq_in_driver < limit) {
bfqd->rqs_injected = true;
return bfqq;
}
}
+ }
+
+ return NULL;
+}
+
+static struct bfq_queue *
+bfq_find_active_bfqq_for_actuator(struct bfq_data *bfqd, int idx)
+{
+ struct bfq_queue *bfqq;
+
+ if (bfqd->in_service_queue &&
+ bfqd->in_service_queue->actuator_idx == idx)
+ return bfqd->in_service_queue;
+
+ list_for_each_entry(bfqq, &bfqd->active_list[idx], bfqq_list) {
+ if (!RB_EMPTY_ROOT(&bfqq->sort_list) &&
+ bfq_serv_to_charge(bfqq->next_rq, bfqq) <=
+ bfq_bfqq_budget_left(bfqq)) {
+ return bfqq;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * Perform a linear scan of each actuator, until an actuator is found
+ * for which the following three conditions hold: the load of the
+ * actuator is below the threshold (see comments on
+ * actuator_load_threshold for details) and lower than that of the
+ * next actuator (comments on this extra condition below), and there
+ * is a queue that contains I/O for that actuator. On success, return
+ * that queue.
+ *
+ * Performing a plain linear scan entails a prioritization among
+ * actuators. The extra condition above breaks this prioritization and
+ * tends to distribute injection uniformly across actuators.
+ */
+static struct bfq_queue *
+bfq_find_bfqq_for_underused_actuator(struct bfq_data *bfqd)
+{
+ int i;
+
+ for (i = 0 ; i < bfqd->num_actuators; i++) {
+ if (bfqd->rq_in_driver[i] < bfqd->actuator_load_threshold &&
+ (i == bfqd->num_actuators - 1 ||
+ bfqd->rq_in_driver[i] < bfqd->rq_in_driver[i+1])) {
+ struct bfq_queue *bfqq =
+ bfq_find_active_bfqq_for_actuator(bfqd, i);
+
+ if (bfqq)
+ return bfqq;
+ }
+ }
return NULL;
}
+
/*
* Select a queue for service. If we have a current queue in service,
* check whether to continue servicing it, or retrieve and set a new one.
*/
static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
{
- struct bfq_queue *bfqq;
+ struct bfq_queue *bfqq, *inject_bfqq;
struct request *next_rq;
enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT;
@@ -4690,6 +4828,15 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
check_queue:
/*
+ * If some actuator is underutilized, but the in-service
+ * queue does not contain I/O for that actuator, then try to
+ * inject I/O for that actuator.
+ */
+ inject_bfqq = bfq_find_bfqq_for_underused_actuator(bfqd);
+ if (inject_bfqq && inject_bfqq != bfqq)
+ return inject_bfqq;
+
+ /*
* This loop is rarely executed more than once. Even when it
* happens, it is much more convenient to re-execute this loop
* than to return NULL and trigger a new dispatch to get a
@@ -4748,11 +4895,8 @@ check_queue:
*/
if (bfq_bfqq_wait_request(bfqq) ||
(bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) {
- struct bfq_queue *async_bfqq =
- bfqq->bic && bfqq->bic->bfqq[0] &&
- bfq_bfqq_busy(bfqq->bic->bfqq[0]) &&
- bfqq->bic->bfqq[0]->next_rq ?
- bfqq->bic->bfqq[0] : NULL;
+ unsigned int act_idx = bfqq->actuator_idx;
+ struct bfq_queue *async_bfqq = NULL;
struct bfq_queue *blocked_bfqq =
!hlist_empty(&bfqq->woken_list) ?
container_of(bfqq->woken_list.first,
@@ -4760,6 +4904,10 @@ check_queue:
woken_list_node)
: NULL;
+ if (bfqq->bic && bfqq->bic->bfqq[0][act_idx] &&
+ bfq_bfqq_busy(bfqq->bic->bfqq[0][act_idx]) &&
+ bfqq->bic->bfqq[0][act_idx]->next_rq)
+ async_bfqq = bfqq->bic->bfqq[0][act_idx];
/*
* The next four mutually-exclusive ifs decide
* whether to try injection, and choose the queue to
@@ -4844,7 +4992,7 @@ check_queue:
icq_to_bic(async_bfqq->next_rq->elv.icq) == bfqq->bic &&
bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <=
bfq_bfqq_budget_left(async_bfqq))
- bfqq = bfqq->bic->bfqq[0];
+ bfqq = bfqq->bic->bfqq[0][act_idx];
else if (bfqq->waker_bfqq &&
bfq_bfqq_busy(bfqq->waker_bfqq) &&
bfqq->waker_bfqq->next_rq &&
@@ -5043,11 +5191,11 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
/*
* We exploit the bfq_finish_requeue_request hook to
- * decrement rq_in_driver, but
+ * decrement tot_rq_in_driver, but
* bfq_finish_requeue_request will not be invoked on
* this request. So, to avoid unbalance, just start
- * this request, without incrementing rq_in_driver. As
- * a negative consequence, rq_in_driver is deceptively
+ * this request, without incrementing tot_rq_in_driver. As
+ * a negative consequence, tot_rq_in_driver is deceptively
* lower than it should be while this request is in
* service. This may cause bfq_schedule_dispatch to be
* invoked uselessly.
@@ -5056,7 +5204,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
* bfq_finish_requeue_request hook, if defined, is
* probably invoked also on this request. So, by
* exploiting this hook, we could 1) increment
- * rq_in_driver here, and 2) decrement it in
+ * tot_rq_in_driver here, and 2) decrement it in
* bfq_finish_requeue_request. Such a solution would
* let the value of the counter be always accurate,
* but it would entail using an extra interface
@@ -5085,7 +5233,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
* Of course, serving one request at a time may cause loss of
* throughput.
*/
- if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
+ if (bfqd->strict_guarantees && bfqd->tot_rq_in_driver > 0)
goto exit;
bfqq = bfq_select_queue(bfqd);
@@ -5096,7 +5244,8 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
if (rq) {
inc_in_driver_start_rq:
- bfqd->rq_in_driver++;
+ bfqd->rq_in_driver[bfqq->actuator_idx]++;
+ bfqd->tot_rq_in_driver++;
start_rq:
rq->rq_flags |= RQF_STARTED;
}
@@ -5305,48 +5454,55 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfq_release_process_ref(bfqd, bfqq);
}
-static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync)
+static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync,
+ unsigned int actuator_idx)
{
- struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);
+ struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync, actuator_idx);
struct bfq_data *bfqd;
if (bfqq)
bfqd = bfqq->bfqd; /* NULL if scheduler already exited */
if (bfqq && bfqd) {
- unsigned long flags;
-
- spin_lock_irqsave(&bfqd->lock, flags);
- bic_set_bfqq(bic, NULL, is_sync);
+ bic_set_bfqq(bic, NULL, is_sync, actuator_idx);
bfq_exit_bfqq(bfqd, bfqq);
- spin_unlock_irqrestore(&bfqd->lock, flags);
}
}
static void bfq_exit_icq(struct io_cq *icq)
{
struct bfq_io_cq *bic = icq_to_bic(icq);
+ struct bfq_data *bfqd = bic_to_bfqd(bic);
+ unsigned long flags;
+ unsigned int act_idx;
+ /*
+ * If bfqd and thus bfqd->num_actuators is not available any
+ * longer, then cycle over all possible per-actuator bfqqs in
+ * next loop. We rely on bic being zeroed on creation, and
+ * therefore on its unused per-actuator fields being NULL.
+ */
+ unsigned int num_actuators = BFQ_MAX_ACTUATORS;
+ struct bfq_iocq_bfqq_data *bfqq_data = bic->bfqq_data;
- if (bic->stable_merge_bfqq) {
- struct bfq_data *bfqd = bic->stable_merge_bfqq->bfqd;
+ /*
+ * bfqd is NULL if scheduler already exited, and in that case
+ * this is the last time these queues are accessed.
+ */
+ if (bfqd) {
+ spin_lock_irqsave(&bfqd->lock, flags);
+ num_actuators = bfqd->num_actuators;
+ }
- /*
- * bfqd is NULL if scheduler already exited, and in
- * that case this is the last time bfqq is accessed.
- */
- if (bfqd) {
- unsigned long flags;
+ for (act_idx = 0; act_idx < num_actuators; act_idx++) {
+ if (bfqq_data[act_idx].stable_merge_bfqq)
+ bfq_put_stable_ref(bfqq_data[act_idx].stable_merge_bfqq);
- spin_lock_irqsave(&bfqd->lock, flags);
- bfq_put_stable_ref(bic->stable_merge_bfqq);
- spin_unlock_irqrestore(&bfqd->lock, flags);
- } else {
- bfq_put_stable_ref(bic->stable_merge_bfqq);
- }
+ bfq_exit_icq_bfqq(bic, true, act_idx);
+ bfq_exit_icq_bfqq(bic, false, act_idx);
}
- bfq_exit_icq_bfqq(bic, true);
- bfq_exit_icq_bfqq(bic, false);
+ if (bfqd)
+ spin_unlock_irqrestore(&bfqd->lock, flags);
}
/*
@@ -5423,23 +5579,25 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
bic->ioprio = ioprio;
- bfqq = bic_to_bfqq(bic, false);
+ bfqq = bic_to_bfqq(bic, false, bfq_actuator_index(bfqd, bio));
if (bfqq) {
bfq_release_process_ref(bfqd, bfqq);
bfqq = bfq_get_queue(bfqd, bio, false, bic, true);
- bic_set_bfqq(bic, bfqq, false);
+ bic_set_bfqq(bic, bfqq, false, bfq_actuator_index(bfqd, bio));
}
- bfqq = bic_to_bfqq(bic, true);
+ bfqq = bic_to_bfqq(bic, true, bfq_actuator_index(bfqd, bio));
if (bfqq)
bfq_set_next_ioprio_data(bfqq, bic);
}
static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
- struct bfq_io_cq *bic, pid_t pid, int is_sync)
+ struct bfq_io_cq *bic, pid_t pid, int is_sync,
+ unsigned int act_idx)
{
u64 now_ns = ktime_get_ns();
+ bfqq->actuator_idx = act_idx;
RB_CLEAR_NODE(&bfqq->entity.rb_node);
INIT_LIST_HEAD(&bfqq->fifo);
INIT_HLIST_NODE(&bfqq->burst_list_node);
@@ -5503,18 +5661,18 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
struct bfq_group *bfqg,
- int ioprio_class, int ioprio)
+ int ioprio_class, int ioprio, int act_idx)
{
switch (ioprio_class) {
case IOPRIO_CLASS_RT:
- return &bfqg->async_bfqq[0][ioprio];
+ return &bfqg->async_bfqq[0][ioprio][act_idx];
case IOPRIO_CLASS_NONE:
ioprio = IOPRIO_BE_NORM;
fallthrough;
case IOPRIO_CLASS_BE:
- return &bfqg->async_bfqq[1][ioprio];
+ return &bfqg->async_bfqq[1][ioprio][act_idx];
case IOPRIO_CLASS_IDLE:
- return &bfqg->async_idle_bfqq;
+ return &bfqg->async_idle_bfqq[act_idx];
default:
return NULL;
}
@@ -5525,6 +5683,7 @@ bfq_do_early_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq,
struct bfq_io_cq *bic,
struct bfq_queue *last_bfqq_created)
{
+ unsigned int a_idx = last_bfqq_created->actuator_idx;
struct bfq_queue *new_bfqq =
bfq_setup_merge(bfqq, last_bfqq_created);
@@ -5532,8 +5691,8 @@ bfq_do_early_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq,
return bfqq;
if (new_bfqq->bic)
- new_bfqq->bic->stably_merged = true;
- bic->stably_merged = true;
+ new_bfqq->bic->bfqq_data[a_idx].stably_merged = true;
+ bic->bfqq_data[a_idx].stably_merged = true;
/*
* Reusing merge functions. This implies that
@@ -5608,9 +5767,13 @@ static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd,
* it has been set already, but too long ago, then move it
* forward to bfqq. Finally, move also if bfqq belongs to a
* different group than last_bfqq_created, or if bfqq has a
- * different ioprio or ioprio_class. If none of these
- * conditions holds true, then try an early stable merge or
- * schedule a delayed stable merge.
+ * different ioprio, ioprio_class or actuator_idx. If none of
+ * these conditions holds true, then try an early stable merge
+ * or schedule a delayed stable merge. As for the condition on
+ * actuator_idx, the reason is that, if queues associated with
+ * different actuators are merged, then control is lost on
+ * each actuator. Therefore some actuator may be
+ * underutilized, and throughput may decrease.
*
* A delayed merge is scheduled (instead of performing an
* early merge), in case bfqq might soon prove to be more
@@ -5628,7 +5791,8 @@ static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd,
bfqq->creation_time) ||
bfqq->entity.parent != last_bfqq_created->entity.parent ||
bfqq->ioprio != last_bfqq_created->ioprio ||
- bfqq->ioprio_class != last_bfqq_created->ioprio_class)
+ bfqq->ioprio_class != last_bfqq_created->ioprio_class ||
+ bfqq->actuator_idx != last_bfqq_created->actuator_idx)
*source_bfqq = bfqq;
else if (time_after_eq(last_bfqq_created->creation_time +
bfqd->bfq_burst_interval,
@@ -5658,7 +5822,8 @@ static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd,
/*
* Record the bfqq to merge to.
*/
- bic->stable_merge_bfqq = last_bfqq_created;
+ bic->bfqq_data[last_bfqq_created->actuator_idx].stable_merge_bfqq =
+ last_bfqq_created;
}
}
@@ -5680,7 +5845,8 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
bfqg = bfq_bio_bfqg(bfqd, bio);
if (!is_sync) {
async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
- ioprio);
+ ioprio,
+ bfq_actuator_index(bfqd, bio));
bfqq = *async_bfqq;
if (bfqq)
goto out;
@@ -5692,7 +5858,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
if (bfqq) {
bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
- is_sync);
+ is_sync, bfq_actuator_index(bfqd, bio));
bfq_init_entity(&bfqq->entity, bfqg);
bfq_log_bfqq(bfqd, bfqq, "allocated");
} else {
@@ -6007,7 +6173,8 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
* then complete the merge and redirect it to
* new_bfqq.
*/
- if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
+ if (bic_to_bfqq(RQ_BIC(rq), true,
+ bfq_actuator_index(bfqd, rq->bio)) == bfqq)
bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
bfqq, new_bfqq);
@@ -6145,7 +6312,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd)
struct bfq_queue *bfqq = bfqd->in_service_queue;
bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
- bfqd->rq_in_driver);
+ bfqd->tot_rq_in_driver);
if (bfqd->hw_tag == 1)
return;
@@ -6156,7 +6323,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd)
* sum is not exact, as it's not taking into account deactivated
* requests.
*/
- if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD)
+ if (bfqd->tot_rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD)
return;
/*
@@ -6167,7 +6334,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd)
if (bfqq && bfq_bfqq_has_short_ttime(bfqq) &&
bfqq->dispatched + bfqq->queued[0] + bfqq->queued[1] <
BFQ_HW_QUEUE_THRESHOLD &&
- bfqd->rq_in_driver < BFQ_HW_QUEUE_THRESHOLD)
+ bfqd->tot_rq_in_driver < BFQ_HW_QUEUE_THRESHOLD)
return;
if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
@@ -6188,7 +6355,8 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
bfq_update_hw_tag(bfqd);
- bfqd->rq_in_driver--;
+ bfqd->rq_in_driver[bfqq->actuator_idx]--;
+ bfqd->tot_rq_in_driver--;
bfqq->dispatched--;
if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
@@ -6308,7 +6476,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
BFQQE_NO_MORE_REQUESTS);
}
- if (!bfqd->rq_in_driver)
+ if (!bfqd->tot_rq_in_driver)
bfq_schedule_dispatch(bfqd);
}
@@ -6439,13 +6607,13 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd,
* conditions to do it, or we can lower the last base value
* computed.
*
- * NOTE: (bfqd->rq_in_driver == 1) means that there is no I/O
+ * NOTE: (bfqd->tot_rq_in_driver == 1) means that there is no I/O
* request in flight, because this function is in the code
* path that handles the completion of a request of bfqq, and,
* in particular, this function is executed before
- * bfqd->rq_in_driver is decremented in such a code path.
+ * bfqd->tot_rq_in_driver is decremented in such a code path.
*/
- if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 1) ||
+ if ((bfqq->last_serv_time_ns == 0 && bfqd->tot_rq_in_driver == 1) ||
tot_time_ns < bfqq->last_serv_time_ns) {
if (bfqq->last_serv_time_ns == 0) {
/*
@@ -6455,7 +6623,7 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd,
bfqq->inject_limit = max_t(unsigned int, 1, old_limit);
}
bfqq->last_serv_time_ns = tot_time_ns;
- } else if (!bfqd->rqs_injected && bfqd->rq_in_driver == 1)
+ } else if (!bfqd->rqs_injected && bfqd->tot_rq_in_driver == 1)
/*
* No I/O injected and no request still in service in
* the drive: these are the exact conditions for
@@ -6562,7 +6730,7 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
return bfqq;
}
- bic_set_bfqq(bic, NULL, true);
+ bic_set_bfqq(bic, NULL, true, bfqq->actuator_idx);
bfq_put_cooperator(bfqq);
@@ -6576,7 +6744,9 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
bool split, bool is_sync,
bool *new_queue)
{
- struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);
+ unsigned int act_idx = bfq_actuator_index(bfqd, bio);
+ struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync, act_idx);
+ struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[act_idx];
if (likely(bfqq && bfqq != &bfqd->oom_bfqq))
return bfqq;
@@ -6588,14 +6758,14 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
bfq_put_queue(bfqq);
bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, split);
- bic_set_bfqq(bic, bfqq, is_sync);
+ bic_set_bfqq(bic, bfqq, is_sync, act_idx);
if (split && is_sync) {
- if ((bic->was_in_burst_list && bfqd->large_burst) ||
- bic->saved_in_large_burst)
+ if ((bfqq_data->was_in_burst_list && bfqd->large_burst) ||
+ bfqq_data->saved_in_large_burst)
bfq_mark_bfqq_in_large_burst(bfqq);
else {
bfq_clear_bfqq_in_large_burst(bfqq);
- if (bic->was_in_burst_list)
+ if (bfqq_data->was_in_burst_list)
/*
* If bfqq was in the current
* burst list before being
@@ -6684,6 +6854,7 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
struct bfq_queue *bfqq;
bool new_queue = false;
bool bfqq_already_existing = false, split = false;
+ unsigned int a_idx = bfq_actuator_index(bfqd, bio);
if (unlikely(!rq->elv.icq))
return NULL;
@@ -6710,12 +6881,13 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
if (likely(!new_queue)) {
/* If the queue was seeky for too long, break it apart. */
if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq) &&
- !bic->stably_merged) {
+ !bic->bfqq_data[a_idx].stably_merged) {
struct bfq_queue *old_bfqq = bfqq;
/* Update bic before losing reference to bfqq */
if (bfq_bfqq_in_large_burst(bfqq))
- bic->saved_in_large_burst = true;
+ bic->bfqq_data[a_idx].saved_in_large_burst =
+ true;
bfqq = bfq_split_bfqq(bic, bfqq);
split = true;
@@ -6898,13 +7070,15 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
*/
void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
{
- int i, j;
+ int i, j, k;
- for (i = 0; i < 2; i++)
- for (j = 0; j < IOPRIO_NR_LEVELS; j++)
- __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
+ for (k = 0; k < bfqd->num_actuators; k++) {
+ for (i = 0; i < 2; i++)
+ for (j = 0; j < IOPRIO_NR_LEVELS; j++)
+ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j][k]);
- __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
+ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq[k]);
+ }
}
/*
@@ -7016,6 +7190,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
{
struct bfq_data *bfqd;
struct elevator_queue *eq;
+ unsigned int i;
+ struct blk_independent_access_ranges *ia_ranges = q->disk->ia_ranges;
eq = elevator_alloc(q, e);
if (!eq)
@@ -7036,8 +7212,10 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
* Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
* Grab a permanent reference to it, so that the normal code flow
* will not attempt to free it.
+ * Set zero as actuator index: we will pretend that
+ * all I/O requests are for the same actuator.
*/
- bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
+ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0, 0);
bfqd->oom_bfqq.ref++;
bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
@@ -7056,6 +7234,39 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
bfqd->queue = q;
+ bfqd->num_actuators = 1;
+ /*
+ * If the disk supports multiple actuators, copy independent
+ * access ranges from the request queue structure.
+ */
+ spin_lock_irq(&q->queue_lock);
+ if (ia_ranges) {
+ /*
+ * Check if the disk ia_ranges size exceeds the current bfq
+ * actuator limit.
+ */
+ if (ia_ranges->nr_ia_ranges > BFQ_MAX_ACTUATORS) {
+ pr_crit("nr_ia_ranges higher than act limit: iars=%d, max=%d.\n",
+ ia_ranges->nr_ia_ranges, BFQ_MAX_ACTUATORS);
+ pr_crit("Falling back to single actuator mode.\n");
+ } else {
+ bfqd->num_actuators = ia_ranges->nr_ia_ranges;
+
+ for (i = 0; i < bfqd->num_actuators; i++) {
+ bfqd->sector[i] = ia_ranges->ia_range[i].sector;
+ bfqd->nr_sectors[i] =
+ ia_ranges->ia_range[i].nr_sectors;
+ }
+ }
+ }
+
+ /* Otherwise use single-actuator dev info */
+ if (bfqd->num_actuators == 1) {
+ bfqd->sector[0] = 0;
+ bfqd->nr_sectors[0] = get_capacity(q->disk);
+ }
+ spin_unlock_irq(&q->queue_lock);
+
INIT_LIST_HEAD(&bfqd->dispatch);
hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
@@ -7067,7 +7278,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
bfqd->num_groups_with_pending_reqs = 0;
#endif
- INIT_LIST_HEAD(&bfqd->active_list);
+ INIT_LIST_HEAD(&bfqd->active_list[0]);
+ INIT_LIST_HEAD(&bfqd->active_list[1]);
INIT_LIST_HEAD(&bfqd->idle_list);
INIT_HLIST_HEAD(&bfqd->burst_list);
@@ -7112,6 +7324,9 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
ref_wr_duration[blk_queue_nonrot(bfqd->queue)];
bfqd->peak_rate = ref_rate[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
+ /* see comments on the definition of next field inside bfq_data */
+ bfqd->actuator_load_threshold = 4;
+
spin_lock_init(&bfqd->lock);
/*
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 41aa151ccc22..058af701bbbe 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -33,6 +33,14 @@
*/
#define BFQ_SOFTRT_WEIGHT_FACTOR 100
+/*
+ * Maximum number of actuators supported. This constant is used simply
+ * to define the size of the static array that will contain
+ * per-actuator data. The current value is hopefully a good upper
+ * bound to the possible number of actuators of any actual drive.
+ */
+#define BFQ_MAX_ACTUATORS 8
+
struct bfq_entity;
/**
@@ -227,12 +235,14 @@ struct bfq_ttime {
* struct bfq_queue - leaf schedulable entity.
*
* A bfq_queue is a leaf request queue; it can be associated with an
- * io_context or more, if it is async or shared between cooperating
- * processes. @cgroup holds a reference to the cgroup, to be sure that it
- * does not disappear while a bfqq still references it (mostly to avoid
- * races between request issuing and task migration followed by cgroup
- * destruction).
- * All the fields are protected by the queue lock of the containing bfqd.
+ * io_context or more, if it is async or shared between cooperating
+ * processes. Besides, it contains I/O requests for only one actuator
+ * (an io_context is associated with a different bfq_queue for each
+ * actuator it generates I/O for). @cgroup holds a reference to the
+ * cgroup, to be sure that it does not disappear while a bfqq still
+ * references it (mostly to avoid races between request issuing and
+ * task migration followed by cgroup destruction). All the fields are
+ * protected by the queue lock of the containing bfqd.
*/
struct bfq_queue {
/* reference counter */
@@ -397,24 +407,18 @@ struct bfq_queue {
* the woken queues when this queue exits.
*/
struct hlist_head woken_list;
+
+ /* index of the actuator this queue is associated with */
+ unsigned int actuator_idx;
};
/**
- * struct bfq_io_cq - per (request_queue, io_context) structure.
- */
-struct bfq_io_cq {
- /* associated io_cq structure */
- struct io_cq icq; /* must be the first member */
- /* array of two process queues, the sync and the async */
- struct bfq_queue *bfqq[2];
- /* per (request_queue, blkcg) ioprio */
- int ioprio;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- uint64_t blkcg_serial_nr; /* the current blkcg serial */
-#endif
+* struct bfq_data - bfqq data unique and persistent for associated bfq_io_cq
+*/
+struct bfq_iocq_bfqq_data {
/*
* Snapshot of the has_short_time flag before merging; taken
- * to remember its value while the queue is merged, so as to
+ * to remember its values while the queue is merged, so as to
* be able to restore it in case of split.
*/
bool saved_has_short_ttime;
@@ -428,7 +432,7 @@ struct bfq_io_cq {
u64 saved_tot_idle_time;
/*
- * Same purpose as the previous fields for the value of the
+ * Same purpose as the previous fields for the values of the
* field keeping the queue's belonging to a large burst
*/
bool saved_in_large_burst;
@@ -466,6 +470,38 @@ struct bfq_io_cq {
struct bfq_queue *stable_merge_bfqq;
bool stably_merged; /* non splittable if true */
+};
+
+/**
+ * struct bfq_io_cq - per (request_queue, io_context) structure.
+ */
+struct bfq_io_cq {
+ /* associated io_cq structure */
+ struct io_cq icq; /* must be the first member */
+ /*
+ * Matrix of associated process queues: first row for async
+ * queues, second row sync queues. Each row contains one
+ * column for each actuator. An I/O request generated by the
+ * process is inserted into the queue pointed by bfqq[i][j] if
+ * the request is to be served by the j-th actuator of the
+ * drive, where i==0 or i==1, depending on whether the request
+ * is async or sync. So there is a distinct queue for each
+ * actuator.
+ */
+ struct bfq_queue *bfqq[2][BFQ_MAX_ACTUATORS];
+ /* per (request_queue, blkcg) ioprio */
+ int ioprio;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ uint64_t blkcg_serial_nr; /* the current blkcg serial */
+#endif
+
+ /*
+ * Persistent data for associated synchronous process queues
+ * (one queue per actuator, see field bfqq above). In
+ * particular, each of these queues may undergo a merge.
+ */
+ struct bfq_iocq_bfqq_data bfqq_data[BFQ_MAX_ACTUATORS];
+
unsigned int requests; /* Number of requests this process has in flight */
};
@@ -554,7 +590,12 @@ struct bfq_data {
/* number of queued requests */
int queued;
/* number of requests dispatched and waiting for completion */
- int rq_in_driver;
+ int tot_rq_in_driver;
+ /*
+ * number of requests dispatched and waiting for completion
+ * for each actuator
+ */
+ int rq_in_driver[BFQ_MAX_ACTUATORS];
/* true if the device is non rotational and performs queueing */
bool nonrot_with_queueing;
@@ -648,8 +689,13 @@ struct bfq_data {
/* maximum budget allotted to a bfq_queue before rescheduling */
int bfq_max_budget;
- /* list of all the bfq_queues active on the device */
- struct list_head active_list;
+ /*
+ * List of all the bfq_queues active for a specific actuator
+ * on the device. Keeping active queues separate on a
+ * per-actuator basis helps implementing per-actuator
+ * injection more efficiently.
+ */
+ struct list_head active_list[BFQ_MAX_ACTUATORS];
/* list of all the bfq_queues idle on the device */
struct list_head idle_list;
@@ -772,6 +818,42 @@ struct bfq_data {
*/
unsigned int word_depths[2][2];
unsigned int full_depth_shift;
+
+ /*
+ * Number of independent actuators. This is equal to 1 in
+ * case of single-actuator drives.
+ */
+ unsigned int num_actuators;
+ /*
+ * Disk independent access ranges for each actuator
+ * in this device.
+ */
+ sector_t sector[BFQ_MAX_ACTUATORS];
+ sector_t nr_sectors[BFQ_MAX_ACTUATORS];
+ struct blk_independent_access_range ia_ranges[BFQ_MAX_ACTUATORS];
+
+ /*
+ * If the number of I/O requests queued in the device for a
+ * given actuator is below next threshold, then the actuator
+ * is deemed as underutilized. If this condition is found to
+ * hold for some actuator upon a dispatch, but (i) the
+ * in-service queue does not contain I/O for that actuator,
+ * while (ii) some other queue does contain I/O for that
+ * actuator, then the head I/O request of the latter queue is
+ * returned (injected), instead of the head request of the
+ * currently in-service queue.
+ *
+ * We set the threshold, empirically, to the minimum possible
+ * value for which an actuator is fully utilized, or close to
+ * be fully utilized. By doing so, injected I/O 'steals' as
+ * few drive-queue slots as possibile to the in-service
+ * queue. This reduces as much as possible the probability
+ * that the service of I/O from the in-service bfq_queue gets
+ * delayed because of slot exhaustion, i.e., because all the
+ * slots of the drive queue are filled with I/O injected from
+ * other queues (NCQ provides for 32 slots).
+ */
+ unsigned int actuator_load_threshold;
};
enum bfqq_state_flags {
@@ -937,8 +1019,8 @@ struct bfq_group {
struct bfq_data *bfqd;
- struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS];
- struct bfq_queue *async_idle_bfqq;
+ struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS][BFQ_MAX_ACTUATORS];
+ struct bfq_queue *async_idle_bfqq[BFQ_MAX_ACTUATORS];
struct bfq_entity *my_entity;
@@ -955,8 +1037,8 @@ struct bfq_group {
struct bfq_entity entity;
struct bfq_sched_data sched_data;
- struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS];
- struct bfq_queue *async_idle_bfqq;
+ struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS][BFQ_MAX_ACTUATORS];
+ struct bfq_queue *async_idle_bfqq[BFQ_MAX_ACTUATORS];
struct rb_root rq_pos_tree;
};
@@ -969,8 +1051,10 @@ struct bfq_group {
extern const int bfq_timeout;
-struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync);
-void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync);
+struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync,
+ unsigned int actuator_idx);
+void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync,
+ unsigned int actuator_idx);
struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic);
void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);
void bfq_weights_tree_add(struct bfq_queue *bfqq);
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index ea4c3d757fdd..7941b6f07391 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -493,7 +493,7 @@ static void bfq_active_insert(struct bfq_service_tree *st,
bfq_update_active_tree(node);
if (bfqq)
- list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list[bfqq->actuator_idx]);
bfq_inc_active_entities(entity);
}
diff --git a/block/bio.c b/block/bio.c
index ab59a491a883..d7fbc7adfc50 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1792,6 +1792,8 @@ static int __init init_bio(void)
{
int i;
+ BUILD_BUG_ON(BIO_FLAG_LAST > 8 * sizeof_field(struct bio, bi_flags));
+
bio_integrity_init();
for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) {
diff --git a/block/blk-core.c b/block/blk-core.c
index 9321767470dc..b5098355d8b2 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -283,12 +283,9 @@ static void blk_free_queue(struct request_queue *q)
*
* Decrements the refcount of the request_queue and free it when the refcount
* reaches 0.
- *
- * Context: Can sleep.
*/
void blk_put_queue(struct request_queue *q)
{
- might_sleep();
if (refcount_dec_and_test(&q->refs))
blk_free_queue(q);
}
diff --git a/block/blk-map.c b/block/blk-map.c
index 19940c978c73..f2135e6ee8f6 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -31,7 +31,8 @@ static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data,
return NULL;
memcpy(bmd->iov, data->iov, sizeof(struct iovec) * data->nr_segs);
bmd->iter = *data;
- bmd->iter.iov = bmd->iov;
+ if (iter_is_iovec(data))
+ bmd->iter.iov = bmd->iov;
return bmd;
}
@@ -641,7 +642,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
copy = true;
else if (iov_iter_is_bvec(iter))
map_bvec = true;
- else if (!iter_is_iovec(iter))
+ else if (!user_backed_iter(iter))
copy = true;
else if (queue_virt_boundary(q))
copy = queue_virt_boundary(q) & iov_iter_gap_alignment(iter);
@@ -682,9 +683,8 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq,
struct rq_map_data *map_data, void __user *ubuf,
unsigned long len, gfp_t gfp_mask)
{
- struct iovec iov;
struct iov_iter i;
- int ret = import_single_range(rq_data_dir(rq), ubuf, len, &iov, &i);
+ int ret = import_ubuf(rq_data_dir(rq), ubuf, len, &i);
if (unlikely(ret < 0))
return ret;
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 0477c4d527fe..9c9713c9269c 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -40,7 +40,7 @@ void blk_set_default_limits(struct queue_limits *lim)
lim->virt_boundary_mask = 0;
lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
- lim->max_dev_sectors = 0;
+ lim->max_user_sectors = lim->max_dev_sectors = 0;
lim->chunk_sectors = 0;
lim->max_write_zeroes_sectors = 0;
lim->max_zone_append_sectors = 0;
@@ -135,7 +135,12 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto
limits->max_hw_sectors = max_hw_sectors;
max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors);
- max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS);
+
+ if (limits->max_user_sectors)
+ max_sectors = min(max_sectors, limits->max_user_sectors);
+ else
+ max_sectors = min(max_sectors, BLK_DEF_MAX_SECTORS);
+
max_sectors = round_down(max_sectors,
limits->logical_block_size >> SECTOR_SHIFT);
limits->max_sectors = max_sectors;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 93d9e9c9a6ea..5486b6c57f6b 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -239,19 +239,28 @@ static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page)
static ssize_t
queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
{
- unsigned long max_sectors_kb,
+ unsigned long var;
+ unsigned int max_sectors_kb,
max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1,
page_kb = 1 << (PAGE_SHIFT - 10);
- ssize_t ret = queue_var_store(&max_sectors_kb, page, count);
+ ssize_t ret = queue_var_store(&var, page, count);
if (ret < 0)
return ret;
- max_hw_sectors_kb = min_not_zero(max_hw_sectors_kb, (unsigned long)
+ max_sectors_kb = (unsigned int)var;
+ max_hw_sectors_kb = min_not_zero(max_hw_sectors_kb,
q->limits.max_dev_sectors >> 1);
-
- if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb)
- return -EINVAL;
+ if (max_sectors_kb == 0) {
+ q->limits.max_user_sectors = 0;
+ max_sectors_kb = min(max_hw_sectors_kb,
+ BLK_DEF_MAX_SECTORS >> 1);
+ } else {
+ if (max_sectors_kb > max_hw_sectors_kb ||
+ max_sectors_kb < page_kb)
+ return -EINVAL;
+ q->limits.max_user_sectors = max_sectors_kb << 1;
+ }
spin_lock_irq(&q->queue_lock);
q->limits.max_sectors = max_sectors_kb << 1;
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index 7d28e3aa406c..4c601ca9552a 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -2123,8 +2123,7 @@ static int null_add_dev(struct nullb_device *dev)
blk_queue_physical_block_size(nullb->q, dev->blocksize);
if (!dev->max_sectors)
dev->max_sectors = queue_max_hw_sectors(nullb->q);
- dev->max_sectors = min_t(unsigned int, dev->max_sectors,
- BLK_DEF_MAX_SECTORS);
+ dev->max_sectors = min(dev->max_sectors, BLK_DEF_MAX_SECTORS);
blk_queue_max_hw_sectors(nullb->q, dev->max_sectors);
if (dev->virt_boundary)
diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c
index e36aeb50b4ed..bf1c60edb7f9 100644
--- a/drivers/nvme/host/apple.c
+++ b/drivers/nvme/host/apple.c
@@ -1493,7 +1493,7 @@ static int apple_nvme_probe(struct platform_device *pdev)
}
ret = nvme_init_ctrl(&anv->ctrl, anv->dev, &nvme_ctrl_ops,
- NVME_QUIRK_SKIP_CID_GEN);
+ NVME_QUIRK_SKIP_CID_GEN | NVME_QUIRK_IDENTIFY_CNS);
if (ret) {
dev_err_probe(dev, ret, "Failed to initialize nvme_ctrl");
goto put_dev;
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index a8639919237e..06f52db34be9 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -8,8 +8,13 @@
#include <linux/io_uring.h>
#include "nvme.h"
+enum {
+ NVME_IOCTL_VEC = (1 << 0),
+ NVME_IOCTL_PARTITION = (1 << 1),
+};
+
static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c,
- fmode_t mode)
+ unsigned int flags, fmode_t mode)
{
u32 effects;
@@ -17,6 +22,13 @@ static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c,
return true;
/*
+ * Do not allow unprivileged passthrough on partitions, as that allows an
+ * escape from the containment of the partition.
+ */
+ if (flags & NVME_IOCTL_PARTITION)
+ return false;
+
+ /*
* Do not allow unprivileged processes to send vendor specific or fabrics
* commands as we can't be sure about their effects.
*/
@@ -150,7 +162,7 @@ static struct request *nvme_alloc_user_request(struct request_queue *q,
static int nvme_map_user_request(struct request *req, u64 ubuffer,
unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
u32 meta_seed, void **metap, struct io_uring_cmd *ioucmd,
- bool vec)
+ unsigned int flags)
{
struct request_queue *q = req->q;
struct nvme_ns *ns = q->queuedata;
@@ -163,7 +175,7 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
struct iov_iter iter;
/* fixedbufs is only for non-vectored io */
- if (WARN_ON_ONCE(vec))
+ if (WARN_ON_ONCE(flags & NVME_IOCTL_VEC))
return -EINVAL;
ret = io_uring_cmd_import_fixed(ubuffer, bufflen,
rq_data_dir(req), &iter, ioucmd);
@@ -172,8 +184,8 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
ret = blk_rq_map_user_iov(q, req, NULL, &iter, GFP_KERNEL);
} else {
ret = blk_rq_map_user_io(req, NULL, nvme_to_user_ptr(ubuffer),
- bufflen, GFP_KERNEL, vec, 0, 0,
- rq_data_dir(req));
+ bufflen, GFP_KERNEL, flags & NVME_IOCTL_VEC, 0,
+ 0, rq_data_dir(req));
}
if (ret)
@@ -203,9 +215,9 @@ out:
}
static int nvme_submit_user_cmd(struct request_queue *q,
- struct nvme_command *cmd, u64 ubuffer,
- unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
- u32 meta_seed, u64 *result, unsigned timeout, bool vec)
+ struct nvme_command *cmd, u64 ubuffer, unsigned bufflen,
+ void __user *meta_buffer, unsigned meta_len, u32 meta_seed,
+ u64 *result, unsigned timeout, unsigned int flags)
{
struct nvme_ctrl *ctrl;
struct request *req;
@@ -221,7 +233,7 @@ static int nvme_submit_user_cmd(struct request_queue *q,
req->timeout = timeout;
if (ubuffer && bufflen) {
ret = nvme_map_user_request(req, ubuffer, bufflen, meta_buffer,
- meta_len, meta_seed, &meta, NULL, vec);
+ meta_len, meta_seed, &meta, NULL, flags);
if (ret)
return ret;
}
@@ -304,10 +316,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
c.rw.apptag = cpu_to_le16(io.apptag);
c.rw.appmask = cpu_to_le16(io.appmask);
- return nvme_submit_user_cmd(ns->queue, &c,
- io.addr, length,
- metadata, meta_len, lower_32_bits(io.slba), NULL, 0,
- false);
+ return nvme_submit_user_cmd(ns->queue, &c, io.addr, length, metadata,
+ meta_len, lower_32_bits(io.slba), NULL, 0, 0);
}
static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl,
@@ -325,7 +335,8 @@ static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl,
}
static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
- struct nvme_passthru_cmd __user *ucmd, fmode_t mode)
+ struct nvme_passthru_cmd __user *ucmd, unsigned int flags,
+ fmode_t mode)
{
struct nvme_passthru_cmd cmd;
struct nvme_command c;
@@ -353,16 +364,15 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
c.common.cdw14 = cpu_to_le32(cmd.cdw14);
c.common.cdw15 = cpu_to_le32(cmd.cdw15);
- if (!nvme_cmd_allowed(ns, &c, mode))
+ if (!nvme_cmd_allowed(ns, &c, 0, mode))
return -EACCES;
if (cmd.timeout_ms)
timeout = msecs_to_jiffies(cmd.timeout_ms);
status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
- cmd.addr, cmd.data_len,
- nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
- 0, &result, timeout, false);
+ cmd.addr, cmd.data_len, nvme_to_user_ptr(cmd.metadata),
+ cmd.metadata_len, 0, &result, timeout, 0);
if (status >= 0) {
if (put_user(result, &ucmd->result))
@@ -373,8 +383,8 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
}
static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
- struct nvme_passthru_cmd64 __user *ucmd, bool vec,
- fmode_t mode)
+ struct nvme_passthru_cmd64 __user *ucmd, unsigned int flags,
+ fmode_t mode)
{
struct nvme_passthru_cmd64 cmd;
struct nvme_command c;
@@ -401,16 +411,15 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
c.common.cdw14 = cpu_to_le32(cmd.cdw14);
c.common.cdw15 = cpu_to_le32(cmd.cdw15);
- if (!nvme_cmd_allowed(ns, &c, mode))
+ if (!nvme_cmd_allowed(ns, &c, flags, mode))
return -EACCES;
if (cmd.timeout_ms)
timeout = msecs_to_jiffies(cmd.timeout_ms);
status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
- cmd.addr, cmd.data_len,
- nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
- 0, &cmd.result, timeout, vec);
+ cmd.addr, cmd.data_len, nvme_to_user_ptr(cmd.metadata),
+ cmd.metadata_len, 0, &cmd.result, timeout, flags);
if (status >= 0) {
if (put_user(cmd.result, &ucmd->result))
@@ -571,7 +580,7 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
c.common.cdw14 = cpu_to_le32(READ_ONCE(cmd->cdw14));
c.common.cdw15 = cpu_to_le32(READ_ONCE(cmd->cdw15));
- if (!nvme_cmd_allowed(ns, &c, ioucmd->file->f_mode))
+ if (!nvme_cmd_allowed(ns, &c, 0, ioucmd->file->f_mode))
return -EACCES;
d.metadata = READ_ONCE(cmd->metadata);
@@ -641,9 +650,9 @@ static int nvme_ctrl_ioctl(struct nvme_ctrl *ctrl, unsigned int cmd,
{
switch (cmd) {
case NVME_IOCTL_ADMIN_CMD:
- return nvme_user_cmd(ctrl, NULL, argp, mode);
+ return nvme_user_cmd(ctrl, NULL, argp, 0, mode);
case NVME_IOCTL_ADMIN64_CMD:
- return nvme_user_cmd64(ctrl, NULL, argp, false, mode);
+ return nvme_user_cmd64(ctrl, NULL, argp, 0, mode);
default:
return sed_ioctl(ctrl->opal_dev, cmd, argp);
}
@@ -668,14 +677,14 @@ struct nvme_user_io32 {
#endif /* COMPAT_FOR_U64_ALIGNMENT */
static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd,
- void __user *argp, fmode_t mode)
+ void __user *argp, unsigned int flags, fmode_t mode)
{
switch (cmd) {
case NVME_IOCTL_ID:
force_successful_syscall_return();
return ns->head->ns_id;
case NVME_IOCTL_IO_CMD:
- return nvme_user_cmd(ns->ctrl, ns, argp, mode);
+ return nvme_user_cmd(ns->ctrl, ns, argp, flags, mode);
/*
* struct nvme_user_io can have different padding on some 32-bit ABIs.
* Just accept the compat version as all fields that are used are the
@@ -686,37 +695,40 @@ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd,
#endif
case NVME_IOCTL_SUBMIT_IO:
return nvme_submit_io(ns, argp);
- case NVME_IOCTL_IO64_CMD:
- return nvme_user_cmd64(ns->ctrl, ns, argp, false, mode);
case NVME_IOCTL_IO64_CMD_VEC:
- return nvme_user_cmd64(ns->ctrl, ns, argp, true, mode);
+ flags |= NVME_IOCTL_VEC;
+ fallthrough;
+ case NVME_IOCTL_IO64_CMD:
+ return nvme_user_cmd64(ns->ctrl, ns, argp, flags, mode);
default:
return -ENOTTY;
}
}
-static int __nvme_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *arg,
- fmode_t mode)
-{
- if (is_ctrl_ioctl(cmd))
- return nvme_ctrl_ioctl(ns->ctrl, cmd, arg, mode);
- return nvme_ns_ioctl(ns, cmd, arg, mode);
-}
-
int nvme_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
struct nvme_ns *ns = bdev->bd_disk->private_data;
+ void __user *argp = (void __user *)arg;
+ unsigned int flags = 0;
- return __nvme_ioctl(ns, cmd, (void __user *)arg, mode);
+ if (bdev_is_partition(bdev))
+ flags |= NVME_IOCTL_PARTITION;
+
+ if (is_ctrl_ioctl(cmd))
+ return nvme_ctrl_ioctl(ns->ctrl, cmd, argp, mode);
+ return nvme_ns_ioctl(ns, cmd, argp, flags, mode);
}
long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct nvme_ns *ns =
container_of(file_inode(file)->i_cdev, struct nvme_ns, cdev);
+ void __user *argp = (void __user *)arg;
- return __nvme_ioctl(ns, cmd, (void __user *)arg, file->f_mode);
+ if (is_ctrl_ioctl(cmd))
+ return nvme_ctrl_ioctl(ns->ctrl, cmd, argp, file->f_mode);
+ return nvme_ns_ioctl(ns, cmd, argp, 0, file->f_mode);
}
static int nvme_uring_cmd_checks(unsigned int issue_flags)
@@ -806,6 +818,10 @@ int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode,
void __user *argp = (void __user *)arg;
struct nvme_ns *ns;
int srcu_idx, ret = -EWOULDBLOCK;
+ unsigned int flags = 0;
+
+ if (bdev_is_partition(bdev))
+ flags |= NVME_IOCTL_PARTITION;
srcu_idx = srcu_read_lock(&head->srcu);
ns = nvme_find_path(head);
@@ -821,7 +837,7 @@ int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode,
return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx,
mode);
- ret = nvme_ns_ioctl(ns, cmd, argp, mode);
+ ret = nvme_ns_ioctl(ns, cmd, argp, flags, mode);
out_unlock:
srcu_read_unlock(&head->srcu, srcu_idx);
return ret;
@@ -846,7 +862,7 @@ long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd,
return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx,
file->f_mode);
- ret = nvme_ns_ioctl(ns, cmd, argp, file->f_mode);
+ ret = nvme_ns_ioctl(ns, cmd, argp, 0, file->f_mode);
out_unlock:
srcu_read_unlock(&head->srcu, srcu_idx);
return ret;
@@ -945,7 +961,7 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp,
kref_get(&ns->kref);
up_read(&ctrl->namespaces_rwsem);
- ret = nvme_user_cmd(ctrl, ns, argp, mode);
+ ret = nvme_user_cmd(ctrl, ns, argp, 0, mode);
nvme_put_ns(ns);
return ret;
@@ -962,9 +978,9 @@ long nvme_dev_ioctl(struct file *file, unsigned int cmd,
switch (cmd) {
case NVME_IOCTL_ADMIN_CMD:
- return nvme_user_cmd(ctrl, NULL, argp, file->f_mode);
+ return nvme_user_cmd(ctrl, NULL, argp, 0, file->f_mode);
case NVME_IOCTL_ADMIN64_CMD:
- return nvme_user_cmd64(ctrl, NULL, argp, false, file->f_mode);
+ return nvme_user_cmd64(ctrl, NULL, argp, 0, file->f_mode);
case NVME_IOCTL_IO_CMD:
return nvme_dev_user_cmd(ctrl, argp, file->f_mode);
case NVME_IOCTL_RESET:
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index b13baccedb4a..a2553b7d9bb8 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2533,7 +2533,7 @@ static int nvme_pci_enable(struct nvme_dev *dev)
*/
result = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES);
if (result < 0)
- return result;
+ goto disable;
dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
@@ -2586,8 +2586,13 @@ static int nvme_pci_enable(struct nvme_dev *dev)
pci_enable_pcie_error_reporting(pdev);
pci_save_state(pdev);
- return nvme_pci_configure_admin_queue(dev);
+ result = nvme_pci_configure_admin_queue(dev);
+ if (result)
+ goto free_irq;
+ return result;
+ free_irq:
+ pci_free_irq_vectors(pdev);
disable:
pci_disable_device(pdev);
return result;
@@ -3495,7 +3500,8 @@ static const struct pci_device_id nvme_id_table[] = {
.driver_data = NVME_QUIRK_SINGLE_VECTOR |
NVME_QUIRK_128_BYTES_SQES |
NVME_QUIRK_SHARED_TAGS |
- NVME_QUIRK_SKIP_CID_GEN },
+ NVME_QUIRK_SKIP_CID_GEN |
+ NVME_QUIRK_IDENTIFY_CNS },
{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
{ 0, }
};
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 43d4e073b111..b87ed829ab94 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -288,6 +288,7 @@ struct queue_limits {
unsigned int max_dev_sectors;
unsigned int chunk_sectors;
unsigned int max_sectors;
+ unsigned int max_user_sectors;
unsigned int max_segment_size;
unsigned int physical_block_size;
unsigned int logical_block_size;
@@ -1095,11 +1096,12 @@ static inline bool bdev_is_partition(struct block_device *bdev)
enum blk_default_limits {
BLK_MAX_SEGMENTS = 128,
BLK_SAFE_MAX_SECTORS = 255,
- BLK_DEF_MAX_SECTORS = 2560,
BLK_MAX_SEGMENT_SIZE = 65536,
BLK_SEG_BOUNDARY_MASK = 0xFFFFFFFFUL,
};
+#define BLK_DEF_MAX_SECTORS 2560u
+
static inline unsigned long queue_segment_boundary(const struct request_queue *q)
{
return q->limits.seg_boundary_mask;
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 128a67a40065..cc0cf0705b8f 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -195,11 +195,7 @@ struct io_alloc_cache {
struct io_ring_ctx {
/* const or read-mostly hot data */
struct {
- struct percpu_ref refs;
-
- struct io_rings *rings;
unsigned int flags;
- enum task_work_notify_mode notify_method;
unsigned int compat: 1;
unsigned int drain_next: 1;
unsigned int restricted: 1;
@@ -210,6 +206,12 @@ struct io_ring_ctx {
unsigned int syscall_iopoll: 1;
/* all CQEs should be posted only by the submitter task */
unsigned int task_complete: 1;
+ unsigned int poll_activated: 1;
+
+ enum task_work_notify_mode notify_method;
+ struct io_rings *rings;
+ struct task_struct *submitter_task;
+ struct percpu_ref refs;
} ____cacheline_aligned_in_smp;
/* submission data */
@@ -293,6 +295,7 @@ struct io_ring_ctx {
spinlock_t completion_lock;
bool poll_multi_queue;
+ bool cq_waiting;
/*
* ->iopoll_list is protected by the ctx->uring_lock for
@@ -318,9 +321,8 @@ struct io_ring_ctx {
} ____cacheline_aligned_in_smp;
/* Keep this last, we don't need it for the fast path */
-
+ struct wait_queue_head poll_wq;
struct io_restriction restrictions;
- struct task_struct *submitter_task;
/* slow path rsrc auxilary data, used by update/register */
struct io_rsrc_node *rsrc_backup_node;
@@ -357,6 +359,7 @@ struct io_ring_ctx {
u32 iowq_limits[2];
bool iowq_limits_set;
+ struct callback_head poll_wq_task_work;
struct list_head defer_list;
unsigned sq_thread_idle;
/* protected by ->completion_lock */
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 9f158238edba..73b1d5d1e4f1 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -346,6 +346,7 @@ ssize_t __import_iovec(int type, const struct iovec __user *uvec,
struct iov_iter *i, bool compat);
int import_single_range(int type, void __user *buf, size_t len,
struct iovec *iov, struct iov_iter *i);
+int import_ubuf(int type, void __user *buf, size_t len, struct iov_iter *i);
static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
void __user *buf, size_t count)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 2780bce62faf..636a4c2c1294 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -347,6 +347,8 @@ enum {
* applicable for IORING_MSG_DATA, obviously.
*/
#define IORING_MSG_RING_CQE_SKIP (1U << 0)
+/* Pass through the flags from sqe->file_index to cqe->flags */
+#define IORING_MSG_RING_FLAGS_PASS (1U << 1)
/*
* IO completion data structure (Completion Queue Entry)
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index 2e04850a657b..882bd56b01ed 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -170,12 +170,11 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
xa_for_each(&ctx->personalities, index, cred)
io_uring_show_cred(m, index, cred);
}
- if (has_lock)
- mutex_unlock(&ctx->uring_lock);
seq_puts(m, "PollList:\n");
for (i = 0; i < (1U << ctx->cancel_table.hash_bits); i++) {
struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i];
+ struct io_hash_bucket *hbl = &ctx->cancel_table_locked.hbs[i];
struct io_kiocb *req;
spin_lock(&hb->lock);
@@ -183,8 +182,17 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
task_work_pending(req->task));
spin_unlock(&hb->lock);
+
+ if (!has_lock)
+ continue;
+ hlist_for_each_entry(req, &hbl->list, hash_node)
+ seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
+ task_work_pending(req->task));
}
+ if (has_lock)
+ mutex_unlock(&ctx->uring_lock);
+
seq_puts(m, "CqOverflowList:\n");
spin_lock(&ctx->completion_lock);
list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 992dcd9f8c4c..411bb2d1acd4 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -1230,7 +1230,12 @@ static void io_wq_cancel_tw_create(struct io_wq *wq)
worker = container_of(cb, struct io_worker, create_work);
io_worker_cancel_cb(worker);
- kfree(worker);
+ /*
+ * Only the worker continuation helper has worker allocated and
+ * hence needs freeing.
+ */
+ if (cb->func == create_worker_cont)
+ kfree(worker);
}
}
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 2ac1cd8d23ea..5aab3fa3b7c5 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -316,6 +316,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
mutex_init(&ctx->uring_lock);
init_waitqueue_head(&ctx->cq_wait);
+ init_waitqueue_head(&ctx->poll_wq);
spin_lock_init(&ctx->completion_lock);
spin_lock_init(&ctx->timeout_lock);
INIT_WQ_LIST(&ctx->iopoll_list);
@@ -572,6 +573,8 @@ static void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
+ if (ctx->poll_activated)
+ io_poll_wq_wake(ctx);
if (ctx->off_timeout_used)
io_flush_timeouts(ctx);
if (ctx->drain_active) {
@@ -618,6 +621,25 @@ static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx)
io_cqring_wake(ctx);
}
+static inline void __io_cq_unlock_post_flush(struct io_ring_ctx *ctx)
+ __releases(ctx->completion_lock)
+{
+ io_commit_cqring(ctx);
+ __io_cq_unlock(ctx);
+ io_commit_cqring_flush(ctx);
+
+ /*
+ * As ->task_complete implies that the ring is single tasked, cq_wait
+ * may only be waited on by the current in io_cqring_wait(), but since
+ * it will re-check the wakeup conditions once we return we can safely
+ * skip waking it up.
+ */
+ if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) {
+ smp_mb();
+ __io_cqring_wake(ctx);
+ }
+}
+
void io_cq_unlock_post(struct io_ring_ctx *ctx)
__releases(ctx->completion_lock)
{
@@ -731,7 +753,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
size_t ocq_size = sizeof(struct io_overflow_cqe);
bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
- lockdep_assert_held(&ctx->completion_lock);
+ io_lockdep_assert_cq_locked(ctx);
if (is_cqe32)
ocq_size += sizeof(struct io_uring_cqe);
@@ -1241,7 +1263,7 @@ static void io_req_local_work_add(struct io_kiocb *req)
percpu_ref_put(&ctx->refs);
return;
}
- /* need it for the following io_cqring_wake() */
+ /* needed for the following wake up */
smp_mb__after_atomic();
if (unlikely(atomic_read(&req->task->io_uring->in_idle))) {
@@ -1252,10 +1274,11 @@ static void io_req_local_work_add(struct io_kiocb *req)
if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
-
if (ctx->has_evfd)
io_eventfd_signal(ctx);
- __io_cqring_wake(ctx);
+
+ if (READ_ONCE(ctx->cq_waiting))
+ wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE);
percpu_ref_put(&ctx->refs);
}
@@ -1296,21 +1319,19 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
}
}
-int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked)
+static int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked)
{
struct llist_node *node;
- struct llist_node fake;
- struct llist_node *current_final = NULL;
- int ret;
- unsigned int loops = 1;
+ unsigned int loops = 0;
+ int ret = 0;
- if (unlikely(ctx->submitter_task != current))
+ if (WARN_ON_ONCE(ctx->submitter_task != current))
return -EEXIST;
-
- node = io_llist_xchg(&ctx->work_llist, &fake);
- ret = 0;
+ if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
+ atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
again:
- while (node != current_final) {
+ node = io_llist_xchg(&ctx->work_llist, NULL);
+ while (node) {
struct llist_node *next = node->next;
struct io_kiocb *req = container_of(node, struct io_kiocb,
io_task_work.node);
@@ -1319,26 +1340,17 @@ again:
ret++;
node = next;
}
+ loops++;
- if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
- atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
-
- node = io_llist_cmpxchg(&ctx->work_llist, &fake, NULL);
- if (node != &fake) {
- loops++;
- current_final = &fake;
- node = io_llist_xchg(&ctx->work_llist, &fake);
+ if (!llist_empty(&ctx->work_llist))
goto again;
- }
-
if (*locked)
io_submit_flush_completions(ctx);
trace_io_uring_local_work_run(ctx, ret, loops);
return ret;
-
}
-int io_run_local_work(struct io_ring_ctx *ctx)
+static inline int io_run_local_work_locked(struct io_ring_ctx *ctx)
{
bool locked;
int ret;
@@ -1346,8 +1358,19 @@ int io_run_local_work(struct io_ring_ctx *ctx)
if (llist_empty(&ctx->work_llist))
return 0;
- __set_current_state(TASK_RUNNING);
- locked = mutex_trylock(&ctx->uring_lock);
+ locked = true;
+ ret = __io_run_local_work(ctx, &locked);
+ /* shouldn't happen! */
+ if (WARN_ON_ONCE(!locked))
+ mutex_lock(&ctx->uring_lock);
+ return ret;
+}
+
+static int io_run_local_work(struct io_ring_ctx *ctx)
+{
+ bool locked = mutex_trylock(&ctx->uring_lock);
+ int ret;
+
ret = __io_run_local_work(ctx, &locked);
if (locked)
mutex_unlock(&ctx->uring_lock);
@@ -1467,7 +1490,7 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
}
}
}
- __io_cq_unlock_post(ctx);
+ __io_cq_unlock_post_flush(ctx);
if (!wq_list_empty(&ctx->submit_state.compl_reqs)) {
io_free_batch_list(ctx, state->compl_reqs.first);
@@ -2420,13 +2443,13 @@ struct io_wait_queue {
struct io_ring_ctx *ctx;
unsigned cq_tail;
unsigned nr_timeouts;
+ ktime_t timeout;
};
static inline bool io_has_work(struct io_ring_ctx *ctx)
{
return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) ||
- ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
- !llist_empty(&ctx->work_llist));
+ !llist_empty(&ctx->work_llist);
}
static inline bool io_should_wake(struct io_wait_queue *iowq)
@@ -2445,22 +2468,25 @@ static inline bool io_should_wake(struct io_wait_queue *iowq)
static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
int wake_flags, void *key)
{
- struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
- wq);
- struct io_ring_ctx *ctx = iowq->ctx;
+ struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, wq);
/*
* Cannot safely flush overflowed CQEs from here, ensure we wake up
* the task, and the next invocation will do it.
*/
- if (io_should_wake(iowq) || io_has_work(ctx))
+ if (io_should_wake(iowq) || io_has_work(iowq->ctx))
return autoremove_wake_function(curr, mode, wake_flags, key);
return -1;
}
int io_run_task_work_sig(struct io_ring_ctx *ctx)
{
- if (io_run_task_work_ctx(ctx) > 0)
+ if (!llist_empty(&ctx->work_llist)) {
+ __set_current_state(TASK_RUNNING);
+ if (io_run_local_work(ctx) > 0)
+ return 1;
+ }
+ if (io_run_task_work() > 0)
return 1;
if (task_sigpending(current))
return -EINTR;
@@ -2469,35 +2495,23 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx)
/* when returns >0, the caller should retry */
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
- struct io_wait_queue *iowq,
- ktime_t *timeout)
+ struct io_wait_queue *iowq)
{
- int ret;
- unsigned long check_cq;
-
- /* make sure we run task_work before checking for signals */
- ret = io_run_task_work_sig(ctx);
- if (ret || io_should_wake(iowq))
- return ret;
-
- check_cq = READ_ONCE(ctx->check_cq);
- if (unlikely(check_cq)) {
- /* let the caller flush overflows, retry */
- if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
- return 1;
- if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))
- return -EBADR;
- }
- if (!schedule_hrtimeout(timeout, HRTIMER_MODE_ABS))
+ if (unlikely(READ_ONCE(ctx->check_cq)))
+ return 1;
+ if (unlikely(!llist_empty(&ctx->work_llist)))
+ return 1;
+ if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL)))
+ return 1;
+ if (unlikely(task_sigpending(current)))
+ return -EINTR;
+ if (unlikely(io_should_wake(iowq)))
+ return 0;
+ if (iowq->timeout == KTIME_MAX)
+ schedule();
+ else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS))
return -ETIME;
-
- /*
- * Run task_work after scheduling. If we got woken because of
- * task_work being processed, run it now rather than let the caller
- * do another wait loop.
- */
- ret = io_run_task_work_sig(ctx);
- return ret < 0 ? ret : 1;
+ return 0;
}
/*
@@ -2510,23 +2524,17 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
{
struct io_wait_queue iowq;
struct io_rings *rings = ctx->rings;
- ktime_t timeout = KTIME_MAX;
int ret;
if (!io_allowed_run_tw(ctx))
return -EEXIST;
-
- do {
- /* always run at least 1 task work to process local work */
- ret = io_run_task_work_ctx(ctx);
- if (ret < 0)
- return ret;
- io_cqring_overflow_flush(ctx);
-
- /* if user messes with these they will just get an early return */
- if (__io_cqring_events_user(ctx) >= min_events)
- return 0;
- } while (ret > 0);
+ if (!llist_empty(&ctx->work_llist))
+ io_run_local_work(ctx);
+ io_run_task_work();
+ io_cqring_overflow_flush(ctx);
+ /* if user messes with these they will just get an early return */
+ if (__io_cqring_events_user(ctx) >= min_events)
+ return 0;
if (sig) {
#ifdef CONFIG_COMPAT
@@ -2541,36 +2549,69 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return ret;
}
- if (uts) {
- struct timespec64 ts;
-
- if (get_timespec64(&ts, uts))
- return -EFAULT;
- timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
- }
-
init_waitqueue_func_entry(&iowq.wq, io_wake_function);
iowq.wq.private = current;
INIT_LIST_HEAD(&iowq.wq.entry);
iowq.ctx = ctx;
iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
+ iowq.timeout = KTIME_MAX;
+
+ if (uts) {
+ struct timespec64 ts;
+
+ if (get_timespec64(&ts, uts))
+ return -EFAULT;
+ iowq.timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
+ }
trace_io_uring_cqring_wait(ctx, min_events);
do {
- if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) {
- finish_wait(&ctx->cq_wait, &iowq.wq);
- io_cqring_do_overflow_flush(ctx);
+ unsigned long check_cq;
+
+ if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
+ WRITE_ONCE(ctx->cq_waiting, 1);
+ set_current_state(TASK_INTERRUPTIBLE);
+ } else {
+ prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
+ TASK_INTERRUPTIBLE);
+ }
+
+ ret = io_cqring_wait_schedule(ctx, &iowq);
+ __set_current_state(TASK_RUNNING);
+ WRITE_ONCE(ctx->cq_waiting, 0);
+
+ if (ret < 0)
+ break;
+ /*
+ * Run task_work after scheduling and before io_should_wake().
+ * If we got woken because of task_work being processed, run it
+ * now rather than let the caller do another wait loop.
+ */
+ io_run_task_work();
+ if (!llist_empty(&ctx->work_llist))
+ io_run_local_work(ctx);
+
+ check_cq = READ_ONCE(ctx->check_cq);
+ if (unlikely(check_cq)) {
+ /* let the caller flush overflows, retry */
+ if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
+ io_cqring_do_overflow_flush(ctx);
+ if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) {
+ ret = -EBADR;
+ break;
+ }
}
- prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
- TASK_INTERRUPTIBLE);
- ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
- if (__io_cqring_events_user(ctx) >= min_events)
+
+ if (io_should_wake(&iowq)) {
+ ret = 0;
break;
+ }
cond_resched();
- } while (ret > 0);
+ } while (1);
- finish_wait(&ctx->cq_wait, &iowq.wq);
+ if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
+ finish_wait(&ctx->cq_wait, &iowq.wq);
restore_saved_sigmask_unless(ret == -EINTR);
return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
@@ -2764,12 +2805,54 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
kfree(ctx);
}
+static __cold void io_activate_pollwq_cb(struct callback_head *cb)
+{
+ struct io_ring_ctx *ctx = container_of(cb, struct io_ring_ctx,
+ poll_wq_task_work);
+
+ mutex_lock(&ctx->uring_lock);
+ ctx->poll_activated = true;
+ mutex_unlock(&ctx->uring_lock);
+
+ /*
+ * Wake ups for some events between start of polling and activation
+ * might've been lost due to loose synchronisation.
+ */
+ wake_up_all(&ctx->poll_wq);
+ percpu_ref_put(&ctx->refs);
+}
+
+static __cold void io_activate_pollwq(struct io_ring_ctx *ctx)
+{
+ spin_lock(&ctx->completion_lock);
+ /* already activated or in progress */
+ if (ctx->poll_activated || ctx->poll_wq_task_work.func)
+ goto out;
+ if (WARN_ON_ONCE(!ctx->task_complete))
+ goto out;
+ if (!ctx->submitter_task)
+ goto out;
+ /*
+ * with ->submitter_task only the submitter task completes requests, we
+ * only need to sync with it, which is done by injecting a tw
+ */
+ init_task_work(&ctx->poll_wq_task_work, io_activate_pollwq_cb);
+ percpu_ref_get(&ctx->refs);
+ if (task_work_add(ctx->submitter_task, &ctx->poll_wq_task_work, TWA_SIGNAL))
+ percpu_ref_put(&ctx->refs);
+out:
+ spin_unlock(&ctx->completion_lock);
+}
+
static __poll_t io_uring_poll(struct file *file, poll_table *wait)
{
struct io_ring_ctx *ctx = file->private_data;
__poll_t mask = 0;
- poll_wait(file, &ctx->cq_wait, wait);
+ if (unlikely(!ctx->poll_activated))
+ io_activate_pollwq(ctx);
+
+ poll_wait(file, &ctx->poll_wq, wait);
/*
* synchronizes with barrier from wq_has_sleeper call in
* io_commit_cqring
@@ -3058,7 +3141,8 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
}
}
- if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
+ if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
+ io_allowed_defer_tw_run(ctx))
ret |= io_run_local_work(ctx) > 0;
ret |= io_cancel_defer_files(ctx, task, cancel_all);
mutex_lock(&ctx->uring_lock);
@@ -3575,6 +3659,13 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
ctx->task_complete = true;
/*
+ * lazy poll_wq activation relies on ->task_complete for synchronisation
+ * purposes, see io_activate_pollwq()
+ */
+ if (!ctx->task_complete)
+ ctx->poll_activated = true;
+
+ /*
* When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
* space applications don't need to do io completion events
* polling again, they can rely on io_sq_thread to do polling
@@ -3867,8 +3958,15 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx)
if (!(ctx->flags & IORING_SETUP_R_DISABLED))
return -EBADFD;
- if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task)
+ if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
ctx->submitter_task = get_task_struct(current);
+ /*
+ * Lazy activation attempts would fail if it was polled before
+ * submitter_task is set.
+ */
+ if (wq_has_sleeper(&ctx->poll_wq))
+ io_activate_pollwq(ctx);
+ }
if (ctx->restrictions.registered)
ctx->restricted = 1;
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index ab4b2a1c3b7e..5113e0ddb01d 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -28,8 +28,6 @@ enum {
struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow);
bool io_req_cqe_overflow(struct io_kiocb *req);
int io_run_task_work_sig(struct io_ring_ctx *ctx);
-int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked);
-int io_run_local_work(struct io_ring_ctx *ctx);
void io_req_defer_failed(struct io_kiocb *req, s32 res);
void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags);
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags);
@@ -222,6 +220,13 @@ static inline void io_commit_cqring(struct io_ring_ctx *ctx)
smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
}
+static inline void io_poll_wq_wake(struct io_ring_ctx *ctx)
+{
+ if (wq_has_sleeper(&ctx->poll_wq))
+ __wake_up(&ctx->poll_wq, TASK_NORMAL, 0,
+ poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
+}
+
/* requires smb_mb() prior, see wq_has_sleeper() */
static inline void __io_cqring_wake(struct io_ring_ctx *ctx)
{
@@ -284,42 +289,6 @@ static inline bool io_task_work_pending(struct io_ring_ctx *ctx)
return task_work_pending(current) || !wq_list_empty(&ctx->work_llist);
}
-static inline int io_run_task_work_ctx(struct io_ring_ctx *ctx)
-{
- int ret = 0;
- int ret2;
-
- if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
- ret = io_run_local_work(ctx);
-
- /* want to run this after in case more is added */
- ret2 = io_run_task_work();
-
- /* Try propagate error in favour of if tasks were run,
- * but still make sure to run them if requested
- */
- if (ret >= 0)
- ret += ret2;
-
- return ret;
-}
-
-static inline int io_run_local_work_locked(struct io_ring_ctx *ctx)
-{
- bool locked;
- int ret;
-
- if (llist_empty(&ctx->work_llist))
- return 0;
-
- locked = true;
- ret = __io_run_local_work(ctx, &locked);
- /* shouldn't happen! */
- if (WARN_ON_ONCE(!locked))
- mutex_lock(&ctx->uring_lock);
- return ret;
-}
-
static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
{
if (!*locked) {
@@ -345,7 +314,8 @@ static inline void io_req_complete_defer(struct io_kiocb *req)
static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
- if (unlikely(ctx->off_timeout_used || ctx->drain_active || ctx->has_evfd))
+ if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
+ ctx->has_evfd || ctx->poll_activated))
__io_commit_cqring_flush(ctx);
}
@@ -387,6 +357,11 @@ static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
return container_of(node, struct io_kiocb, comp_list);
}
+static inline bool io_allowed_defer_tw_run(struct io_ring_ctx *ctx)
+{
+ return likely(ctx->submitter_task == current);
+}
+
static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx)
{
return likely(!(ctx->flags & IORING_SETUP_DEFER_TASKRUN) ||
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index 2d3cd945a531..dc233f72a541 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -13,6 +13,11 @@
#include "filetable.h"
#include "msg_ring.h"
+
+/* All valid masks for MSG_RING */
+#define IORING_MSG_RING_MASK (IORING_MSG_RING_CQE_SKIP | \
+ IORING_MSG_RING_FLAGS_PASS)
+
struct io_msg {
struct file *file;
struct file *src_file;
@@ -21,7 +26,10 @@ struct io_msg {
u32 len;
u32 cmd;
u32 src_fd;
- u32 dst_fd;
+ union {
+ u32 dst_fd;
+ u32 cqe_flags;
+ };
u32 flags;
};
@@ -57,10 +65,17 @@ static int io_msg_ring_data(struct io_kiocb *req)
{
struct io_ring_ctx *target_ctx = req->file->private_data;
struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
+ u32 flags = 0;
- if (msg->src_fd || msg->dst_fd || msg->flags)
+ if (msg->src_fd || msg->flags & ~IORING_MSG_RING_FLAGS_PASS)
return -EINVAL;
+ if (!(msg->flags & IORING_MSG_RING_FLAGS_PASS) && msg->dst_fd)
+ return -EINVAL;
+
+ if (msg->flags & IORING_MSG_RING_FLAGS_PASS)
+ flags = msg->cqe_flags;
+
if (target_ctx->task_complete && current != target_ctx->submitter_task) {
init_task_work(&msg->tw, io_msg_tw_complete);
if (task_work_add(target_ctx->submitter_task, &msg->tw,
@@ -71,7 +86,7 @@ static int io_msg_ring_data(struct io_kiocb *req)
return IOU_ISSUE_SKIP_COMPLETE;
}
- if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0))
+ if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags))
return 0;
return -EOVERFLOW;
@@ -207,7 +222,7 @@ int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
msg->src_fd = READ_ONCE(sqe->addr3);
msg->dst_fd = READ_ONCE(sqe->file_index);
msg->flags = READ_ONCE(sqe->msg_ring_flags);
- if (msg->flags & ~IORING_MSG_RING_CQE_SKIP)
+ if (msg->flags & ~IORING_MSG_RING_MASK)
return -EINVAL;
return 0;
diff --git a/io_uring/net.c b/io_uring/net.c
index fbc34a7c2743..0fb9dcab20b7 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -181,7 +181,7 @@ static int io_setup_async_msg(struct io_kiocb *req,
if (async_msg->msg.msg_name)
async_msg->msg.msg_name = &async_msg->addr;
/* if were using fast_iov, set it to the new one */
- if (!kmsg->free_iov) {
+ if (iter_is_iovec(&kmsg->msg.msg_iter) && !kmsg->free_iov) {
size_t fast_idx = kmsg->msg.msg_iter.iov - kmsg->fast_iov;
async_msg->msg.msg_iter.iov = &async_msg->fast_iov[fast_idx];
}
@@ -344,7 +344,6 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
struct sockaddr_storage __address;
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
struct msghdr msg;
- struct iovec iov;
struct socket *sock;
unsigned flags;
int min_ret = 0;
@@ -378,7 +377,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(!sock))
return -ENOTSOCK;
- ret = import_single_range(ITER_SOURCE, sr->buf, sr->len, &iov, &msg.msg_iter);
+ ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &msg.msg_iter);
if (unlikely(ret))
return ret;
@@ -764,10 +763,7 @@ retry_multishot:
}
}
- kmsg->fast_iov[0].iov_base = buf;
- kmsg->fast_iov[0].iov_len = len;
- iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, kmsg->fast_iov, 1,
- len);
+ iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, buf, len);
}
flags = sr->msg_flags;
@@ -835,7 +831,6 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
struct msghdr msg;
struct socket *sock;
- struct iovec iov;
unsigned int cflags;
unsigned flags;
int ret, min_ret = 0;
@@ -863,7 +858,7 @@ retry_multishot:
sr->buf = buf;
}
- ret = import_single_range(ITER_DEST, sr->buf, len, &iov, &msg.msg_iter);
+ ret = import_ubuf(ITER_DEST, sr->buf, len, &msg.msg_iter);
if (unlikely(ret))
goto out_free;
@@ -1074,7 +1069,6 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
struct sockaddr_storage __address;
struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
struct msghdr msg;
- struct iovec iov;
struct socket *sock;
unsigned msg_flags;
int ret, min_ret = 0;
@@ -1116,8 +1110,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
msg.sg_from_iter = io_sg_from_iter;
} else {
io_notif_set_extended(zc->notif);
- ret = import_single_range(ITER_SOURCE, zc->buf, zc->len, &iov,
- &msg.msg_iter);
+ ret = import_ubuf(ITER_SOURCE, zc->buf, zc->len, &msg.msg_iter);
if (unlikely(ret))
return ret;
ret = io_notif_account_mem(zc->notif, zc->len);
diff --git a/io_uring/poll.c b/io_uring/poll.c
index ee7da6150ec4..32e5fc8365e6 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -223,21 +223,22 @@ enum {
IOU_POLL_DONE = 0,
IOU_POLL_NO_ACTION = 1,
IOU_POLL_REMOVE_POLL_USE_RES = 2,
+ IOU_POLL_REISSUE = 3,
};
/*
* All poll tw should go through this. Checks for poll events, manages
* references, does rewait, etc.
*
- * Returns a negative error on failure. IOU_POLL_NO_ACTION when no action require,
- * which is either spurious wakeup or multishot CQE is served.
- * IOU_POLL_DONE when it's done with the request, then the mask is stored in req->cqe.res.
- * IOU_POLL_REMOVE_POLL_USE_RES indicates to remove multishot poll and that the result
- * is stored in req->cqe.
+ * Returns a negative error on failure. IOU_POLL_NO_ACTION when no action
+ * require, which is either spurious wakeup or multishot CQE is served.
+ * IOU_POLL_DONE when it's done with the request, then the mask is stored in
+ * req->cqe.res. IOU_POLL_REMOVE_POLL_USE_RES indicates to remove multishot
+ * poll and that the result is stored in req->cqe.
*/
static int io_poll_check_events(struct io_kiocb *req, bool *locked)
{
- int v, ret;
+ int v;
/* req->task == current here, checking PF_EXITING is safe */
if (unlikely(req->task->flags & PF_EXITING))
@@ -276,10 +277,15 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked)
if (!req->cqe.res) {
struct poll_table_struct pt = { ._key = req->apoll_events };
req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events;
+ /*
+ * We got woken with a mask, but someone else got to
+ * it first. The above vfs_poll() doesn't add us back
+ * to the waitqueue, so if we get nothing back, we
+ * should be safe and attempt a reissue.
+ */
+ if (unlikely(!req->cqe.res))
+ return IOU_POLL_REISSUE;
}
-
- if ((unlikely(!req->cqe.res)))
- continue;
if (req->apoll_events & EPOLLONESHOT)
return IOU_POLL_DONE;
@@ -294,7 +300,7 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked)
return IOU_POLL_REMOVE_POLL_USE_RES;
}
} else {
- ret = io_poll_issue(req, locked);
+ int ret = io_poll_issue(req, locked);
if (ret == IOU_STOP_MULTISHOT)
return IOU_POLL_REMOVE_POLL_USE_RES;
if (ret < 0)
@@ -330,6 +336,9 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
poll = io_kiocb_to_cmd(req, struct io_poll);
req->cqe.res = mangle_poll(req->cqe.res & poll->events);
+ } else if (ret == IOU_POLL_REISSUE) {
+ io_req_task_submit(req, locked);
+ return;
} else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) {
req->cqe.res = ret;
req_set_fail(req);
@@ -342,7 +351,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
if (ret == IOU_POLL_REMOVE_POLL_USE_RES)
io_req_task_complete(req, locked);
- else if (ret == IOU_POLL_DONE)
+ else if (ret == IOU_POLL_DONE || ret == IOU_POLL_REISSUE)
io_req_task_submit(req, locked);
else
io_req_defer_failed(req, ret);
@@ -533,6 +542,14 @@ static bool io_poll_can_finish_inline(struct io_kiocb *req,
return pt->owning || io_poll_get_ownership(req);
}
+static void io_poll_add_hash(struct io_kiocb *req)
+{
+ if (req->flags & REQ_F_HASH_LOCKED)
+ io_poll_req_insert_locked(req);
+ else
+ io_poll_req_insert(req);
+}
+
/*
* Returns 0 when it's handed over for polling. The caller owns the requests if
* it returns non-zero, but otherwise should not touch it. Negative values
@@ -591,18 +608,17 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
if (mask &&
((poll->events & (EPOLLET|EPOLLONESHOT)) == (EPOLLET|EPOLLONESHOT))) {
- if (!io_poll_can_finish_inline(req, ipt))
+ if (!io_poll_can_finish_inline(req, ipt)) {
+ io_poll_add_hash(req);
return 0;
+ }
io_poll_remove_entries(req);
ipt->result_mask = mask;
/* no one else has access to the req, forget about the ref */
return 1;
}
- if (req->flags & REQ_F_HASH_LOCKED)
- io_poll_req_insert_locked(req);
- else
- io_poll_req_insert(req);
+ io_poll_add_hash(req);
if (mask && (poll->events & EPOLLET) &&
io_poll_can_finish_inline(req, ipt)) {
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 8227af2e1c0f..908175d844e1 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -391,7 +391,7 @@ static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req,
rw->len = sqe_len;
}
- ret = import_single_range(ddir, buf, sqe_len, s->fast_iov, iter);
+ ret = import_ubuf(ddir, buf, sqe_len, iter);
if (ret)
return ERR_PTR(ret);
return NULL;
@@ -410,7 +410,7 @@ static inline int io_import_iovec(int rw, struct io_kiocb *req,
unsigned int issue_flags)
{
*iovec = __io_import_iovec(rw, req, s, issue_flags);
- if (unlikely(IS_ERR(*iovec)))
+ if (IS_ERR(*iovec))
return PTR_ERR(*iovec);
iov_iter_save_state(&s->iter, &s->iter_state);
@@ -450,7 +450,10 @@ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter)
struct iovec iovec;
ssize_t nr;
- if (!iov_iter_is_bvec(iter)) {
+ if (iter_is_ubuf(iter)) {
+ iovec.iov_base = iter->ubuf + iter->iov_offset;
+ iovec.iov_len = iov_iter_count(iter);
+ } else if (!iov_iter_is_bvec(iter)) {
iovec = iov_iter_iovec(iter);
} else {
iovec.iov_base = u64_to_user_ptr(rw->addr);
@@ -495,7 +498,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
io->free_iovec = iovec;
io->bytes_done = 0;
/* can only be fixed buffers, no need to do anything */
- if (iov_iter_is_bvec(iter))
+ if (iov_iter_is_bvec(iter) || iter_is_ubuf(iter))
return;
if (!iovec) {
unsigned iov_off = 0;
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index f9a3ff37ecd1..d9b3332c8405 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1877,6 +1877,17 @@ int import_single_range(int rw, void __user *buf, size_t len,
}
EXPORT_SYMBOL(import_single_range);
+int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i)
+{
+ if (len > MAX_RW_COUNT)
+ len = MAX_RW_COUNT;
+ if (unlikely(!access_ok(buf, len)))
+ return -EFAULT;
+
+ iov_iter_ubuf(i, rw, buf, len);
+ return 0;
+}
+
/**
* iov_iter_restore() - Restore a &struct iov_iter to the same state as when
* iov_iter_save_state() was called.
@@ -1891,8 +1902,8 @@ EXPORT_SYMBOL(import_single_range);
*/
void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
{
- if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i)) &&
- !iov_iter_is_kvec(i) && !iter_is_ubuf(i))
+ if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) &&
+ !iter_is_ubuf(i)) && !iov_iter_is_kvec(i))
return;
i->iov_offset = state->iov_offset;
i->count = state->count;