From e2033e33cb3821c26d4f9e70677910827d3b7885 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stefan=20B=C3=BChler?=
Date: Sat, 11 May 2019 19:08:01 +0200
Subject: io_uring: fix race condition reading SQE data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When punting to workers, the SQE gets copied after the initial try.
There is a race condition between reading the SQE data for the initial
try and copying it when punting it to the workers.

For example, io_rw_done() calls kiocb->ki_complete even if the request
was prepared for IORING_OP_FSYNC (where ki_complete would be NULL).

The easiest solution for now is to always prepare again in the worker.
req->file is safe to prepare though, as long as it is checked before
use.

Signed-off-by: Stefan Bühler
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 17 ++---------------
 1 file changed, 2 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 48ea3977012a..576d9c652b4c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -329,9 +329,8 @@ struct io_kiocb {
 #define REQ_F_IOPOLL_COMPLETED  2   /* polled IO has completed */
 #define REQ_F_FIXED_FILE        4   /* ctx owns file */
 #define REQ_F_SEQ_PREV          8   /* sequential with previous */
-#define REQ_F_PREPPED           16  /* prep already done */
-#define REQ_F_IO_DRAIN          32  /* drain existing IO first */
-#define REQ_F_IO_DRAINED        64  /* drain done */
+#define REQ_F_IO_DRAIN          16  /* drain existing IO first */
+#define REQ_F_IO_DRAINED        32  /* drain done */
     u64         user_data;
     u32         error;      /* iopoll result from callback */
     u32         sequence;
@@ -896,9 +895,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
     if (!req->file)
         return -EBADF;
 
-    /* For -EAGAIN retry, everything is already prepped */
-    if (req->flags & REQ_F_PREPPED)
-        return 0;
     if (force_nonblock && !io_file_supports_async(req->file))
         force_nonblock = false;
 
@@ -941,7 +937,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
             return -EINVAL;
         kiocb->ki_complete = io_complete_rw;
     }
-    req->flags |= REQ_F_PREPPED;
     return 0;
 }
 
@@ -1227,16 +1222,12 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
     if (!req->file)
         return -EBADF;
 
-    /* Prep already done (EAGAIN retry) */
-    if (req->flags & REQ_F_PREPPED)
-        return 0;
 
     if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
         return -EINVAL;
     if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
         return -EINVAL;
 
-    req->flags |= REQ_F_PREPPED;
     return 0;
 }
 
@@ -1277,16 +1268,12 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
     if (!req->file)
         return -EBADF;
 
-    /* Prep already done (EAGAIN retry) */
-    if (req->flags & REQ_F_PREPPED)
-        return 0;
 
     if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
         return -EINVAL;
     if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
         return -EINVAL;
 
-    req->flags |= REQ_F_PREPPED;
     return ret;
 }
 
-- 
cgit v1.2.3


From 44a9bd18a0f06bba19d155aeaa11e2edce898293 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Tue, 14 May 2019 20:00:30 -0600
Subject: io_uring: fix failure to verify SQ_AFF cpu

The test case we have is rightfully failing with the current kernel:

io_uring_setup(1, 0x7ffe2cafebe0), flags: IORING_SETUP_SQPOLL|IORING_SETUP_SQ_AFF, resv: 0x00000000 0x00000000 0x00000000 0x00000000 0x00000000, sq_thread_cpu: 4
expected -1, got 3

This is in a VM, and CPU3 is the last valid one, hence asking for 4
should fail the setup with -EINVAL, not succeed.

The problem is that we're using array_index_nospec() with nr_cpu_ids as
the bound, so the out-of-range value wraps and we end up using CPU0
instead of rejecting CPU4.
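For illustration only, the rejected setup can be exercised from userspace with
something along these lines (a sketch, not the actual test case quoted above;
it assumes __NR_io_uring_setup is available via <sys/syscall.h>, that
<linux/io_uring.h> provides these flags, and that the caller has the
privileges IORING_SETUP_SQPOLL requires):

    #include <linux/io_uring.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <string.h>
    #include <stdio.h>
    #include <errno.h>

    int main(void)
    {
        struct io_uring_params p;

        memset(&p, 0, sizeof(p));
        p.flags = IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF;
        p.sq_thread_cpu = 4;    /* one past the last CPU of the 4-CPU VM above */

        /* With the bounds check in place this must fail with EINVAL
         * instead of silently pinning the SQ thread to CPU0. */
        int ret = (int) syscall(__NR_io_uring_setup, 1, &p);
        printf("ret=%d errno=%d\n", ret, ret < 0 ? errno : 0);
        return ret < 0 ? 0 : 1;
    }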
That wrap makes the setup succeed where it should be failing.

We don't need to use array_index_nospec() as we're not indexing any
array with this. Instead just compare with nr_cpu_ids directly. This
is fine as we're checking with cpu_online() afterwards.

Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 576d9c652b4c..249a1e4e60e6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2454,10 +2454,11 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
         ctx->sq_thread_idle = HZ;
 
     if (p->flags & IORING_SETUP_SQ_AFF) {
-        int cpu = array_index_nospec(p->sq_thread_cpu,
-                        nr_cpu_ids);
+        int cpu = p->sq_thread_cpu;
 
         ret = -EINVAL;
+        if (cpu >= nr_cpu_ids)
+            goto err;
         if (!cpu_online(cpu))
             goto err;
 
-- 
cgit v1.2.3


From c71ffb673cd9bb2ddc575ede9055f265b2535690 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 13 May 2019 20:58:29 -0600
Subject: io_uring: remove 'ev_flags' argument

We always pass in 0 for the cqe flags argument, since the support for
"this read hit page cache" hint was dropped.

Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 249a1e4e60e6..ac0407693834 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -489,7 +489,7 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
 }
 
 static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
-                 long res, unsigned ev_flags)
+                 long res)
 {
     struct io_uring_cqe *cqe;
 
@@ -502,7 +502,7 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
     if (cqe) {
         WRITE_ONCE(cqe->user_data, ki_user_data);
         WRITE_ONCE(cqe->res, res);
-        WRITE_ONCE(cqe->flags, ev_flags);
+        WRITE_ONCE(cqe->flags, 0);
     } else {
         unsigned overflow = READ_ONCE(ctx->cq_ring->overflow);
 
@@ -521,12 +521,12 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 }
 
 static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data,
-                long res, unsigned ev_flags)
+                long res)
 {
     unsigned long flags;
 
     spin_lock_irqsave(&ctx->completion_lock, flags);
-    io_cqring_fill_event(ctx, user_data, res, ev_flags);
+    io_cqring_fill_event(ctx, user_data, res);
     io_commit_cqring(ctx);
     spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
@@ -628,7 +628,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
         req = list_first_entry(done, struct io_kiocb, list);
         list_del(&req->list);
 
-        io_cqring_fill_event(ctx, req->user_data, req->error, 0);
+        io_cqring_fill_event(ctx, req->user_data, req->error);
         (*nr_events)++;
 
         if (refcount_dec_and_test(&req->refs)) {
@@ -776,7 +776,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 
     kiocb_end_write(kiocb);
 
-    io_cqring_add_event(req->ctx, req->user_data, res, 0);
+    io_cqring_add_event(req->ctx, req->user_data, res);
     io_put_req(req);
 }
 
@@ -1211,7 +1211,7 @@ static int io_nop(struct io_kiocb *req, u64 user_data)
     if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
         return -EINVAL;
 
-    io_cqring_add_event(ctx, user_data, err, 0);
+    io_cqring_add_event(ctx, user_data, err);
     io_put_req(req);
     return 0;
 }
@@ -1256,7 +1256,7 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                 end > 0 ? end : LLONG_MAX,
                 fsync_flags & IORING_FSYNC_DATASYNC);
 
-    io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
+    io_cqring_add_event(req->ctx, sqe->user_data, ret);
     io_put_req(req);
     return 0;
 }
@@ -1300,7 +1300,7 @@ static int io_sync_file_range(struct io_kiocb *req,
 
     ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags);
 
-    io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
+    io_cqring_add_event(req->ctx, sqe->user_data, ret);
     io_put_req(req);
     return 0;
 }
@@ -1358,7 +1358,7 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
     }
     spin_unlock_irq(&ctx->completion_lock);
 
-    io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
+    io_cqring_add_event(req->ctx, sqe->user_data, ret);
     io_put_req(req);
     return 0;
 }
@@ -1367,7 +1367,7 @@ static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req,
                  __poll_t mask)
 {
     req->poll.done = true;
-    io_cqring_fill_event(ctx, req->user_data, mangle_poll(mask), 0);
+    io_cqring_fill_event(ctx, req->user_data, mangle_poll(mask));
     io_commit_cqring(ctx);
 }
 
@@ -1687,7 +1687,7 @@ restart:
         io_put_req(req);
 
         if (ret) {
-            io_cqring_add_event(ctx, sqe->user_data, ret, 0);
+            io_cqring_add_event(ctx, sqe->user_data, ret);
             io_put_req(req);
         }
 
@@ -1992,7 +1992,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
                 continue;
             }
 
-            io_cqring_add_event(ctx, sqes[i].sqe->user_data, ret, 0);
+            io_cqring_add_event(ctx, sqes[i].sqe->user_data, ret);
         }
 
     if (statep)
@@ -2157,7 +2157,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
 
         ret = io_submit_sqe(ctx, &s, statep);
         if (ret)
-            io_cqring_add_event(ctx, s.sqe->user_data, ret, 0);
+            io_cqring_add_event(ctx, s.sqe->user_data, ret);
     }
 
     io_commit_sqring(ctx);
-- 
cgit v1.2.3


From 2bbcd6d3b36a75a19be4917807f54ae32dd26aba Mon Sep 17 00:00:00 2001
From: Roman Penyaev
Date: Thu, 16 May 2019 10:53:57 +0200
Subject: io_uring: fix infinite wait in kthread_park() on io_finish_async()

This fixes a couple of races which lead to an infinite wait for park
completion, with the following backtraces:

[20801.303319] Call Trace:
[20801.303321]  ? __schedule+0x284/0x650
[20801.303323]  schedule+0x33/0xc0
[20801.303324]  schedule_timeout+0x1bc/0x210
[20801.303326]  ? schedule+0x3d/0xc0
[20801.303327]  ? schedule_timeout+0x1bc/0x210
[20801.303329]  ? preempt_count_add+0x79/0xb0
[20801.303330]  wait_for_completion+0xa5/0x120
[20801.303331]  ? wake_up_q+0x70/0x70
[20801.303333]  kthread_park+0x48/0x80
[20801.303335]  io_finish_async+0x2c/0x70
[20801.303336]  io_ring_ctx_wait_and_kill+0x95/0x180
[20801.303338]  io_uring_release+0x1c/0x20
[20801.303339]  __fput+0xad/0x210
[20801.303341]  task_work_run+0x8f/0xb0
[20801.303342]  exit_to_usermode_loop+0xa0/0xb0
[20801.303343]  do_syscall_64+0xe0/0x100
[20801.303349]  entry_SYSCALL_64_after_hwframe+0x44/0xa9

[20801.303380] Call Trace:
[20801.303383]  ? __schedule+0x284/0x650
[20801.303384]  schedule+0x33/0xc0
[20801.303386]  io_sq_thread+0x38a/0x410
[20801.303388]  ? __switch_to_asm+0x40/0x70
[20801.303390]  ? wait_woken+0x80/0x80
[20801.303392]  ? _raw_spin_lock_irqsave+0x17/0x40
[20801.303394]  ? io_submit_sqes+0x120/0x120
[20801.303395]  kthread+0x112/0x130
[20801.303396]  ? kthread_create_on_node+0x60/0x60
[20801.303398]  ret_from_fork+0x35/0x40

o kthread_park() waits for park completion, so the io_sq_thread() loop
  should check kthread_should_park() along with kthread_should_stop(),
  otherwise if kthread_park() is called before prepare_to_wait() the
  following schedule() never returns:

  CPU#0                                 CPU#1

  io_sq_thread_stop():                  io_sq_thread():

                                        while(!kthread_should_stop() && !ctx->sqo_stop) {

      ctx->sqo_stop = 1;
      kthread_park()

                                            prepare_to_wait();
                                            if (kthread_should_stop()) {
                                            }
                                            schedule();   <<< nobody checks park flag,
                                                          <<< so schedule and never return

o If the ctx->sqo_stop flag is observed by the io_sq_thread() loop, it
  is quite possible that the kthread_should_park() check and the
  following kthread_parkme() are never called, because kthread_park()
  has not yet been called; a few moments later it is called and waits
  there for park completion, which never happens, because the kthread
  has already exited:

  CPU#0                                 CPU#1

  io_sq_thread_stop():                  io_sq_thread():

      ctx->sqo_stop = 1;
                                        while(!kthread_should_stop() && !ctx->sqo_stop) {
                                            <<< observe sqo_stop and exit the loop
                                        }

                                        if (kthread_should_park())
                                            kthread_parkme();  <<< never called, since was
                                                               <<< never parked

      kthread_park()                    <<< waits forever for park completion

In the current patch we quit the loop only on the kthread_should_park()
check (kthread_park() is synchronous, so kthread_should_stop() is never
observed), and we abandon the ->sqo_stop flag, since it is racy. At the
end of io_sq_thread() we unconditionally call kthread_parkme(), since
the loop is only ever exited because of the park flag.

Signed-off-by: Roman Penyaev
Cc: Jens Axboe
Cc: linux-block@vger.kernel.org
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ac0407693834..67d1aae349d7 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -231,7 +231,6 @@ struct io_ring_ctx {
     struct task_struct  *sqo_thread;    /* if using sq thread polling */
     struct mm_struct    *sqo_mm;
     wait_queue_head_t   sqo_wait;
-    unsigned        sqo_stop;
 
     struct {
         /* CQ ring */
@@ -2015,7 +2014,7 @@ static int io_sq_thread(void *data)
     set_fs(USER_DS);
 
     timeout = inflight = 0;
-    while (!kthread_should_stop() && !ctx->sqo_stop) {
+    while (!kthread_should_park()) {
         bool all_fixed, mm_fault = false;
         int i;
 
@@ -2077,7 +2076,7 @@ static int io_sq_thread(void *data)
             smp_mb();
 
             if (!io_get_sqring(ctx, &sqes[0])) {
-                if (kthread_should_stop()) {
+                if (kthread_should_park()) {
                     finish_wait(&ctx->sqo_wait, &wait);
                     break;
                 }
@@ -2127,8 +2126,7 @@ static int io_sq_thread(void *data)
         mmput(cur_mm);
     }
 
-    if (kthread_should_park())
-        kthread_parkme();
+    kthread_parkme();
 
     return 0;
 }
@@ -2260,8 +2258,11 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 static void io_sq_thread_stop(struct io_ring_ctx *ctx)
 {
     if (ctx->sqo_thread) {
-        ctx->sqo_stop = 1;
-        mb();
+        /*
+         * The park is a bit of a work-around, without it we get
+         * warning spews on shutdown with SQPOLL set and affinity
+         * set to a single CPU.
+         */
         kthread_park(ctx->sqo_thread);
         kthread_stop(ctx->sqo_thread);
         ctx->sqo_thread = NULL;
-- 
cgit v1.2.3


From dc6ce4bc2b355a47f225a0205046b3ebf29a7f72 Mon Sep 17 00:00:00 2001
From: Jackie Liu
Date: Thu, 16 May 2019 11:46:30 +0800
Subject: io_uring: adjust smp_rmb inside io_cqring_events

Wherever io_cqring_events() is used, an smp_rmb() is required first, so
keep the smp_rmb() inside io_cqring_events() itself and let the callers
rely on it.
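The practical benefit is that callers of io_cqring_events() can no longer
forget the barrier. As a hedged illustration (the helper below is hypothetical
and not part of this patch), any later consumer of the CQ ring simply inherits
the ordering:

    /*
     * Hypothetical example of a future caller: io_cqring_events() now
     * issues the smp_rmb() itself (pairing with the ordering the
     * completion side provides when it updates the CQ tail, see the
     * comment at the top of fs/io_uring.c), so this caller needs no
     * barrier of its own before looking at the ring.
     */
    static bool io_cqring_has_work(struct io_ring_ctx *ctx)
    {
        return io_cqring_events(ctx->cq_ring) != 0;
    }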
Signed-off-by: Jackie Liu
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 67d1aae349d7..9cc7a101ef2a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2167,6 +2167,8 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
 
 static unsigned io_cqring_events(struct io_cq_ring *ring)
 {
+    /* See comment at the top of this file */
+    smp_rmb();
     return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head);
 }
 
@@ -2182,8 +2184,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
     DEFINE_WAIT(wait);
     int ret;
 
-    /* See comment at the top of this file */
-    smp_rmb();
     if (io_cqring_events(ring) >= min_events)
         return 0;
 
@@ -2205,8 +2205,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
         prepare_to_wait(&ctx->wait, &wait, TASK_INTERRUPTIBLE);
 
         ret = 0;
-        /* See comment at the top of this file */
-        smp_rmb();
         if (io_cqring_events(ring) >= min_events)
             break;
 
-- 
cgit v1.2.3


From fdb288a679cdf6a71f3c1ae6f348ba4dae742681 Mon Sep 17 00:00:00 2001
From: Jackie Liu
Date: Thu, 16 May 2019 11:46:31 +0800
Subject: io_uring: use wait_event_interruptible for cq_wait conditional wait

The previous patch ensured that io_cqring_events() contains the needed
smp_rmb() memory barrier, so we can now use wait_event_interruptible()
and keep the code simple.

Signed-off-by: Jackie Liu
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 17 ++---------------
 1 file changed, 2 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9cc7a101ef2a..383d208ca0d2 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2181,7 +2181,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 {
     struct io_cq_ring *ring = ctx->cq_ring;
     sigset_t ksigmask, sigsaved;
-    DEFINE_WAIT(wait);
     int ret;
 
     if (io_cqring_events(ring) >= min_events)
         return 0;
@@ -2201,21 +2200,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
             return ret;
     }
 
-    do {
-        prepare_to_wait(&ctx->wait, &wait, TASK_INTERRUPTIBLE);
-
-        ret = 0;
-        if (io_cqring_events(ring) >= min_events)
-            break;
-
-        schedule();
-
+    ret = wait_event_interruptible(ctx->wait, io_cqring_events(ring) >= min_events);
+    if (ret == -ERESTARTSYS)
         ret = -EINTR;
-        if (signal_pending(current))
-            break;
-    } while (1);
-
-    finish_wait(&ctx->wait, &wait);
 
     if (sig)
         restore_user_sigmask(sig, &sigsaved);
-- 
cgit v1.2.3
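As a closing note on the last patch: wait_event_interruptible() expands to
roughly the open-coded loop it replaces, which is why the wait condition
itself has to carry any barrier it needs. A simplified sketch of the pattern
(not the kernel's exact macro expansion):

    /*
     * Roughly what ret = wait_event_interruptible(ctx->wait, cond) does:
     * the condition is re-evaluated after every wakeup, so with the
     * smp_rmb() folded into io_cqring_events() every re-check is ordered
     * correctly without the caller adding anything.
     */
    DEFINE_WAIT(wait);
    int ret = 0;

    for (;;) {
        prepare_to_wait(&ctx->wait, &wait, TASK_INTERRUPTIBLE);
        if (io_cqring_events(ring) >= min_events)
            break;
        if (signal_pending(current)) {
            ret = -ERESTARTSYS;
            break;
        }
        schedule();
    }
    finish_wait(&ctx->wait, &wait);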