diff options
Diffstat (limited to 'fs/aio.c')
-rw-r--r-- | fs/aio.c | 360 |
1 files changed, 245 insertions, 115 deletions
@@ -20,7 +20,7 @@ #include <linux/backing-dev.h> #include <linux/uio.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mm.h> @@ -42,7 +42,7 @@ #include <linux/mount.h> #include <asm/kmap_types.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "internal.h" @@ -239,7 +239,12 @@ static struct dentry *aio_mount(struct file_system_type *fs_type, static const struct dentry_operations ops = { .d_dname = simple_dname, }; - return mount_pseudo(fs_type, "aio:", NULL, &ops, AIO_RING_MAGIC); + struct dentry *root = mount_pseudo(fs_type, "aio:", NULL, &ops, + AIO_RING_MAGIC); + + if (!IS_ERR(root)) + root->d_sb->s_iflags |= SB_I_NOEXEC; + return root; } /* aio_setup @@ -269,14 +274,17 @@ __initcall(aio_setup); static void put_aio_ring_file(struct kioctx *ctx) { struct file *aio_ring_file = ctx->aio_ring_file; + struct address_space *i_mapping; + if (aio_ring_file) { - truncate_setsize(aio_ring_file->f_inode, 0); + truncate_setsize(file_inode(aio_ring_file), 0); /* Prevent further access to the kioctx from migratepages */ - spin_lock(&aio_ring_file->f_inode->i_mapping->private_lock); - aio_ring_file->f_inode->i_mapping->private_data = NULL; + i_mapping = aio_ring_file->f_mapping; + spin_lock(&i_mapping->private_lock); + i_mapping->private_data = NULL; ctx->aio_ring_file = NULL; - spin_unlock(&aio_ring_file->f_inode->i_mapping->private_lock); + spin_unlock(&i_mapping->private_lock); fput(aio_ring_file); } @@ -474,7 +482,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) for (i = 0; i < nr_pages; i++) { struct page *page; - page = find_or_create_page(file->f_inode->i_mapping, + page = find_or_create_page(file->f_mapping, i, GFP_HIGHUSER | __GFP_ZERO); if (!page) break; @@ -495,10 +503,15 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) ctx->mmap_size = nr_pages * PAGE_SIZE; pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size); - down_write(&mm->mmap_sem); + if (down_write_killable(&mm->mmap_sem)) { + ctx->mmap_size = 0; + aio_free_ring(ctx); + return -EINTR; + } + ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size, PROT_READ | PROT_WRITE, - MAP_SHARED, 0, &unused); + MAP_SHARED, 0, &unused, NULL); up_write(&mm->mmap_sem); if (IS_ERR((void *)ctx->mmap_base)) { ctx->mmap_size = 0; @@ -1070,6 +1083,18 @@ static void aio_complete(struct kiocb *kiocb, long res, long res2) unsigned tail, pos, head; unsigned long flags; + if (kiocb->ki_flags & IOCB_WRITE) { + struct file *file = kiocb->ki_filp; + + /* + * Tell lockdep we inherited freeze protection from submission + * thread. + */ + if (S_ISREG(file_inode(file)->i_mode)) + __sb_writers_acquired(file_inode(file)->i_sb, SB_FREEZE_WRITE); + file_end_write(file); + } + /* * Special case handling for sync iocbs: * - events go directly into the iocb for fast handling @@ -1266,7 +1291,7 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, struct io_event __user *event, struct timespec __user *timeout) { - ktime_t until = { .tv64 = KTIME_MAX }; + ktime_t until = KTIME_MAX; long ret = 0; if (timeout) { @@ -1292,7 +1317,7 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, * the ringbuffer empty. So in practice we should be ok, but it's * something to be aware of when touching this code. */ - if (until.tv64 == 0) + if (until == 0) aio_read_events(ctx, min_nr, nr, event, &ret); else wait_event_interruptible_hrtimeout(ctx->wait, @@ -1348,6 +1373,39 @@ out: return ret; } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_events, u32 __user *, ctx32p) +{ + struct kioctx *ioctx = NULL; + unsigned long ctx; + long ret; + + ret = get_user(ctx, ctx32p); + if (unlikely(ret)) + goto out; + + ret = -EINVAL; + if (unlikely(ctx || nr_events == 0)) { + pr_debug("EINVAL: ctx %lu nr_events %u\n", + ctx, nr_events); + goto out; + } + + ioctx = ioctx_alloc(nr_events); + ret = PTR_ERR(ioctx); + if (!IS_ERR(ioctx)) { + /* truncating is ok because it's a user address */ + ret = put_user((u32)ioctx->user_id, ctx32p); + if (ret) + kill_ioctx(current->mm, ioctx, NULL); + percpu_ref_put(&ioctx->users); + } + +out: + return ret; +} +#endif + /* sys_io_destroy: * Destroy the aio_context specified. May cancel any outstanding * AIOs and block on completion. Will fail with -ENOSYS if not @@ -1384,128 +1442,111 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) return -EINVAL; } -typedef ssize_t (rw_iter_op)(struct kiocb *, struct iov_iter *); - -static int aio_setup_vectored_rw(int rw, char __user *buf, size_t len, - struct iovec **iovec, - bool compat, - struct iov_iter *iter) +static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec, + bool vectored, bool compat, struct iov_iter *iter) { + void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf; + size_t len = iocb->aio_nbytes; + + if (!vectored) { + ssize_t ret = import_single_range(rw, buf, len, *iovec, iter); + *iovec = NULL; + return ret; + } #ifdef CONFIG_COMPAT if (compat) - return compat_import_iovec(rw, - (struct compat_iovec __user *)buf, - len, UIO_FASTIOV, iovec, iter); + return compat_import_iovec(rw, buf, len, UIO_FASTIOV, iovec, + iter); #endif - return import_iovec(rw, (struct iovec __user *)buf, - len, UIO_FASTIOV, iovec, iter); + return import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter); } -/* - * aio_run_iocb: - * Performs the initial checks and io submission. - */ -static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, - char __user *buf, size_t len, bool compat) +static inline ssize_t aio_ret(struct kiocb *req, ssize_t ret) +{ + switch (ret) { + case -EIOCBQUEUED: + return ret; + case -ERESTARTSYS: + case -ERESTARTNOINTR: + case -ERESTARTNOHAND: + case -ERESTART_RESTARTBLOCK: + /* + * There's no easy way to restart the syscall since other AIO's + * may be already running. Just fail this IO with EINTR. + */ + ret = -EINTR; + /*FALLTHRU*/ + default: + aio_complete(req, ret, 0); + return 0; + } +} + +static ssize_t aio_read(struct kiocb *req, struct iocb *iocb, bool vectored, + bool compat) { struct file *file = req->ki_filp; - ssize_t ret; - int rw; - fmode_t mode; - rw_iter_op *iter_op; struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; + ssize_t ret; - switch (opcode) { - case IOCB_CMD_PREAD: - case IOCB_CMD_PREADV: - mode = FMODE_READ; - rw = READ; - iter_op = file->f_op->read_iter; - goto rw_common; - - case IOCB_CMD_PWRITE: - case IOCB_CMD_PWRITEV: - mode = FMODE_WRITE; - rw = WRITE; - iter_op = file->f_op->write_iter; - goto rw_common; -rw_common: - if (unlikely(!(file->f_mode & mode))) - return -EBADF; - - if (!iter_op) - return -EINVAL; - - if (opcode == IOCB_CMD_PREADV || opcode == IOCB_CMD_PWRITEV) - ret = aio_setup_vectored_rw(rw, buf, len, - &iovec, compat, &iter); - else { - ret = import_single_range(rw, buf, len, iovec, &iter); - iovec = NULL; - } - if (!ret) - ret = rw_verify_area(rw, file, &req->ki_pos, - iov_iter_count(&iter)); - if (ret < 0) { - kfree(iovec); - return ret; - } - - len = ret; - - if (rw == WRITE) - file_start_write(file); - - ret = iter_op(req, &iter); - - if (rw == WRITE) - file_end_write(file); - kfree(iovec); - break; - - case IOCB_CMD_FDSYNC: - if (!file->f_op->aio_fsync) - return -EINVAL; - - ret = file->f_op->aio_fsync(req, 1); - break; + if (unlikely(!(file->f_mode & FMODE_READ))) + return -EBADF; + if (unlikely(!file->f_op->read_iter)) + return -EINVAL; - case IOCB_CMD_FSYNC: - if (!file->f_op->aio_fsync) - return -EINVAL; + ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter); + if (ret) + return ret; + ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter)); + if (!ret) + ret = aio_ret(req, call_read_iter(file, req, &iter)); + kfree(iovec); + return ret; +} - ret = file->f_op->aio_fsync(req, 0); - break; +static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored, + bool compat) +{ + struct file *file = req->ki_filp; + struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; + struct iov_iter iter; + ssize_t ret; - default: - pr_debug("EINVAL: no operation provided\n"); + if (unlikely(!(file->f_mode & FMODE_WRITE))) + return -EBADF; + if (unlikely(!file->f_op->write_iter)) return -EINVAL; - } - if (ret != -EIOCBQUEUED) { + ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter); + if (ret) + return ret; + ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter)); + if (!ret) { + req->ki_flags |= IOCB_WRITE; + file_start_write(file); + ret = aio_ret(req, call_write_iter(file, req, &iter)); /* - * There's no easy way to restart the syscall since other AIO's - * may be already running. Just fail this IO with EINTR. + * We release freeze protection in aio_complete(). Fool lockdep + * by telling it the lock got released so that it doesn't + * complain about held lock when we return to userspace. */ - if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR || - ret == -ERESTARTNOHAND || - ret == -ERESTART_RESTARTBLOCK)) - ret = -EINTR; - aio_complete(req, ret, 0); + if (S_ISREG(file_inode(file)->i_mode)) + __sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE); } - - return 0; + kfree(iovec); + return ret; } static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, struct iocb *iocb, bool compat) { struct aio_kiocb *req; + struct file *file; ssize_t ret; /* enforce forwards compatibility on users */ - if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) { + if (unlikely(iocb->aio_reserved2)) { pr_debug("EINVAL: reserve field set\n"); return -EINVAL; } @@ -1524,7 +1565,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, if (unlikely(!req)) return -EAGAIN; - req->common.ki_filp = fget(iocb->aio_fildes); + req->common.ki_filp = file = fget(iocb->aio_fildes); if (unlikely(!req->common.ki_filp)) { ret = -EBADF; goto out_put_req; @@ -1532,6 +1573,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->common.ki_pos = iocb->aio_offset; req->common.ki_complete = aio_complete; req->common.ki_flags = iocb_flags(req->common.ki_filp); + req->common.ki_hint = file_write_hint(file); if (iocb->aio_flags & IOCB_FLAG_RESFD) { /* @@ -1550,6 +1592,18 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->common.ki_flags |= IOCB_EVENTFD; } + ret = kiocb_set_rw_flags(&req->common, iocb->aio_rw_flags); + if (unlikely(ret)) { + pr_debug("EINVAL: aio_rw_flags\n"); + goto out_put_req; + } + + if ((req->common.ki_flags & IOCB_NOWAIT) && + !(req->common.ki_flags & IOCB_DIRECT)) { + ret = -EOPNOTSUPP; + goto out_put_req; + } + ret = put_user(KIOCB_KEY, &user_iocb->aio_key); if (unlikely(ret)) { pr_debug("EFAULT: aio_key\n"); @@ -1559,13 +1613,29 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->ki_user_iocb = user_iocb; req->ki_user_data = iocb->aio_data; - ret = aio_run_iocb(&req->common, iocb->aio_lio_opcode, - (char __user *)(unsigned long)iocb->aio_buf, - iocb->aio_nbytes, - compat); - if (ret) - goto out_put_req; + get_file(file); + switch (iocb->aio_lio_opcode) { + case IOCB_CMD_PREAD: + ret = aio_read(&req->common, iocb, false, compat); + break; + case IOCB_CMD_PWRITE: + ret = aio_write(&req->common, iocb, false, compat); + break; + case IOCB_CMD_PREADV: + ret = aio_read(&req->common, iocb, true, compat); + break; + case IOCB_CMD_PWRITEV: + ret = aio_write(&req->common, iocb, true, compat); + break; + default: + pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode); + ret = -EINVAL; + break; + } + fput(file); + if (ret && ret != -EIOCBQUEUED) + goto out_put_req; return 0; out_put_req: put_reqs_available(ctx, 1); @@ -1574,8 +1644,8 @@ out_put_req: return ret; } -long do_io_submit(aio_context_t ctx_id, long nr, - struct iocb __user *__user *iocbpp, bool compat) +static long do_io_submit(aio_context_t ctx_id, long nr, + struct iocb __user *__user *iocbpp, bool compat) { struct kioctx *ctx; long ret = 0; @@ -1645,6 +1715,44 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, return do_io_submit(ctx_id, nr, iocbpp, 0); } +#ifdef CONFIG_COMPAT +static inline long +copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64) +{ + compat_uptr_t uptr; + int i; + + for (i = 0; i < nr; ++i) { + if (get_user(uptr, ptr32 + i)) + return -EFAULT; + if (put_user(compat_ptr(uptr), ptr64 + i)) + return -EFAULT; + } + return 0; +} + +#define MAX_AIO_SUBMITS (PAGE_SIZE/sizeof(struct iocb *)) + +COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, + int, nr, u32 __user *, iocb) +{ + struct iocb __user * __user *iocb64; + long ret; + + if (unlikely(nr < 0)) + return -EINVAL; + + if (nr > MAX_AIO_SUBMITS) + nr = MAX_AIO_SUBMITS; + + iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64)); + ret = copy_iocb(nr, iocb, iocb64); + if (!ret) + ret = do_io_submit(ctx_id, nr, iocb64, 1); + return ret; +} +#endif + /* lookup_kiocb * Finds a given iocb for cancellation. */ @@ -1744,3 +1852,25 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, } return ret; } + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id, + compat_long_t, min_nr, + compat_long_t, nr, + struct io_event __user *, events, + struct compat_timespec __user *, timeout) +{ + struct timespec t; + struct timespec __user *ut = NULL; + + if (timeout) { + if (compat_get_timespec(&t, timeout)) + return -EFAULT; + + ut = compat_alloc_user_space(sizeof(*ut)); + if (copy_to_user(ut, &t, sizeof(t))) + return -EFAULT; + } + return sys_io_getevents(ctx_id, min_nr, nr, events, ut); +} +#endif |