diff options
Diffstat (limited to 'fs')
425 files changed, 15894 insertions, 11772 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index ef966188611..2b78014a124 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -132,21 +132,19 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) options = tmp_options; while ((p = strsep(&options, ",")) != NULL) { - int token; + int token, r; if (!*p) continue; token = match_token(p, tokens, args); - if (token < Opt_uname) { - int r = match_int(&args[0], &option); + switch (token) { + case Opt_debug: + r = match_int(&args[0], &option); if (r < 0) { P9_DPRINTK(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); + "integer field, but no integer?\n"); ret = r; continue; } - } - switch (token) { - case Opt_debug: v9ses->debug = option; #ifdef CONFIG_NET_9P_DEBUG p9_debug_level = option; @@ -154,12 +152,33 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) break; case Opt_dfltuid: + r = match_int(&args[0], &option); + if (r < 0) { + P9_DPRINTK(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + ret = r; + continue; + } v9ses->dfltuid = option; break; case Opt_dfltgid: + r = match_int(&args[0], &option); + if (r < 0) { + P9_DPRINTK(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + ret = r; + continue; + } v9ses->dfltgid = option; break; case Opt_afid: + r = match_int(&args[0], &option); + if (r < 0) { + P9_DPRINTK(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + ret = r; + continue; + } v9ses->afid = option; break; case Opt_uname: diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 9c2bdda5cd9..598fff1a54e 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -165,9 +165,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) } while (rdir->head < rdir->tail) { p9stat_init(&st); - err = p9stat_read(rdir->buf + rdir->head, - rdir->tail - rdir->head, &st, - fid->clnt->proto_version); + err = p9stat_read(fid->clnt, rdir->buf + rdir->head, + rdir->tail - rdir->head, &st); if (err) { P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); err = -EIO; @@ -231,7 +230,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, while (err == 0) { if (rdir->tail == rdir->head) { err = p9_client_readdir(fid, rdir->buf, buflen, - filp->f_pos); + filp->f_pos); if (err <= 0) goto unlock_and_exit; @@ -241,10 +240,9 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, while (rdir->head < rdir->tail) { - err = p9dirent_read(rdir->buf + rdir->head, - rdir->tail - rdir->head, - &curdirent, - fid->clnt->proto_version); + err = p9dirent_read(fid->clnt, rdir->buf + rdir->head, + rdir->tail - rdir->head, + &curdirent); if (err < 0) { P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); err = -EIO; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index e3c03db3c78..879ed885173 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -278,10 +278,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, case S_IFSOCK: if (v9fs_proto_dotl(v9ses)) { inode->i_op = &v9fs_file_inode_operations_dotl; - inode->i_fop = &v9fs_file_operations_dotl; } else if (v9fs_proto_dotu(v9ses)) { inode->i_op = &v9fs_file_inode_operations; - inode->i_fop = &v9fs_file_operations; } else { P9_DPRINTK(P9_DEBUG_ERROR, "special files without extended mode\n"); @@ -1140,7 +1138,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, struct v9fs_session_info *v9ses = sb->s_fs_info; struct v9fs_inode *v9inode = V9FS_I(inode); - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_atime.tv_sec = stat->atime; inode->i_mtime.tv_sec = stat->mtime; @@ -1166,7 +1164,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, /* HARDLINKCOUNT %u */ sscanf(ext, "%13s %u", tag_name, &i_nlink); if (!strncmp(tag_name, "HARDLINKCOUNT", 13)) - inode->i_nlink = i_nlink; + set_nlink(inode, i_nlink); } } mode = stat->mode & S_IALLUGO; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index aded79fcd5c..0b5745e2194 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -606,7 +606,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) inode->i_ctime.tv_nsec = stat->st_ctime_nsec; inode->i_uid = stat->st_uid; inode->i_gid = stat->st_gid; - inode->i_nlink = stat->st_nlink; + set_nlink(inode, stat->st_nlink); mode = stat->st_mode & S_IALLUGO; mode |= inode->i_mode & ~S_IALLUGO; @@ -632,7 +632,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) if (stat->st_result_mask & P9_STATS_GID) inode->i_gid = stat->st_gid; if (stat->st_result_mask & P9_STATS_NLINK) - inode->i_nlink = stat->st_nlink; + set_nlink(inode, stat->st_nlink); if (stat->st_result_mask & P9_STATS_MODE) { inode->i_mode = stat->st_mode; if ((S_ISBLK(inode->i_mode)) || diff --git a/fs/Kconfig b/fs/Kconfig index 9fe0b349f4c..5f4c45d4aa1 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -109,7 +109,7 @@ source "fs/proc/Kconfig" source "fs/sysfs/Kconfig" config TMPFS - bool "Virtual memory file system support (former shm fs)" + bool "Tmpfs virtual memory file system support (former shm fs)" depends on SHMEM help Tmpfs is a file system which keeps all files in virtual memory. diff --git a/fs/Makefile b/fs/Makefile index afc109691a9..d2c3353d547 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -120,6 +120,6 @@ obj-$(CONFIG_DEBUG_FS) += debugfs/ obj-$(CONFIG_OCFS2_FS) += ocfs2/ obj-$(CONFIG_BTRFS_FS) += btrfs/ obj-$(CONFIG_GFS2_FS) += gfs2/ -obj-$(CONFIG_EXOFS_FS) += exofs/ +obj-y += exofs/ # Multiple modules obj-$(CONFIG_CEPH_FS) += ceph/ obj-$(CONFIG_PSTORE) += pstore/ diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index d5250c5aae2..1dab6a174d6 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -247,7 +247,7 @@ adfs_iget(struct super_block *sb, struct object_info *obj) inode->i_gid = ADFS_SB(sb)->s_gid; inode->i_ino = obj->file_id; inode->i_size = obj->size; - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index 3a4557e8325..de37ec84234 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -215,7 +215,7 @@ affs_remove_link(struct dentry *dentry) break; default: if (!AFFS_TAIL(sb, bh)->link_chain) - inode->i_nlink = 1; + set_nlink(inode, 1); } affs_free_block(sb, link_ino); goto done; @@ -316,7 +316,7 @@ affs_remove_header(struct dentry *dentry) if (inode->i_nlink > 1) retval = affs_remove_link(dentry); else - inode->i_nlink = 0; + clear_nlink(inode); affs_unlock_link(inode); inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 5d828903ac6..88a4b0b5005 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -54,7 +54,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) prot = be32_to_cpu(tail->protect); inode->i_size = 0; - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_mode = 0; AFFS_I(inode)->i_extcnt = 1; AFFS_I(inode)->i_ext_last = ~1; @@ -137,7 +137,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) sbi->s_hashsize + 1; } if (tail->link_chain) - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_mapping->a_ops = (sbi->s_flags & SF_OFS) ? &affs_aops_ofs : &affs_aops; inode->i_op = &affs_file_inode_operations; inode->i_fop = &affs_file_operations; @@ -304,7 +304,7 @@ affs_new_inode(struct inode *dir) inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_ino = block; - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; atomic_set(&AFFS_I(inode)->i_opencnt, 0); AFFS_I(inode)->i_blkcnt = 0; @@ -387,7 +387,7 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3 AFFS_TAIL(sb, inode_bh)->link_chain = cpu_to_be32(block); affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain)); mark_buffer_dirty_inode(inode_bh, inode); - inode->i_nlink = 2; + set_nlink(inode, 2); ihold(inode); } affs_fix_checksum(sb, bh); diff --git a/fs/affs/namei.c b/fs/affs/namei.c index e3e9efc1fdd..780a11dc631 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -277,7 +277,7 @@ affs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata inode->i_mapping->a_ops = (AFFS_SB(sb)->s_flags & SF_OFS) ? &affs_aops_ofs : &affs_aops; error = affs_add_entry(dir, inode, dentry, ST_FILE); if (error) { - inode->i_nlink = 0; + clear_nlink(inode); iput(inode); return error; } @@ -305,7 +305,7 @@ affs_mkdir(struct inode *dir, struct dentry *dentry, int mode) error = affs_add_entry(dir, inode, dentry, ST_USERDIR); if (error) { - inode->i_nlink = 0; + clear_nlink(inode); mark_inode_dirty(inode); iput(inode); return error; @@ -392,7 +392,7 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) return 0; err: - inode->i_nlink = 0; + clear_nlink(inode); mark_inode_dirty(inode); iput(inode); return error; diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 346e3289abd..2f213d109c2 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -90,7 +90,7 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp, vnode->vfs_inode.i_uid = status->owner; vnode->vfs_inode.i_gid = status->group; vnode->vfs_inode.i_generation = vnode->fid.unique; - vnode->vfs_inode.i_nlink = status->nlink; + set_nlink(&vnode->vfs_inode, status->nlink); mode = vnode->vfs_inode.i_mode; mode &= ~S_IALLUGO; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 0fdab6e03d8..d890ae3b2ce 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -67,7 +67,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) fscache_attr_changed(vnode->cache); #endif - inode->i_nlink = vnode->status.nlink; + set_nlink(inode, vnode->status.nlink); inode->i_uid = vnode->status.owner; inode->i_gid = 0; inode->i_size = vnode->status.size; @@ -174,7 +174,7 @@ struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name, inode->i_size = 0; inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; inode->i_op = &afs_autocell_inode_operations; - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_uid = 0; inode->i_gid = 0; inode->i_ctime.tv_sec = get_seconds(); @@ -440,8 +440,6 @@ void exit_aio(struct mm_struct *mm) static struct kiocb *__aio_get_req(struct kioctx *ctx) { struct kiocb *req = NULL; - struct aio_ring *ring; - int okay = 0; req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); if (unlikely(!req)) @@ -459,39 +457,114 @@ static struct kiocb *__aio_get_req(struct kioctx *ctx) INIT_LIST_HEAD(&req->ki_run_list); req->ki_eventfd = NULL; - /* Check if the completion queue has enough free space to - * accept an event from this io. - */ + return req; +} + +/* + * struct kiocb's are allocated in batches to reduce the number of + * times the ctx lock is acquired and released. + */ +#define KIOCB_BATCH_SIZE 32L +struct kiocb_batch { + struct list_head head; + long count; /* number of requests left to allocate */ +}; + +static void kiocb_batch_init(struct kiocb_batch *batch, long total) +{ + INIT_LIST_HEAD(&batch->head); + batch->count = total; +} + +static void kiocb_batch_free(struct kiocb_batch *batch) +{ + struct kiocb *req, *n; + + list_for_each_entry_safe(req, n, &batch->head, ki_batch) { + list_del(&req->ki_batch); + kmem_cache_free(kiocb_cachep, req); + } +} + +/* + * Allocate a batch of kiocbs. This avoids taking and dropping the + * context lock a lot during setup. + */ +static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch) +{ + unsigned short allocated, to_alloc; + long avail; + bool called_fput = false; + struct kiocb *req, *n; + struct aio_ring *ring; + + to_alloc = min(batch->count, KIOCB_BATCH_SIZE); + for (allocated = 0; allocated < to_alloc; allocated++) { + req = __aio_get_req(ctx); + if (!req) + /* allocation failed, go with what we've got */ + break; + list_add(&req->ki_batch, &batch->head); + } + + if (allocated == 0) + goto out; + +retry: spin_lock_irq(&ctx->ctx_lock); - ring = kmap_atomic(ctx->ring_info.ring_pages[0], KM_USER0); - if (ctx->reqs_active < aio_ring_avail(&ctx->ring_info, ring)) { + ring = kmap_atomic(ctx->ring_info.ring_pages[0]); + + avail = aio_ring_avail(&ctx->ring_info, ring) - ctx->reqs_active; + BUG_ON(avail < 0); + if (avail == 0 && !called_fput) { + /* + * Handle a potential starvation case. It is possible that + * we hold the last reference on a struct file, causing us + * to delay the final fput to non-irq context. In this case, + * ctx->reqs_active is artificially high. Calling the fput + * routine here may free up a slot in the event completion + * ring, allowing this allocation to succeed. + */ + kunmap_atomic(ring); + spin_unlock_irq(&ctx->ctx_lock); + aio_fput_routine(NULL); + called_fput = true; + goto retry; + } + + if (avail < allocated) { + /* Trim back the number of requests. */ + list_for_each_entry_safe(req, n, &batch->head, ki_batch) { + list_del(&req->ki_batch); + kmem_cache_free(kiocb_cachep, req); + if (--allocated <= avail) + break; + } + } + + batch->count -= allocated; + list_for_each_entry(req, &batch->head, ki_batch) { list_add(&req->ki_list, &ctx->active_reqs); ctx->reqs_active++; - okay = 1; } - kunmap_atomic(ring, KM_USER0); - spin_unlock_irq(&ctx->ctx_lock); - if (!okay) { - kmem_cache_free(kiocb_cachep, req); - req = NULL; - } + kunmap_atomic(ring); + spin_unlock_irq(&ctx->ctx_lock); - return req; +out: + return allocated; } -static inline struct kiocb *aio_get_req(struct kioctx *ctx) +static inline struct kiocb *aio_get_req(struct kioctx *ctx, + struct kiocb_batch *batch) { struct kiocb *req; - /* Handle a potential starvation case -- should be exceedingly rare as - * requests will be stuck on fput_head only if the aio_fput_routine is - * delayed and the requests were the last user of the struct file. - */ - req = __aio_get_req(ctx); - if (unlikely(NULL == req)) { - aio_fput_routine(NULL); - req = __aio_get_req(ctx); - } + + if (list_empty(&batch->head)) + if (kiocb_batch_refill(ctx, batch) == 0) + return NULL; + req = list_first_entry(&batch->head, struct kiocb, ki_batch); + list_del(&req->ki_batch); return req; } @@ -1387,13 +1460,13 @@ static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat) ret = compat_rw_copy_check_uvector(type, (struct compat_iovec __user *)kiocb->ki_buf, kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, - &kiocb->ki_iovec); + &kiocb->ki_iovec, 1); else #endif ret = rw_copy_check_uvector(type, (struct iovec __user *)kiocb->ki_buf, kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, - &kiocb->ki_iovec); + &kiocb->ki_iovec, 1); if (ret < 0) goto out; @@ -1515,7 +1588,8 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) } static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, - struct iocb *iocb, bool compat) + struct iocb *iocb, struct kiocb_batch *batch, + bool compat) { struct kiocb *req; struct file *file; @@ -1541,7 +1615,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, if (unlikely(!file)) return -EBADF; - req = aio_get_req(ctx); /* returns with 2 references to req */ + req = aio_get_req(ctx, batch); /* returns with 2 references to req */ if (unlikely(!req)) { fput(file); return -EAGAIN; @@ -1621,8 +1695,9 @@ long do_io_submit(aio_context_t ctx_id, long nr, { struct kioctx *ctx; long ret = 0; - int i; + int i = 0; struct blk_plug plug; + struct kiocb_batch batch; if (unlikely(nr < 0)) return -EINVAL; @@ -1639,6 +1714,8 @@ long do_io_submit(aio_context_t ctx_id, long nr, return -EINVAL; } + kiocb_batch_init(&batch, nr); + blk_start_plug(&plug); /* @@ -1659,12 +1736,13 @@ long do_io_submit(aio_context_t ctx_id, long nr, break; } - ret = io_submit_one(ctx, user_iocb, &tmp, compat); + ret = io_submit_one(ctx, user_iocb, &tmp, &batch, compat); if (ret) break; } blk_finish_plug(&plug); + kiocb_batch_free(&batch); put_ioctx(ctx); return i ? i : ret; } diff --git a/fs/attr.c b/fs/attr.c index 538e27959d3..7ee7ba48831 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -13,6 +13,7 @@ #include <linux/fsnotify.h> #include <linux/fcntl.h> #include <linux/security.h> +#include <linux/evm.h> /** * inode_change_ok - check if attribute changes to an inode are allowed @@ -237,8 +238,10 @@ int notify_change(struct dentry * dentry, struct iattr * attr) else error = simple_setattr(dentry, attr); - if (!error) + if (!error) { fsnotify_change(dentry, ia_valid); + evm_inode_post_setattr(dentry, ia_valid); + } return error; } diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 180fa2425e4..8179f1ab817 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -342,7 +342,7 @@ struct inode *autofs4_get_inode(struct super_block *sb, mode_t mode) inode->i_ino = get_next_ino(); if (S_ISDIR(mode)) { - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_op = &autofs4_dir_inode_operations; inode->i_fop = &autofs4_dir_operations; } else if (S_ISLNK(mode)) { diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 720d885e8dc..8342ca67abc 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -357,7 +357,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) inode->i_gid = befs_sb->mount_opts.use_gid ? befs_sb->mount_opts.gid : (gid_t) fs32_to_cpu(sb, raw_inode->gid); - inode->i_nlink = 1; + set_nlink(inode, 1); /* * BEFS's time is 64 bits, but current VFS is 32 bits... diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index b14cebfd904..9cc07401947 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -199,7 +199,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry) printf("unlinking non-existent file %s:%lu (nlink=%d)\n", inode->i_sb->s_id, inode->i_ino, inode->i_nlink); - inode->i_nlink = 1; + set_nlink(inode, 1); } de->ino = 0; mark_buffer_dirty_inode(bh, dir); diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index a8e37f81d09..697af5bf70b 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -78,7 +78,7 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino) BFS_I(inode)->i_dsk_ino = le16_to_cpu(di->i_ino); inode->i_uid = le32_to_cpu(di->i_uid); inode->i_gid = le32_to_cpu(di->i_gid); - inode->i_nlink = le32_to_cpu(di->i_nlink); + set_nlink(inode, le32_to_cpu(di->i_nlink)); inode->i_size = BFS_FILESIZE(di); inode->i_blocks = BFS_FILEBLOCKS(di); inode->i_atime.tv_sec = le32_to_cpu(di->i_atime); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index dd0fdfc56d3..21ac5ee4b43 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -795,7 +795,16 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) * might try to exec. This is because the brk will * follow the loader, and is not movable. */ #if defined(CONFIG_X86) || defined(CONFIG_ARM) - load_bias = 0; + /* Memory randomization might have been switched off + * in runtime via sysctl. + * If that is the case, retain the original non-zero + * load_bias value in order to establish proper + * non-randomized mappings. + */ + if (current->flags & PF_RANDOMIZE) + load_bias = 0; + else + load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); #else load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); #endif diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index ba1a1ae4a18..1e9edbdeda7 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -521,7 +521,7 @@ static void kill_node(Node *e) write_unlock(&entries_lock); if (dentry) { - dentry->d_inode->i_nlink--; + drop_nlink(dentry->d_inode); d_drop(dentry); dput(dentry); simple_release_fs(&bm_mnt, &entry_count); diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 9c5e6b2cd11..c2183f3917c 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -22,6 +22,7 @@ #include <linux/blkdev.h> #include <linux/mempool.h> +#include <linux/export.h> #include <linux/bio.h> #include <linux/workqueue.h> #include <linux/slab.h> @@ -255,7 +255,6 @@ void bio_init(struct bio *bio) { memset(bio, 0, sizeof(*bio)); bio->bi_flags = 1 << BIO_UPTODATE; - bio->bi_comp_cpu = -1; atomic_set(&bio->bi_cnt, 1); } EXPORT_SYMBOL(bio_init); @@ -338,7 +337,7 @@ static void bio_fs_destructor(struct bio *bio) * RETURNS: * Pointer to new bio on success, NULL on failure. */ -struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs) +struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) { struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); @@ -366,7 +365,7 @@ static void bio_kmalloc_destructor(struct bio *bio) * %__GFP_WAIT, the allocation is guaranteed to succeed. * **/ -struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs) +struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) { struct bio *bio; @@ -697,7 +696,8 @@ static void bio_free_map_data(struct bio_map_data *bmd) kfree(bmd); } -static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count, +static struct bio_map_data *bio_alloc_map_data(int nr_segs, + unsigned int iov_count, gfp_t gfp_mask) { struct bio_map_data *bmd; diff --git a/fs/block_dev.c b/fs/block_dev.c index 95f786ec7f0..b07f1da1de4 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -971,7 +971,7 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty) if (!bdev->bd_disk) return; - if (disk_partitionable(bdev->bd_disk)) + if (disk_part_scan_enabled(bdev->bd_disk)) bdev->bd_invalidated = 1; } @@ -1085,6 +1085,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) { struct gendisk *disk; + struct module *owner; int ret; int partno; int perm = 0; @@ -1110,6 +1111,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) disk = get_gendisk(bdev->bd_dev, &partno); if (!disk) goto out; + owner = disk->fops->owner; disk_block_events(disk); mutex_lock_nested(&bdev->bd_mutex, for_part); @@ -1137,8 +1139,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = NULL; mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); - module_put(disk->fops->owner); put_disk(disk); + module_put(owner); goto restart; } } @@ -1194,8 +1196,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto out_unlock_bdev; } /* only one opener holds refs to the module and disk */ - module_put(disk->fops->owner); put_disk(disk); + module_put(owner); } bdev->bd_openers++; if (for_part) @@ -1215,8 +1217,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); - module_put(disk->fops->owner); put_disk(disk); + module_put(owner); out: bdput(bdev); @@ -1442,14 +1444,15 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) if (!bdev->bd_openers) { struct module *owner = disk->fops->owner; - put_disk(disk); - module_put(owner); disk_put_part(bdev->bd_part); bdev->bd_part = NULL; bdev->bd_disk = NULL; if (bdev != bdev->bd_contains) victim = bdev->bd_contains; bdev->bd_contains = NULL; + + put_disk(disk); + module_put(owner); } mutex_unlock(&bdev->bd_mutex); bdput(bdev); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index c7ddf8a01c5..9c1eccc2c50 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1719,7 +1719,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) inode->i_gid = btrfs_stack_inode_gid(inode_item); btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); inode->i_mode = btrfs_stack_inode_mode(inode_item); - inode->i_nlink = btrfs_stack_inode_nlink(inode_item); + set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c23b82d8ec0..9c1a744e595 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2015,7 +2015,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, sb->s_bdi = &fs_info->bdi; fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; - fs_info->btree_inode->i_nlink = 1; + set_nlink(fs_info->btree_inode, 1); /* * we set the i_size on the btree inode to the max possible int. * the real end of the address space is determined by all of diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e6a832e3e64..352083ad233 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2266,9 +2266,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, BUG_ON(ret); kfree(extent_op); - cond_resched(); - spin_lock(&delayed_refs->lock); - continue; + goto next; } list_del_init(&locked_ref->cluster); @@ -2288,7 +2286,11 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, btrfs_put_delayed_ref(ref); kfree(extent_op); count++; - +next: + do_chunk_alloc(trans, root->fs_info->extent_root, + 2 * 1024 * 1024, + btrfs_get_alloc_profile(root, 0), + CHUNK_ALLOC_NO_FORCE); cond_resched(); spin_lock(&delayed_refs->lock); } @@ -2316,6 +2318,10 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, if (root == root->fs_info->extent_root) root = root->fs_info->tree_root; + do_chunk_alloc(trans, root->fs_info->extent_root, + 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0), + CHUNK_ALLOC_NO_FORCE); + delayed_refs = &trans->transaction->delayed_refs; INIT_LIST_HEAD(&cluster); again: @@ -3292,27 +3298,12 @@ static int should_alloc_chunk(struct btrfs_root *root, if (num_bytes - num_allocated < thresh) return 1; } - - /* - * we have two similar checks here, one based on percentage - * and once based on a hard number of 256MB. The idea - * is that if we have a good amount of free - * room, don't allocate a chunk. A good mount is - * less than 80% utilized of the chunks we have allocated, - * or more than 256MB free - */ - if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes) - return 0; - - if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) - return 0; - thresh = btrfs_super_total_bytes(root->fs_info->super_copy); - /* 256MB or 5% of the FS */ - thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); + /* 256MB or 2% of the FS */ + thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2)); - if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3)) + if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8)) return 0; return 1; } @@ -3451,7 +3442,8 @@ static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, smp_mb(); nr_pages = min_t(unsigned long, nr_pages, root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT); - writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); + writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, + WB_REASON_FS_FREE_SPACE); spin_lock(&space_info->lock); if (reserved > space_info->bytes_may_use) @@ -5329,15 +5321,6 @@ alloc: if (unlikely(block_group->ro)) goto loop; - spin_lock(&block_group->free_space_ctl->tree_lock); - if (cached && - block_group->free_space_ctl->free_space < - num_bytes + empty_cluster + empty_size) { - spin_unlock(&block_group->free_space_ctl->tree_lock); - goto loop; - } - spin_unlock(&block_group->free_space_ctl->tree_lock); - /* * Ok we want to try and use the cluster allocator, so * lets look there @@ -5383,8 +5366,15 @@ refill_cluster: * plenty of times and not have found * anything, so we are likely way too * fragmented for the clustering stuff to find - * anything. */ - if (loop >= LOOP_NO_EMPTY_SIZE) { + * anything. + * + * However, if the cluster is taken from the + * current block group, release the cluster + * first, so that we stand a better chance of + * succeeding in the unclustered + * allocation. */ + if (loop >= LOOP_NO_EMPTY_SIZE && + last_ptr->block_group != block_group) { spin_unlock(&last_ptr->refill_lock); goto unclustered_alloc; } @@ -5395,6 +5385,11 @@ refill_cluster: */ btrfs_return_cluster_to_free_space(NULL, last_ptr); + if (loop >= LOOP_NO_EMPTY_SIZE) { + spin_unlock(&last_ptr->refill_lock); + goto unclustered_alloc; + } + /* allocate a cluster in this block group */ ret = btrfs_find_space_cluster(trans, root, block_group, last_ptr, @@ -5435,6 +5430,15 @@ refill_cluster: } unclustered_alloc: + spin_lock(&block_group->free_space_ctl->tree_lock); + if (cached && + block_group->free_space_ctl->free_space < + num_bytes + empty_cluster + empty_size) { + spin_unlock(&block_group->free_space_ctl->tree_lock); + goto loop; + } + spin_unlock(&block_group->free_space_ctl->tree_lock); + offset = btrfs_find_space_for_alloc(block_group, search_start, num_bytes, empty_size); /* @@ -5472,9 +5476,6 @@ checks: goto loop; } - ins->objectid = search_start; - ins->offset = num_bytes; - if (offset < search_start) btrfs_add_free_space(used_block_group, offset, search_start - offset); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index cc7492c823f..97fbe939c05 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1167,6 +1167,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / (sizeof(struct page *))); + nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); + nrptrs = max(nrptrs, 8); pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); if (!pages) return -ENOMEM; @@ -1836,7 +1838,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin) switch (origin) { case SEEK_END: case SEEK_CUR: - offset = generic_file_llseek_unlocked(file, offset, origin); + offset = generic_file_llseek(file, offset, origin); goto out; case SEEK_DATA: case SEEK_HOLE: diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ec23d43d0c3..ce40db59c70 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2283,23 +2283,23 @@ out: static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, struct btrfs_free_space *entry, struct btrfs_free_cluster *cluster, - u64 offset, u64 bytes, u64 min_bytes) + u64 offset, u64 bytes, + u64 cont1_bytes, u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; unsigned long next_zero; unsigned long i; - unsigned long search_bits; - unsigned long total_bits; + unsigned long want_bits; + unsigned long min_bits; unsigned long found_bits; unsigned long start = 0; unsigned long total_found = 0; int ret; - bool found = false; i = offset_to_bit(entry->offset, block_group->sectorsize, max_t(u64, offset, entry->offset)); - search_bits = bytes_to_bits(bytes, block_group->sectorsize); - total_bits = bytes_to_bits(min_bytes, block_group->sectorsize); + want_bits = bytes_to_bits(bytes, block_group->sectorsize); + min_bits = bytes_to_bits(min_bytes, block_group->sectorsize); again: found_bits = 0; @@ -2308,7 +2308,7 @@ again: i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) { next_zero = find_next_zero_bit(entry->bitmap, BITS_PER_BITMAP, i); - if (next_zero - i >= search_bits) { + if (next_zero - i >= min_bits) { found_bits = next_zero - i; break; } @@ -2318,10 +2318,9 @@ again: if (!found_bits) return -ENOSPC; - if (!found) { + if (!total_found) { start = i; cluster->max_size = 0; - found = true; } total_found += found_bits; @@ -2329,13 +2328,8 @@ again: if (cluster->max_size < found_bits * block_group->sectorsize) cluster->max_size = found_bits * block_group->sectorsize; - if (total_found < total_bits) { - i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero); - if (i - start > total_bits * 2) { - total_found = 0; - cluster->max_size = 0; - found = false; - } + if (total_found < want_bits || cluster->max_size < cont1_bytes) { + i = next_zero + 1; goto again; } @@ -2351,23 +2345,23 @@ again: /* * This searches the block group for just extents to fill the cluster with. + * Try to find a cluster with at least bytes total bytes, at least one + * extent of cont1_bytes, and other clusters of at least min_bytes. */ static noinline int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, struct list_head *bitmaps, u64 offset, u64 bytes, - u64 min_bytes) + u64 cont1_bytes, u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *first = NULL; struct btrfs_free_space *entry = NULL; - struct btrfs_free_space *prev = NULL; struct btrfs_free_space *last; struct rb_node *node; u64 window_start; u64 window_free; u64 max_extent; - u64 max_gap = 128 * 1024; entry = tree_search_offset(ctl, offset, 0, 1); if (!entry) @@ -2377,8 +2371,8 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, * We don't want bitmaps, so just move along until we find a normal * extent entry. */ - while (entry->bitmap) { - if (list_empty(&entry->list)) + while (entry->bitmap || entry->bytes < min_bytes) { + if (entry->bitmap && list_empty(&entry->list)) list_add_tail(&entry->list, bitmaps); node = rb_next(&entry->offset_index); if (!node) @@ -2391,12 +2385,9 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, max_extent = entry->bytes; first = entry; last = entry; - prev = entry; - while (window_free <= min_bytes) { - node = rb_next(&entry->offset_index); - if (!node) - return -ENOSPC; + for (node = rb_next(&entry->offset_index); node; + node = rb_next(&entry->offset_index)) { entry = rb_entry(node, struct btrfs_free_space, offset_index); if (entry->bitmap) { @@ -2405,26 +2396,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, continue; } - /* - * we haven't filled the empty size and the window is - * very large. reset and try again - */ - if (entry->offset - (prev->offset + prev->bytes) > max_gap || - entry->offset - window_start > (min_bytes * 2)) { - first = entry; - window_start = entry->offset; - window_free = entry->bytes; - last = entry; + if (entry->bytes < min_bytes) + continue; + + last = entry; + window_free += entry->bytes; + if (entry->bytes > max_extent) max_extent = entry->bytes; - } else { - last = entry; - window_free += entry->bytes; - if (entry->bytes > max_extent) - max_extent = entry->bytes; - } - prev = entry; } + if (window_free < bytes || max_extent < cont1_bytes) + return -ENOSPC; + cluster->window_start = first->offset; node = &first->offset_index; @@ -2438,7 +2421,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, entry = rb_entry(node, struct btrfs_free_space, offset_index); node = rb_next(&entry->offset_index); - if (entry->bitmap) + if (entry->bitmap || entry->bytes < min_bytes) continue; rb_erase(&entry->offset_index, &ctl->free_space_offset); @@ -2460,7 +2443,7 @@ static noinline int setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, struct list_head *bitmaps, u64 offset, u64 bytes, - u64 min_bytes) + u64 cont1_bytes, u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry; @@ -2485,7 +2468,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, if (entry->bytes < min_bytes) continue; ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, - bytes, min_bytes); + bytes, cont1_bytes, min_bytes); if (!ret) return 0; } @@ -2499,7 +2482,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, /* * here we try to find a cluster of blocks in a block group. The goal - * is to find at least bytes free and up to empty_size + bytes free. + * is to find at least bytes+empty_size. * We might not find them all in one contiguous area. * * returns zero and sets up cluster if things worked out, otherwise @@ -2515,23 +2498,24 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, struct btrfs_free_space *entry, *tmp; LIST_HEAD(bitmaps); u64 min_bytes; + u64 cont1_bytes; int ret; - /* for metadata, allow allocates with more holes */ + /* + * Choose the minimum extent size we'll require for this + * cluster. For SSD_SPREAD, don't allow any fragmentation. + * For metadata, allow allocates with smaller extents. For + * data, keep it dense. + */ if (btrfs_test_opt(root, SSD_SPREAD)) { - min_bytes = bytes + empty_size; + cont1_bytes = min_bytes = bytes + empty_size; } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { - /* - * we want to do larger allocations when we are - * flushing out the delayed refs, it helps prevent - * making more work as we go along. - */ - if (trans->transaction->delayed_refs.flushing) - min_bytes = max(bytes, (bytes + empty_size) >> 1); - else - min_bytes = max(bytes, (bytes + empty_size) >> 4); - } else - min_bytes = max(bytes, (bytes + empty_size) >> 2); + cont1_bytes = bytes; + min_bytes = block_group->sectorsize; + } else { + cont1_bytes = max(bytes, (bytes + empty_size) >> 2); + min_bytes = block_group->sectorsize; + } spin_lock(&ctl->tree_lock); @@ -2539,7 +2523,7 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, * If we know we don't have enough space to make a cluster don't even * bother doing all the work to try and find one. */ - if (ctl->free_space < min_bytes) { + if (ctl->free_space < bytes) { spin_unlock(&ctl->tree_lock); return -ENOSPC; } @@ -2553,10 +2537,12 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, } ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, - bytes, min_bytes); + bytes + empty_size, + cont1_bytes, min_bytes); if (ret) ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, - offset, bytes, min_bytes); + offset, bytes + empty_size, + cont1_bytes, min_bytes); /* Clear our temporary list */ list_for_each_entry_safe(entry, tmp, &bitmaps, list) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 13b0542015f..fd1a06df5bc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2358,7 +2358,7 @@ static void btrfs_read_locked_inode(struct inode *inode) inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); inode->i_mode = btrfs_inode_mode(leaf, inode_item); - inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); + set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); inode->i_uid = btrfs_inode_uid(leaf, inode_item); inode->i_gid = btrfs_inode_gid(leaf, inode_item); btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); @@ -6698,7 +6698,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; - inode->i_nlink = 1; + set_nlink(inode, 1); btrfs_i_size_write(inode, 0); err = btrfs_update_inode(trans, new_root, inode); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 063b521e3de..5a7227fa938 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -833,13 +833,9 @@ static char *setup_root_args(char *args) static struct dentry *mount_subvol(const char *subvol_name, int flags, const char *device_name, char *data) { - struct super_block *s; struct dentry *root; struct vfsmount *mnt; - struct mnt_namespace *ns_private; char *newargs; - struct path path; - int error; newargs = setup_root_args(data); if (!newargs) @@ -850,39 +846,17 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags, if (IS_ERR(mnt)) return ERR_CAST(mnt); - ns_private = create_mnt_ns(mnt); - if (IS_ERR(ns_private)) { - mntput(mnt); - return ERR_CAST(ns_private); - } + root = mount_subtree(mnt, subvol_name); - /* - * This will trigger the automount of the subvol so we can just - * drop the mnt we have here and return the dentry that we - * found. - */ - error = vfs_path_lookup(mnt->mnt_root, mnt, subvol_name, - LOOKUP_FOLLOW, &path); - put_mnt_ns(ns_private); - if (error) - return ERR_PTR(error); - - if (!is_subvolume_inode(path.dentry->d_inode)) { - path_put(&path); - mntput(mnt); - error = -EINVAL; + if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) { + struct super_block *s = root->d_sb; + dput(root); + root = ERR_PTR(-EINVAL); + deactivate_locked_super(s); printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n", subvol_name); - return ERR_PTR(-EINVAL); } - /* Get a ref to the sb and the dentry we found and return it */ - s = path.mnt->mnt_sb; - atomic_inc(&s->s_active); - root = dget(path.dentry); - path_put(&path); - down_write(&s->s_umount); - return root; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 81376d94cd3..360c2dfd1ee 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -467,19 +467,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; - while (count < 4) { + while (count < 2) { unsigned long cur = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; if (cur && trans->transaction->delayed_refs.num_heads_ready > 64) { trans->delayed_ref_updates = 0; - - /* - * do a full flush if the transaction is trying - * to close - */ - if (trans->transaction->delayed_refs.flushing) - cur = 0; btrfs_run_delayed_refs(trans, root, cur); } else { break; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f4d81c06d48..3568374d419 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1031,7 +1031,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, } btrfs_release_path(path); if (nlink != inode->i_nlink) { - inode->i_nlink = nlink; + set_nlink(inode, nlink); btrfs_update_inode(trans, root, inode); } BTRFS_I(inode)->index_cnt = (u64)-1; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d73439b4d7d..9489a2aca47 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3162,7 +3162,11 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_stripe_size = 1024 * 1024 * 1024; max_chunk_size = 10 * max_stripe_size; } else if (type & BTRFS_BLOCK_GROUP_METADATA) { - max_stripe_size = 256 * 1024 * 1024; + /* for larger filesystems, use larger metadata chunks */ + if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024) + max_stripe_size = 1024 * 1024 * 1024; + else + max_stripe_size = 256 * 1024 * 1024; max_chunk_size = max_stripe_size; } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { max_stripe_size = 8 * 1024 * 1024; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index a76e41c04b7..3848b04e310 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -394,36 +394,36 @@ int btrfs_removexattr(struct dentry *dentry, const char *name) XATTR_REPLACE); } -int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir, - const struct qstr *qstr) +int btrfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { - int err; - size_t len; - void *value; - char *suffix; + const struct xattr *xattr; + struct btrfs_trans_handle *trans = fs_info; char *name; + int err = 0; - err = security_inode_init_security(inode, dir, qstr, &suffix, &value, - &len); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; - } - - name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(suffix) + 1, - GFP_NOFS); - if (!name) { - err = -ENOMEM; - } else { + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + name = kmalloc(XATTR_SECURITY_PREFIX_LEN + + strlen(xattr->name) + 1, GFP_NOFS); + if (!name) { + err = -ENOMEM; + break; + } strcpy(name, XATTR_SECURITY_PREFIX); - strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix); - err = __btrfs_setxattr(trans, inode, name, value, len, 0); + strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); + err = __btrfs_setxattr(trans, inode, name, + xattr->value, xattr->value_len, 0); kfree(name); + if (err < 0) + break; } - - kfree(suffix); - kfree(value); return err; } + +int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &btrfs_initxattrs, trans); +} diff --git a/fs/buffer.c b/fs/buffer.c index 1a80b048ade..19d8eb7fdc8 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -213,13 +213,16 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) * elsewhere, don't buffer_error if we had some unmapped buffers */ if (all_mapped) { + char b[BDEVNAME_SIZE]; + printk("__find_get_block_slow() failed. " "block=%llu, b_blocknr=%llu\n", (unsigned long long)block, (unsigned long long)bh->b_blocknr); printk("b_state=0x%08lx, b_size=%zu\n", bh->b_state, bh->b_size); - printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits); + printk("device %s blocksize: %d\n", bdevname(bdev, b), + 1 << bd_inode->i_blkbits); } out_unlock: spin_unlock(&bd_mapping->private_lock); @@ -285,7 +288,7 @@ static void free_more_memory(void) struct zone *zone; int nid; - wakeup_flusher_threads(1024); + wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM); yield(); for_each_online_node(nid) { @@ -1470,13 +1473,13 @@ static void discard_buffer(struct buffer_head * bh) } /** - * block_invalidatepage - invalidate part of all of a buffer-backed page + * block_invalidatepage - invalidate part or all of a buffer-backed page * * @page: the page which is affected * @offset: the index of the truncation point * * block_invalidatepage() is called when all or part of the page has become - * invalidatedby a truncate operation. + * invalidated by a truncate operation. * * block_invalidatepage() does not have to release all buffers, but it must * ensure that no dirty buffer is left outside @offset and that no I/O diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 5a3953db811..173b1d22e59 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -87,7 +87,7 @@ static int ceph_set_page_dirty(struct page *page) snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context); /* dirty the head */ - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (ci->i_head_snapc == NULL) ci->i_head_snapc = ceph_get_snap_context(snapc); ++ci->i_wrbuffer_ref_head; @@ -100,7 +100,7 @@ static int ceph_set_page_dirty(struct page *page) ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1, ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, snapc, snapc->seq, snapc->num_snaps); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); /* now adjust page */ spin_lock_irq(&mapping->tree_lock); @@ -228,102 +228,155 @@ static int ceph_readpage(struct file *filp, struct page *page) } /* - * Build a vector of contiguous pages from the provided page list. + * Finish an async read(ahead) op. */ -static struct page **page_vector_from_list(struct list_head *page_list, - unsigned *nr_pages) +static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) { - struct page **pages; - struct page *page; - int next_index, contig_pages = 0; + struct inode *inode = req->r_inode; + struct ceph_osd_reply_head *replyhead; + int rc, bytes; + int i; - /* build page vector */ - pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS); - if (!pages) - return ERR_PTR(-ENOMEM); + /* parse reply */ + replyhead = msg->front.iov_base; + WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); + rc = le32_to_cpu(replyhead->result); + bytes = le32_to_cpu(msg->hdr.data_len); - BUG_ON(list_empty(page_list)); - next_index = list_entry(page_list->prev, struct page, lru)->index; - list_for_each_entry_reverse(page, page_list, lru) { - if (page->index == next_index) { - dout("readpages page %d %p\n", contig_pages, page); - pages[contig_pages] = page; - contig_pages++; - next_index++; - } else { - break; + dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); + + /* unlock all pages, zeroing any data we didn't read */ + for (i = 0; i < req->r_num_pages; i++, bytes -= PAGE_CACHE_SIZE) { + struct page *page = req->r_pages[i]; + + if (bytes < (int)PAGE_CACHE_SIZE) { + /* zero (remainder of) page */ + int s = bytes < 0 ? 0 : bytes; + zero_user_segment(page, s, PAGE_CACHE_SIZE); } + dout("finish_read %p uptodate %p idx %lu\n", inode, page, + page->index); + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); } - *nr_pages = contig_pages; - return pages; + kfree(req->r_pages); } /* - * Read multiple pages. Leave pages we don't read + unlock in page_list; - * the caller (VM) cleans them up. + * start an async read(ahead) operation. return nr_pages we submitted + * a read for on success, or negative error code. */ -static int ceph_readpages(struct file *file, struct address_space *mapping, - struct list_head *page_list, unsigned nr_pages) +static int start_read(struct inode *inode, struct list_head *page_list, int max) { - struct inode *inode = file->f_dentry->d_inode; - struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->client->osdc; - int rc = 0; - struct page **pages; - loff_t offset; + struct ceph_inode_info *ci = ceph_inode(inode); + struct page *page = list_entry(page_list->prev, struct page, lru); + struct ceph_osd_request *req; + u64 off; u64 len; + int i; + struct page **pages; + pgoff_t next_index; + int nr_pages = 0; + int ret; - dout("readpages %p file %p nr_pages %d\n", - inode, file, nr_pages); - - pages = page_vector_from_list(page_list, &nr_pages); - if (IS_ERR(pages)) - return PTR_ERR(pages); + off = page->index << PAGE_CACHE_SHIFT; - /* guess read extent */ - offset = pages[0]->index << PAGE_CACHE_SHIFT; + /* count pages */ + next_index = page->index; + list_for_each_entry_reverse(page, page_list, lru) { + if (page->index != next_index) + break; + nr_pages++; + next_index++; + if (max && nr_pages == max) + break; + } len = nr_pages << PAGE_CACHE_SHIFT; - rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, - offset, &len, - ci->i_truncate_seq, ci->i_truncate_size, - pages, nr_pages, 0); - if (rc == -ENOENT) - rc = 0; - if (rc < 0) - goto out; - - for (; !list_empty(page_list) && len > 0; - rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) { - struct page *page = - list_entry(page_list->prev, struct page, lru); + dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages, + off, len); + + req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), + off, &len, + CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, + NULL, 0, + ci->i_truncate_seq, ci->i_truncate_size, + NULL, false, 1, 0); + if (!req) + return -ENOMEM; + /* build page vector */ + nr_pages = len >> PAGE_CACHE_SHIFT; + pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS); + ret = -ENOMEM; + if (!pages) + goto out; + for (i = 0; i < nr_pages; ++i) { + page = list_entry(page_list->prev, struct page, lru); + BUG_ON(PageLocked(page)); list_del(&page->lru); - - if (rc < (int)PAGE_CACHE_SIZE) { - /* zero (remainder of) page */ - int s = rc < 0 ? 0 : rc; - zero_user_segment(page, s, PAGE_CACHE_SIZE); - } - - if (add_to_page_cache_lru(page, mapping, page->index, + + dout("start_read %p adding %p idx %lu\n", inode, page, + page->index); + if (add_to_page_cache_lru(page, &inode->i_data, page->index, GFP_NOFS)) { page_cache_release(page); - dout("readpages %p add_to_page_cache failed %p\n", + dout("start_read %p add_to_page_cache failed %p\n", inode, page); - continue; + nr_pages = i; + goto out_pages; } - dout("readpages %p adding %p idx %lu\n", inode, page, - page->index); - flush_dcache_page(page); - SetPageUptodate(page); - unlock_page(page); - page_cache_release(page); + pages[i] = page; } - rc = 0; + req->r_pages = pages; + req->r_num_pages = nr_pages; + req->r_callback = finish_read; + req->r_inode = inode; + + dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); + ret = ceph_osdc_start_request(osdc, req, false); + if (ret < 0) + goto out_pages; + ceph_osdc_put_request(req); + return nr_pages; +out_pages: + ceph_release_page_vector(pages, nr_pages); +out: + ceph_osdc_put_request(req); + return ret; +} + + +/* + * Read multiple pages. Leave pages we don't read + unlock in page_list; + * the caller (VM) cleans them up. + */ +static int ceph_readpages(struct file *file, struct address_space *mapping, + struct list_head *page_list, unsigned nr_pages) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + int rc = 0; + int max = 0; + + if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE) + max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) + >> PAGE_SHIFT; + + dout("readpages %p file %p nr_pages %d max %d\n", inode, file, nr_pages, + max); + while (!list_empty(page_list)) { + rc = start_read(inode, page_list, max); + if (rc < 0) + goto out; + BUG_ON(rc == 0); + } out: - kfree(pages); + dout("readpages %p file %p ret %d\n", inode, file, rc); return rc; } @@ -338,7 +391,7 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode, struct ceph_snap_context *snapc = NULL; struct ceph_cap_snap *capsnap = NULL; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap, capsnap->context, capsnap->dirty_pages); @@ -354,7 +407,7 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode, dout(" head snapc %p has %d dirty pages\n", snapc, ci->i_wrbuffer_ref_head); } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return snapc; } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 8d74ad7ba55..8b53193e4f7 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -309,7 +309,7 @@ void ceph_reservation_status(struct ceph_fs_client *fsc, /* * Find ceph_cap for given mds, if any. * - * Called with i_lock held. + * Called with i_ceph_lock held. */ static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) { @@ -332,9 +332,9 @@ struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds) { struct ceph_cap *cap; - spin_lock(&ci->vfs_inode.i_lock); + spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); - spin_unlock(&ci->vfs_inode.i_lock); + spin_unlock(&ci->i_ceph_lock); return cap; } @@ -361,15 +361,16 @@ static int __ceph_get_cap_mds(struct ceph_inode_info *ci) int ceph_get_cap_mds(struct inode *inode) { + struct ceph_inode_info *ci = ceph_inode(inode); int mds; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); mds = __ceph_get_cap_mds(ceph_inode(inode)); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return mds; } /* - * Called under i_lock. + * Called under i_ceph_lock. */ static void __insert_cap_node(struct ceph_inode_info *ci, struct ceph_cap *new) @@ -415,7 +416,7 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc, * * If I_FLUSH is set, leave the inode at the front of the list. * - * Caller holds i_lock + * Caller holds i_ceph_lock * -> we take mdsc->cap_delay_lock */ static void __cap_delay_requeue(struct ceph_mds_client *mdsc, @@ -457,7 +458,7 @@ static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc, /* * Cancel delayed work on cap. * - * Caller must hold i_lock. + * Caller must hold i_ceph_lock. */ static void __cap_delay_cancel(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) @@ -487,17 +488,15 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, ci->i_rdcache_gen++; /* - * if we are newly issued FILE_SHARED, clear I_COMPLETE; we + * if we are newly issued FILE_SHARED, clear D_COMPLETE; we * don't know what happened to this directory while we didn't * have the cap. */ if ((issued & CEPH_CAP_FILE_SHARED) && (had & CEPH_CAP_FILE_SHARED) == 0) { ci->i_shared_gen++; - if (S_ISDIR(ci->vfs_inode.i_mode)) { - dout(" marking %p NOT complete\n", &ci->vfs_inode); - ci->i_ceph_flags &= ~CEPH_I_COMPLETE; - } + if (S_ISDIR(ci->vfs_inode.i_mode)) + ceph_dir_clear_complete(&ci->vfs_inode); } } @@ -534,14 +533,14 @@ int ceph_add_cap(struct inode *inode, wanted |= ceph_caps_for_mode(fmode); retry: - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); if (!cap) { if (new_cap) { cap = new_cap; new_cap = NULL; } else { - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); new_cap = get_cap(mdsc, caps_reservation); if (new_cap == NULL) return -ENOMEM; @@ -627,7 +626,7 @@ retry: if (fmode >= 0) __ceph_get_fmode(ci, fmode); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); wake_up_all(&ci->i_cap_wq); return 0; } @@ -794,7 +793,7 @@ int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) struct rb_node *p; int ret = 0; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); if (__cap_is_valid(cap) && @@ -803,7 +802,7 @@ int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) break; } } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); dout("ceph_caps_revoking %p %s = %d\n", inode, ceph_cap_string(mask), ret); return ret; @@ -857,7 +856,7 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) } /* - * called under i_lock + * called under i_ceph_lock */ static int __ceph_is_any_caps(struct ceph_inode_info *ci) { @@ -867,7 +866,7 @@ static int __ceph_is_any_caps(struct ceph_inode_info *ci) /* * Remove a cap. Take steps to deal with a racing iterate_session_caps. * - * caller should hold i_lock. + * caller should hold i_ceph_lock. * caller will not hold session s_mutex if called from destroy_inode. */ void __ceph_remove_cap(struct ceph_cap *cap) @@ -945,7 +944,7 @@ static int send_cap_msg(struct ceph_mds_session *session, seq, issue_seq, mseq, follows, size, max_size, xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS); + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS, false); if (!msg) return -ENOMEM; @@ -1030,7 +1029,7 @@ static void __queue_cap_release(struct ceph_mds_session *session, /* * Queue cap releases when an inode is dropped from our cache. Since - * inode is about to be destroyed, there is no need for i_lock. + * inode is about to be destroyed, there is no need for i_ceph_lock. */ void ceph_queue_caps_release(struct inode *inode) { @@ -1051,7 +1050,7 @@ void ceph_queue_caps_release(struct inode *inode) /* * Send a cap msg on the given inode. Update our caps state, then - * drop i_lock and send the message. + * drop i_ceph_lock and send the message. * * Make note of max_size reported/requested from mds, revoked caps * that have now been implemented. @@ -1063,13 +1062,13 @@ void ceph_queue_caps_release(struct inode *inode) * Return non-zero if delayed release, or we experienced an error * such that the caller should requeue + retry later. * - * called with i_lock, then drops it. + * called with i_ceph_lock, then drops it. * caller should hold snap_rwsem (read), s_mutex. */ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, int op, int used, int want, int retain, int flushing, unsigned *pflush_tid) - __releases(cap->ci->vfs_inode->i_lock) + __releases(cap->ci->i_ceph_lock) { struct ceph_inode_info *ci = cap->ci; struct inode *inode = &ci->vfs_inode; @@ -1172,7 +1171,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, xattr_version = ci->i_xattrs.version; } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, @@ -1200,13 +1199,13 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, * Unless @again is true, skip cap_snaps that were already sent to * the MDS (i.e., during this session). * - * Called under i_lock. Takes s_mutex as needed. + * Called under i_ceph_lock. Takes s_mutex as needed. */ void __ceph_flush_snaps(struct ceph_inode_info *ci, struct ceph_mds_session **psession, int again) - __releases(ci->vfs_inode->i_lock) - __acquires(ci->vfs_inode->i_lock) + __releases(ci->i_ceph_lock) + __acquires(ci->i_ceph_lock) { struct inode *inode = &ci->vfs_inode; int mds; @@ -1263,7 +1262,7 @@ retry: session = NULL; } if (!session) { - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); mutex_lock(&mdsc->mutex); session = __ceph_lookup_mds_session(mdsc, mds); mutex_unlock(&mdsc->mutex); @@ -1277,7 +1276,7 @@ retry: * deletion or migration. retry, and we'll * get a better @mds value next time. */ - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); goto retry; } @@ -1287,7 +1286,7 @@ retry: list_del_init(&capsnap->flushing_item); list_add_tail(&capsnap->flushing_item, &session->s_cap_snaps_flushing); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n", inode, capsnap, capsnap->follows, capsnap->flush_tid); @@ -1304,7 +1303,7 @@ retry: next_follows = capsnap->follows + 1; ceph_put_cap_snap(capsnap); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); goto retry; } @@ -1324,11 +1323,9 @@ out: static void ceph_flush_snaps(struct ceph_inode_info *ci) { - struct inode *inode = &ci->vfs_inode; - - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); __ceph_flush_snaps(ci, NULL, 0); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } /* @@ -1375,7 +1372,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) * Add dirty inode to the flushing list. Assigned a seq number so we * can wait for caps to flush without starving. * - * Called under i_lock. + * Called under i_ceph_lock. */ static int __mark_caps_flushing(struct inode *inode, struct ceph_mds_session *session) @@ -1423,9 +1420,9 @@ static int try_nonblocking_invalidate(struct inode *inode) struct ceph_inode_info *ci = ceph_inode(inode); u32 invalidating_gen = ci->i_rdcache_gen; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); invalidate_mapping_pages(&inode->i_data, 0, -1); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (inode->i_data.nrpages == 0 && invalidating_gen == ci->i_rdcache_gen) { @@ -1472,7 +1469,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, if (mdsc->stopping) is_delayed = 1; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_FLUSH) flags |= CHECK_CAPS_FLUSH; @@ -1482,7 +1479,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, __ceph_flush_snaps(ci, &session, 0); goto retry_locked; retry: - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); retry_locked: file_wanted = __ceph_caps_file_wanted(ci); used = __ceph_caps_used(ci); @@ -1636,7 +1633,7 @@ ack: if (mutex_trylock(&session->s_mutex) == 0) { dout("inverting session/ino locks on %p\n", session); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (took_snap_rwsem) { up_read(&mdsc->snap_rwsem); took_snap_rwsem = 0; @@ -1650,7 +1647,7 @@ ack: if (down_read_trylock(&mdsc->snap_rwsem) == 0) { dout("inverting snap/in locks on %p\n", inode); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); down_read(&mdsc->snap_rwsem); took_snap_rwsem = 1; goto retry; @@ -1666,10 +1663,10 @@ ack: mds = cap->mds; /* remember mds, so we don't repeat */ sent++; - /* __send_cap drops i_lock */ + /* __send_cap drops i_ceph_lock */ delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want, retain, flushing, NULL); - goto retry; /* retake i_lock and restart our cap scan. */ + goto retry; /* retake i_ceph_lock and restart our cap scan. */ } /* @@ -1683,7 +1680,7 @@ ack: else if (!is_delayed || force_requeue) __cap_delay_requeue(mdsc, ci); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (queue_invalidate) ceph_queue_invalidate(inode); @@ -1706,7 +1703,7 @@ static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, int flushing = 0; retry: - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode); goto out; @@ -1718,7 +1715,7 @@ retry: int delayed; if (!session) { - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); session = cap->session; mutex_lock(&session->s_mutex); goto retry; @@ -1729,18 +1726,18 @@ retry: flushing = __mark_caps_flushing(inode, session); - /* __send_cap drops i_lock */ + /* __send_cap drops i_ceph_lock */ delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want, cap->issued | cap->implemented, flushing, flush_tid); if (!delayed) goto out_unlocked; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); __cap_delay_requeue(mdsc, ci); } out: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); out_unlocked: if (session && unlock_session) mutex_unlock(&session->s_mutex); @@ -1755,7 +1752,7 @@ static int caps_are_flushed(struct inode *inode, unsigned tid) struct ceph_inode_info *ci = ceph_inode(inode); int i, ret = 1; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); for (i = 0; i < CEPH_CAP_BITS; i++) if ((ci->i_flushing_caps & (1 << i)) && ci->i_cap_flush_tid[i] <= tid) { @@ -1763,7 +1760,7 @@ static int caps_are_flushed(struct inode *inode, unsigned tid) ret = 0; break; } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return ret; } @@ -1870,10 +1867,10 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (__ceph_caps_dirty(ci)) __cap_delay_requeue_front(mdsc, ci); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } return err; } @@ -1896,7 +1893,7 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode = &ci->vfs_inode; struct ceph_cap *cap; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; if (cap && cap->session == session) { dout("kick_flushing_caps %p cap %p capsnap %p\n", inode, @@ -1906,7 +1903,7 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc, pr_err("%p auth cap %p not mds%d ???\n", inode, cap, session->s_mds); } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } } @@ -1923,7 +1920,7 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_cap *cap; int delayed = 0; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; if (cap && cap->session == session) { dout("kick_flushing_caps %p cap %p %s\n", inode, @@ -1934,14 +1931,14 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, cap->issued | cap->implemented, ci->i_flushing_caps, NULL); if (delayed) { - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); __cap_delay_requeue(mdsc, ci); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } } else { pr_err("%p auth cap %p not mds%d ???\n", inode, cap, session->s_mds); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } } } @@ -1954,7 +1951,7 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, struct ceph_cap *cap; int delayed = 0; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); @@ -1966,12 +1963,12 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, cap->issued | cap->implemented, ci->i_flushing_caps, NULL); if (delayed) { - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); __cap_delay_requeue(mdsc, ci); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } } else { - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } } @@ -1980,7 +1977,7 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, * Take references to capabilities we hold, so that we don't release * them to the MDS prematurely. * - * Protected by i_lock. + * Protected by i_ceph_lock. */ static void __take_cap_refs(struct ceph_inode_info *ci, int got) { @@ -2018,7 +2015,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, dout("get_cap_refs %p need %s want %s\n", inode, ceph_cap_string(need), ceph_cap_string(want)); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); /* make sure file is actually open */ file_wanted = __ceph_caps_file_wanted(ci); @@ -2079,7 +2076,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, ceph_cap_string(have), ceph_cap_string(need)); } out: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); dout("get_cap_refs %p ret %d got %s\n", inode, ret, ceph_cap_string(*got)); return ret; @@ -2096,7 +2093,7 @@ static void check_max_size(struct inode *inode, loff_t endoff) int check = 0; /* do we need to explicitly request a larger max_size? */ - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if ((endoff >= ci->i_max_size || endoff > (inode->i_size << 1)) && endoff > ci->i_wanted_max_size) { @@ -2105,7 +2102,7 @@ static void check_max_size(struct inode *inode, loff_t endoff) ci->i_wanted_max_size = endoff; check = 1; } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (check) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); } @@ -2142,9 +2139,9 @@ retry: */ void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) { - spin_lock(&ci->vfs_inode.i_lock); + spin_lock(&ci->i_ceph_lock); __take_cap_refs(ci, caps); - spin_unlock(&ci->vfs_inode.i_lock); + spin_unlock(&ci->i_ceph_lock); } /* @@ -2162,7 +2159,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) int last = 0, put = 0, flushsnaps = 0, wake = 0; struct ceph_cap_snap *capsnap; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (had & CEPH_CAP_PIN) --ci->i_pin_ref; if (had & CEPH_CAP_FILE_RD) @@ -2195,7 +2192,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) } } } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had), last ? " last" : "", put ? " put" : ""); @@ -2227,7 +2224,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, int found = 0; struct ceph_cap_snap *capsnap = NULL; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); ci->i_wrbuffer_ref -= nr; last = !ci->i_wrbuffer_ref; @@ -2276,7 +2273,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, } } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (last) { ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); @@ -2293,7 +2290,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, * Handle a cap GRANT message from the MDS. (Note that a GRANT may * actually be a revocation if it specifies a smaller cap set.) * - * caller holds s_mutex and i_lock, we drop both. + * caller holds s_mutex and i_ceph_lock, we drop both. * * return value: * 0 - ok @@ -2304,7 +2301,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, struct ceph_mds_session *session, struct ceph_cap *cap, struct ceph_buffer *xattr_buf) - __releases(inode->i_lock) + __releases(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; @@ -2363,7 +2360,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, } if ((issued & CEPH_CAP_LINK_EXCL) == 0) - inode->i_nlink = le32_to_cpu(grant->nlink); + set_nlink(inode, le32_to_cpu(grant->nlink)); if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) { int len = le32_to_cpu(grant->xattr_len); @@ -2455,7 +2452,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, } BUG_ON(cap->issued & ~cap->implemented); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (writeback) /* * queue inode for writeback: we can't actually call @@ -2485,7 +2482,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, struct ceph_mds_caps *m, struct ceph_mds_session *session, struct ceph_cap *cap) - __releases(inode->i_lock) + __releases(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; @@ -2541,7 +2538,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, wake_up_all(&ci->i_cap_wq); out: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (drop) iput(inode); } @@ -2564,7 +2561,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n", inode, ci, session->s_mds, follows); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { if (capsnap->follows == follows) { if (capsnap->flush_tid != flush_tid) { @@ -2587,7 +2584,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, capsnap, capsnap->follows); } } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (drop) iput(inode); } @@ -2600,7 +2597,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, static void handle_cap_trunc(struct inode *inode, struct ceph_mds_caps *trunc, struct ceph_mds_session *session) - __releases(inode->i_lock) + __releases(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; @@ -2619,7 +2616,7 @@ static void handle_cap_trunc(struct inode *inode, inode, mds, seq, truncate_size, truncate_seq); queue_trunc = ceph_fill_file_size(inode, issued, truncate_seq, truncate_size, size); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (queue_trunc) ceph_queue_vmtruncate(inode); @@ -2648,7 +2645,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, dout("handle_cap_export inode %p ci %p mds%d mseq %d\n", inode, ci, mds, mseq); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); /* make sure we haven't seen a higher mseq */ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { @@ -2692,7 +2689,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, } /* else, we already released it */ - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } /* @@ -2747,9 +2744,9 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, up_read(&mdsc->snap_rwsem); /* make sure we re-request max_size, if necessary */ - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); ci->i_requested_max_size = 0; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } /* @@ -2764,6 +2761,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_mds_client *mdsc = session->s_mdsc; struct super_block *sb = mdsc->fsc->sb; struct inode *inode; + struct ceph_inode_info *ci; struct ceph_cap *cap; struct ceph_mds_caps *h; int mds = session->s_mds; @@ -2817,6 +2815,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, /* lookup ino */ inode = ceph_find_inode(sb, vino); + ci = ceph_inode(inode); dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, vino.snap, inode); if (!inode) { @@ -2846,16 +2845,16 @@ void ceph_handle_caps(struct ceph_mds_session *session, } /* the rest require a cap */ - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ceph_inode(inode), mds); if (!cap) { dout(" no cap on %p ino %llx.%llx from mds%d\n", inode, ceph_ino(inode), ceph_snap(inode), mds); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); goto flush_cap_releases; } - /* note that each of these drops i_lock for us */ + /* note that each of these drops i_ceph_lock for us */ switch (op) { case CEPH_CAP_OP_REVOKE: case CEPH_CAP_OP_GRANT: @@ -2871,7 +2870,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, break; default: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); pr_err("ceph_handle_caps: unknown cap op %d %s\n", op, ceph_cap_op_name(op)); } @@ -2964,13 +2963,13 @@ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode) struct inode *inode = &ci->vfs_inode; int last = 0; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode, ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1); BUG_ON(ci->i_nr_by_mode[fmode] == 0); if (--ci->i_nr_by_mode[fmode] == 0) last++; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (last && ci->i_vino.snap == CEPH_NOSNAP) ceph_check_caps(ci, 0, NULL); @@ -2993,7 +2992,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode, int used, dirty; int ret = 0; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); used = __ceph_caps_used(ci); dirty = __ceph_caps_dirty(ci); @@ -3048,7 +3047,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode, inode, cap, ceph_cap_string(cap->issued)); } } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return ret; } @@ -3063,7 +3062,7 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, /* * force an record for the directory caps if we have a dentry lease. - * this is racy (can't take i_lock and d_lock together), but it + * this is racy (can't take i_ceph_lock and d_lock together), but it * doesn't have to be perfect; the mds will revoke anything we don't * release. */ diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 382abc9a6a5..98954003a8d 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -108,7 +108,7 @@ static unsigned fpos_off(loff_t p) * falling back to a "normal" sync readdir if any dentries in the dir * are dropped. * - * I_COMPLETE tells indicates we have all dentries in the dir. It is + * D_COMPLETE tells indicates we have all dentries in the dir. It is * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by * the MDS if/when the directory is modified). */ @@ -199,8 +199,8 @@ more: filp->f_pos++; /* make sure a dentry wasn't dropped while we didn't have parent lock */ - if (!ceph_i_test(dir, CEPH_I_COMPLETE)) { - dout(" lost I_COMPLETE on %p; falling back to mds\n", dir); + if (!ceph_dir_test_complete(dir)) { + dout(" lost D_COMPLETE on %p; falling back to mds\n", dir); err = -EAGAIN; goto out; } @@ -281,18 +281,18 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) } /* can we use the dcache? */ - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if ((filp->f_pos == 2 || fi->dentry) && !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && ceph_snap(inode) != CEPH_SNAPDIR && - (ci->i_ceph_flags & CEPH_I_COMPLETE) && + ceph_dir_test_complete(inode) && __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); err = __dcache_readdir(filp, dirent, filldir); if (err != -EAGAIN) return err; } else { - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } if (fi->dentry) { err = note_last_dentry(fi, fi->dentry->d_name.name, @@ -351,7 +351,7 @@ more: if (!req->r_did_prepopulate) { dout("readdir !did_prepopulate"); - fi->dir_release_count--; /* preclude I_COMPLETE */ + fi->dir_release_count--; /* preclude D_COMPLETE */ } /* note next offset and last dentry name */ @@ -428,13 +428,12 @@ more: * were released during the whole readdir, and we should have * the complete dir contents in our cache. */ - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (ci->i_release_count == fi->dir_release_count) { - dout(" marking %p complete\n", inode); - /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */ + ceph_dir_set_complete(inode); ci->i_max_offset = filp->f_pos; } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); dout("readdir %p filp %p done.\n", inode, filp); return 0; @@ -608,21 +607,21 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, struct ceph_inode_info *ci = ceph_inode(dir); struct ceph_dentry_info *di = ceph_dentry(dentry); - spin_lock(&dir->i_lock); + spin_lock(&ci->i_ceph_lock); dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags); if (strncmp(dentry->d_name.name, fsc->mount_options->snapdir_name, dentry->d_name.len) && !is_root_ceph_dentry(dir, dentry) && - (ci->i_ceph_flags & CEPH_I_COMPLETE) && + ceph_dir_test_complete(dir) && (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { - spin_unlock(&dir->i_lock); + spin_unlock(&ci->i_ceph_lock); dout(" dir %p complete, -ENOENT\n", dir); d_add(dentry, NULL); di->lease_shared_gen = ci->i_shared_gen; return NULL; } - spin_unlock(&dir->i_lock); + spin_unlock(&ci->i_ceph_lock); } op = ceph_snap(dir) == CEPH_SNAPDIR ? @@ -842,12 +841,12 @@ static int drop_caps_for_unlink(struct inode *inode) struct ceph_inode_info *ci = ceph_inode(inode); int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (inode->i_nlink == 1) { drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); ci->i_ceph_flags |= CEPH_I_NODELAY; } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return drop; } @@ -934,7 +933,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, */ /* d_move screws up d_subdirs order */ - ceph_i_clear(new_dir, CEPH_I_COMPLETE); + ceph_dir_clear_complete(new_dir); d_move(old_dentry, new_dentry); @@ -1016,10 +1015,10 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) struct ceph_dentry_info *di = ceph_dentry(dentry); int valid = 0; - spin_lock(&dir->i_lock); + spin_lock(&ci->i_ceph_lock); if (ci->i_shared_gen == di->lease_shared_gen) valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1); - spin_unlock(&dir->i_lock); + spin_unlock(&ci->i_ceph_lock); dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n", dir, (unsigned)ci->i_shared_gen, dentry, (unsigned)di->lease_shared_gen, valid); @@ -1092,7 +1091,52 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry, return 1; } +/* + * Set/clear/test dir complete flag on the dir's dentry. + */ +void ceph_dir_set_complete(struct inode *inode) +{ + /* not yet implemented */ +} + +void ceph_dir_clear_complete(struct inode *inode) +{ + /* not yet implemented */ +} + +bool ceph_dir_test_complete(struct inode *inode) +{ + /* not yet implemented */ + return false; +} + +/* + * When the VFS prunes a dentry from the cache, we need to clear the + * complete flag on the parent directory. + * + * Called under dentry->d_lock. + */ +static void ceph_d_prune(struct dentry *dentry) +{ + struct ceph_dentry_info *di; + + dout("ceph_d_prune %p\n", dentry); + /* do we have a valid parent? */ + if (!dentry->d_parent || IS_ROOT(dentry)) + return; + + /* if we are not hashed, we don't affect D_COMPLETE */ + if (d_unhashed(dentry)) + return; + + /* + * we hold d_lock, so d_parent is stable, and d_fsdata is never + * cleared until d_release + */ + di = ceph_dentry(dentry->d_parent); + clear_bit(CEPH_D_COMPLETE, &di->flags); +} /* * read() on a dir. This weird interface hack only works if mounted @@ -1306,6 +1350,7 @@ const struct inode_operations ceph_dir_iops = { const struct dentry_operations ceph_dentry_ops = { .d_revalidate = ceph_d_revalidate, .d_release = ceph_d_release, + .d_prune = ceph_d_prune, }; const struct dentry_operations ceph_snapdir_dentry_ops = { @@ -1315,4 +1360,5 @@ const struct dentry_operations ceph_snapdir_dentry_ops = { const struct dentry_operations ceph_snap_dentry_ops = { .d_release = ceph_d_release, + .d_prune = ceph_d_prune, }; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index ce549d31eeb..ed72428d9c7 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -147,9 +147,9 @@ int ceph_open(struct inode *inode, struct file *file) /* trivially open snapdir */ if (ceph_snap(inode) == CEPH_SNAPDIR) { - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); __ceph_get_fmode(ci, fmode); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return ceph_init_file(inode, file, fmode); } @@ -158,7 +158,7 @@ int ceph_open(struct inode *inode, struct file *file) * write) or any MDS (for read). Update wanted set * asynchronously. */ - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (__ceph_is_any_real_caps(ci) && (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) { int mds_wanted = __ceph_caps_mds_wanted(ci); @@ -168,7 +168,7 @@ int ceph_open(struct inode *inode, struct file *file) inode, fmode, ceph_cap_string(wanted), ceph_cap_string(issued)); __ceph_get_fmode(ci, fmode); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); /* adjust wanted? */ if ((issued & wanted) != wanted && @@ -180,10 +180,10 @@ int ceph_open(struct inode *inode, struct file *file) } else if (ceph_snap(inode) != CEPH_NOSNAP && (ci->i_snap_caps & wanted) == wanted) { __ceph_get_fmode(ci, fmode); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return ceph_init_file(inode, file, fmode); } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted)); req = prepare_open_request(inode->i_sb, flags, 0); @@ -743,9 +743,9 @@ retry_snap: */ int dirty; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); ceph_put_cap_refs(ci, got); ret = generic_file_aio_write(iocb, iov, nr_segs, pos); @@ -764,9 +764,9 @@ retry_snap: if (ret >= 0) { int dirty; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); } @@ -797,7 +797,8 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int origin) mutex_lock(&inode->i_mutex); __ceph_do_pending_vmtruncate(inode); - if (origin != SEEK_CUR || origin != SEEK_SET) { + + if (origin == SEEK_END || origin == SEEK_DATA || origin == SEEK_HOLE) { ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); if (ret < 0) { offset = ret; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 095799ba9dd..87fb132fb33 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -9,7 +9,6 @@ #include <linux/namei.h> #include <linux/writeback.h> #include <linux/vmalloc.h> -#include <linux/pagevec.h> #include "super.h" #include "mds_client.h" @@ -298,6 +297,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) dout("alloc_inode %p\n", &ci->vfs_inode); + spin_lock_init(&ci->i_ceph_lock); + ci->i_version = 0; ci->i_time_warp_seq = 0; ci->i_ceph_flags = 0; @@ -584,7 +585,7 @@ static int fill_inode(struct inode *inode, iinfo->xattr_len); } - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); /* * provided version will be odd if inode value is projected, @@ -619,7 +620,7 @@ static int fill_inode(struct inode *inode, } if ((issued & CEPH_CAP_LINK_EXCL) == 0) - inode->i_nlink = le32_to_cpu(info->nlink); + set_nlink(inode, le32_to_cpu(info->nlink)); /* be careful with mtime, atime, size */ ceph_decode_timespec(&atime, &info->atime); @@ -681,7 +682,7 @@ static int fill_inode(struct inode *inode, char *sym; BUG_ON(symlen != inode->i_size); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); err = -ENOMEM; sym = kmalloc(symlen+1, GFP_NOFS); @@ -690,7 +691,7 @@ static int fill_inode(struct inode *inode, memcpy(sym, iinfo->symlink, symlen); sym[symlen] = 0; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (!ci->i_symlink) ci->i_symlink = sym; else @@ -716,7 +717,7 @@ static int fill_inode(struct inode *inode, } no_change: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); /* queue truncate if we saw i_size decrease */ if (queue_trunc) @@ -751,13 +752,13 @@ no_change: info->cap.flags, caps_reservation); } else { - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); dout(" %p got snap_caps %s\n", inode, ceph_cap_string(le32_to_cpu(info->cap.caps))); ci->i_snap_caps |= le32_to_cpu(info->cap.caps); if (cap_fmode >= 0) __ceph_get_fmode(ci, cap_fmode); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } } else if (cap_fmode >= 0) { pr_warning("mds issued no caps on %llx.%llx\n", @@ -772,9 +773,9 @@ no_change: ceph_snap(inode) == CEPH_NOSNAP && (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && (issued & CEPH_CAP_FILE_EXCL) == 0 && - (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { + !ceph_dir_test_complete(inode)) { dout(" marking %p complete (empty)\n", inode); - /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */ + ceph_dir_set_complete(inode); ci->i_max_offset = 2; } @@ -850,19 +851,20 @@ static void ceph_set_dentry_offset(struct dentry *dn) { struct dentry *dir = dn->d_parent; struct inode *inode = dir->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_dentry_info *di; BUG_ON(!inode); di = ceph_dentry(dn); - spin_lock(&inode->i_lock); - if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) { - spin_unlock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); + if (!ceph_dir_test_complete(inode)) { + spin_unlock(&ci->i_ceph_lock); return; } di->offset = ceph_inode(inode)->i_max_offset++; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); spin_lock(&dir->d_lock); spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); @@ -1057,7 +1059,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, * d_move() puts the renamed dentry at the end of * d_subdirs. We need to assign it an appropriate * directory offset so we can behave when holding - * I_COMPLETE. + * D_COMPLETE. */ ceph_set_dentry_offset(req->r_old_dentry); dout("dn %p gets new offset %lld\n", req->r_old_dentry, @@ -1309,7 +1311,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size) struct ceph_inode_info *ci = ceph_inode(inode); int ret = 0; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); inode->i_size = size; inode->i_blocks = (size + (1 << 9) - 1) >> 9; @@ -1319,7 +1321,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size) (ci->i_reported_size << 1) < ci->i_max_size) ret = 1; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return ret; } @@ -1329,12 +1331,13 @@ int ceph_inode_set_size(struct inode *inode, loff_t size) */ void ceph_queue_writeback(struct inode *inode) { + ihold(inode); if (queue_work(ceph_inode_to_client(inode)->wb_wq, &ceph_inode(inode)->i_wb_work)) { dout("ceph_queue_writeback %p\n", inode); - ihold(inode); } else { dout("ceph_queue_writeback %p failed\n", inode); + iput(inode); } } @@ -1354,55 +1357,13 @@ static void ceph_writeback_work(struct work_struct *work) */ void ceph_queue_invalidate(struct inode *inode) { + ihold(inode); if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq, &ceph_inode(inode)->i_pg_inv_work)) { dout("ceph_queue_invalidate %p\n", inode); - ihold(inode); } else { dout("ceph_queue_invalidate %p failed\n", inode); - } -} - -/* - * invalidate any pages that are not dirty or under writeback. this - * includes pages that are clean and mapped. - */ -static void ceph_invalidate_nondirty_pages(struct address_space *mapping) -{ - struct pagevec pvec; - pgoff_t next = 0; - int i; - - pagevec_init(&pvec, 0); - while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; - pgoff_t index; - int skip_page = - (PageDirty(page) || PageWriteback(page)); - - if (!skip_page) - skip_page = !trylock_page(page); - - /* - * We really shouldn't be looking at the ->index of an - * unlocked page. But we're not allowed to lock these - * pages. So we rely upon nobody altering the ->index - * of this (pinned-by-us) page. - */ - index = page->index; - if (index > next) - next = index; - next++; - - if (skip_page) - continue; - - generic_error_remove_page(mapping, page); - unlock_page(page); - } - pagevec_release(&pvec); - cond_resched(); + iput(inode); } } @@ -1418,20 +1379,20 @@ static void ceph_invalidate_work(struct work_struct *work) u32 orig_gen; int check = 0; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); dout("invalidate_pages %p gen %d revoking %d\n", inode, ci->i_rdcache_gen, ci->i_rdcache_revoking); if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { /* nevermind! */ - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); goto out; } orig_gen = ci->i_rdcache_gen; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); - ceph_invalidate_nondirty_pages(inode->i_mapping); + truncate_inode_pages(&inode->i_data, 0); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (orig_gen == ci->i_rdcache_gen && orig_gen == ci->i_rdcache_revoking) { dout("invalidate_pages %p gen %d successful\n", inode, @@ -1443,7 +1404,7 @@ static void ceph_invalidate_work(struct work_struct *work) inode, orig_gen, ci->i_rdcache_gen, ci->i_rdcache_revoking); } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (check) ceph_check_caps(ci, 0, NULL); @@ -1478,13 +1439,14 @@ void ceph_queue_vmtruncate(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); + ihold(inode); if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq, &ci->i_vmtruncate_work)) { dout("ceph_queue_vmtruncate %p\n", inode); - ihold(inode); } else { dout("ceph_queue_vmtruncate %p failed, pending=%d\n", inode, ci->i_truncate_pending); + iput(inode); } } @@ -1501,10 +1463,10 @@ void __ceph_do_pending_vmtruncate(struct inode *inode) int wrbuffer_refs, wake = 0; retry: - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (ci->i_truncate_pending == 0) { dout("__do_pending_vmtruncate %p none pending\n", inode); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return; } @@ -1515,7 +1477,7 @@ retry: if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) { dout("__do_pending_vmtruncate %p flushing snaps first\n", inode); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); filemap_write_and_wait_range(&inode->i_data, 0, inode->i_sb->s_maxbytes); goto retry; @@ -1525,15 +1487,15 @@ retry: wrbuffer_refs = ci->i_wrbuffer_ref; dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode, ci->i_truncate_pending, to); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); truncate_inode_pages(inode->i_mapping, to); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); ci->i_truncate_pending--; if (ci->i_truncate_pending == 0) wake = 1; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (wrbuffer_refs == 0) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); @@ -1588,7 +1550,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (IS_ERR(req)) return PTR_ERR(req); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); issued = __ceph_caps_issued(ci, NULL); dout("setattr %p issued %s\n", inode, ceph_cap_string(issued)); @@ -1736,7 +1698,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) } release &= issued; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (inode_dirty_flags) __mark_inode_dirty(inode, inode_dirty_flags); @@ -1758,7 +1720,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) __ceph_do_pending_vmtruncate(inode); return err; out: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); ceph_mdsc_put_request(req); return err; } diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 3b256b50f7d..790914a598d 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -42,17 +42,39 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; struct ceph_ioctl_layout l; + struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode); + struct ceph_ioctl_layout nl; int err, i; - /* copy and validate */ if (copy_from_user(&l, arg, sizeof(l))) return -EFAULT; - if ((l.object_size & ~PAGE_MASK) || - (l.stripe_unit & ~PAGE_MASK) || - !l.stripe_unit || - (l.object_size && - (unsigned)l.object_size % (unsigned)l.stripe_unit)) + /* validate changed params against current layout */ + err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT); + if (!err) { + nl.stripe_unit = ceph_file_layout_su(ci->i_layout); + nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout); + nl.object_size = ceph_file_layout_object_size(ci->i_layout); + nl.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool); + nl.preferred_osd = + (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred); + } else + return err; + + if (l.stripe_count) + nl.stripe_count = l.stripe_count; + if (l.stripe_unit) + nl.stripe_unit = l.stripe_unit; + if (l.object_size) + nl.object_size = l.object_size; + if (l.data_pool) + nl.data_pool = l.data_pool; + if (l.preferred_osd) + nl.preferred_osd = l.preferred_osd; + + if ((nl.object_size & ~PAGE_MASK) || + (nl.stripe_unit & ~PAGE_MASK) || + ((unsigned)nl.object_size % (unsigned)nl.stripe_unit)) return -EINVAL; /* make sure it's a valid data pool */ @@ -219,11 +241,11 @@ static long ceph_ioctl_lazyio(struct file *file) struct ceph_inode_info *ci = ceph_inode(inode); if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) { - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); ci->i_nr_by_mode[fi->fmode]--; fi->fmode |= CEPH_FILE_MODE_LAZY; ci->i_nr_by_mode[fi->fmode]++; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); dout("ioctl_layzio: file %p marked lazy\n", file); ceph_check_caps(ci, 0, NULL); diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h index 0c5167e4318..be4a6048733 100644 --- a/fs/ceph/ioctl.h +++ b/fs/ceph/ioctl.h @@ -6,7 +6,31 @@ #define CEPH_IOCTL_MAGIC 0x97 -/* just use u64 to align sanely on all archs */ +/* + * CEPH_IOC_GET_LAYOUT - get file layout or dir layout policy + * CEPH_IOC_SET_LAYOUT - set file layout + * CEPH_IOC_SET_LAYOUT_POLICY - set dir layout policy + * + * The file layout specifies how file data is striped over objects in + * the distributed object store, which object pool they belong to (if + * it differs from the default), and an optional 'preferred osd' to + * store them on. + * + * Files get a new layout based on the policy set on the containing + * directory or one of its ancestors. The GET_LAYOUT ioctl will let + * you examine the layout for a file or the policy on a directory. + * + * SET_LAYOUT will let you set a layout on a newly created file. This + * only works immediately after the file is created and before any + * data is written to it. + * + * SET_LAYOUT_POLICY will let you set a layout policy (default layout) + * on a directory that will apply to any new files created in that + * directory (or any child directory that doesn't specify a layout of + * its own). + */ + +/* use u64 to align sanely on all archs */ struct ceph_ioctl_layout { __u64 stripe_unit, stripe_count, object_size; __u64 data_pool; @@ -21,6 +45,8 @@ struct ceph_ioctl_layout { struct ceph_ioctl_layout) /* + * CEPH_IOC_GET_DATALOC - get location of file data in the cluster + * * Extract identity, address of the OSD and object storing a given * file offset. */ @@ -39,7 +65,34 @@ struct ceph_ioctl_dataloc { #define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \ struct ceph_ioctl_dataloc) +/* + * CEPH_IOC_LAZYIO - relax consistency + * + * Normally Ceph switches to synchronous IO when multiple clients have + * the file open (and or more for write). Reads and writes bypass the + * page cache and go directly to the OSD. Setting this flag on a file + * descriptor will allow buffered IO for this file in cases where the + * application knows it won't interfere with other nodes (or doesn't + * care). + */ #define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4) + +/* + * CEPH_IOC_SYNCIO - force synchronous IO + * + * This ioctl sets a file flag that forces the synchronous IO that + * bypasses the page cache, even if it is not necessary. This is + * essentially the opposite behavior of IOC_LAZYIO. This forces the + * same read/write path as a file opened by multiple clients when one + * or more of those clients is opened for write. + * + * Note that this type of sync IO takes a different path than a file + * opened with O_SYNC/D_SYNC (writes hit the page cache and are + * immediately flushed on page boundaries). It is very similar to + * O_DIRECT (writes bypass the page cache) excep that O_DIRECT writes + * are not copied (user page must remain stable) and O_DIRECT writes + * have alignment restrictions (on the buffer and file offset). + */ #define CEPH_IOC_SYNCIO _IO(CEPH_IOCTL_MAGIC, 5) #endif diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 86c59e16ba7..6203d805eb4 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -619,7 +619,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc, * * Called under mdsc->mutex. */ -struct dentry *get_nonsnap_parent(struct dentry *dentry) +static struct dentry *get_nonsnap_parent(struct dentry *dentry) { /* * we don't need to worry about protecting the d_parent access @@ -732,21 +732,21 @@ static int __choose_mds(struct ceph_mds_client *mdsc, } } - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); cap = NULL; if (mode == USE_AUTH_MDS) cap = ci->i_auth_cap; if (!cap && !RB_EMPTY_ROOT(&ci->i_caps)) cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node); if (!cap) { - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); goto random; } mds = cap->session->s_mds; dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n", inode, ceph_vinop(inode), mds, cap == ci->i_auth_cap ? "auth " : "", cap); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return mds; random: @@ -764,7 +764,8 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq) struct ceph_msg *msg; struct ceph_mds_session_head *h; - msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS); + msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS, + false); if (!msg) { pr_err("create_session_msg ENOMEM creating msg\n"); return NULL; @@ -950,7 +951,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, dout("removing cap %p, ci is %p, inode is %p\n", cap, ci, &ci->vfs_inode); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); __ceph_remove_cap(cap); if (!__ceph_is_any_real_caps(ci)) { struct ceph_mds_client *mdsc = @@ -983,7 +984,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, } spin_unlock(&mdsc->cap_dirty_lock); } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); while (drop--) iput(inode); return 0; @@ -1014,10 +1015,10 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, wake_up_all(&ci->i_cap_wq); if (arg) { - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); ci->i_wanted_max_size = 0; ci->i_requested_max_size = 0; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } return 0; } @@ -1150,7 +1151,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) if (session->s_trim_caps <= 0) return -1; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); mine = cap->issued | cap->implemented; used = __ceph_caps_used(ci); oissued = __ceph_caps_issued_other(ci, cap); @@ -1169,7 +1170,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) __ceph_remove_cap(cap); } else { /* try to drop referring dentries */ - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); d_prune_aliases(inode); dout("trim_caps_cb %p cap %p pruned, count now %d\n", inode, cap, atomic_read(&inode->i_count)); @@ -1177,7 +1178,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) } out: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return 0; } @@ -1240,7 +1241,7 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc, while (session->s_num_cap_releases < session->s_nr_caps + extra) { spin_unlock(&session->s_cap_lock); msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, - GFP_NOFS); + GFP_NOFS, false); if (!msg) goto out_unlocked; dout("add_cap_releases %p msg %p now %d\n", session, msg, @@ -1295,7 +1296,7 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) i_flushing_item); struct inode *inode = &ci->vfs_inode; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (ci->i_cap_flush_seq <= want_flush_seq) { dout("check_cap_flush still flushing %p " "seq %lld <= %lld to mds%d\n", inode, @@ -1303,7 +1304,7 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) session->s_mds); ret = 0; } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); @@ -1494,6 +1495,7 @@ retry: pos, temp); } else if (stop_on_nosnap && inode && ceph_snap(inode) == CEPH_NOSNAP) { + spin_unlock(&temp->d_lock); break; } else { pos -= temp->d_name.len; @@ -1652,7 +1654,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, if (req->r_old_dentry_drop) len += req->r_old_dentry->d_name.len; - msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS); + msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false); if (!msg) { msg = ERR_PTR(-ENOMEM); goto out_free2; @@ -2001,7 +2003,7 @@ out: } /* - * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS + * Invalidate dir D_COMPLETE, dentry lease state on an aborted MDS * namespace request. */ void ceph_invalidate_dir_request(struct ceph_mds_request *req) @@ -2009,11 +2011,11 @@ void ceph_invalidate_dir_request(struct ceph_mds_request *req) struct inode *inode = req->r_locked_dir; struct ceph_inode_info *ci = ceph_inode(inode); - dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode); - spin_lock(&inode->i_lock); - ci->i_ceph_flags &= ~CEPH_I_COMPLETE; + dout("invalidate_dir_request %p (D_COMPLETE, lease(s))\n", inode); + spin_lock(&ci->i_ceph_lock); + ceph_dir_clear_complete(inode); ci->i_release_count++; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (req->r_dentry) ceph_invalidate_dentry_lease(req->r_dentry); @@ -2421,7 +2423,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, if (err) goto out_free; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); cap->seq = 0; /* reset cap seq */ cap->issue_seq = 0; /* and issue_seq */ @@ -2444,7 +2446,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, rec.v1.pathbase = cpu_to_le64(pathbase); reclen = sizeof(rec.v1); } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (recon_state->flock) { int num_fcntl_locks, num_flock_locks; @@ -2518,7 +2520,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, goto fail_nopagelist; ceph_pagelist_init(pagelist); - reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS); + reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false); if (!reply) goto fail_nomsg; @@ -2831,7 +2833,7 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, dnamelen = dentry->d_name.len; len += dnamelen; - msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS); + msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false); if (!msg) return; lease = msg->front.iov_base; @@ -3153,7 +3155,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) /* * true if all sessions are closed, or we force unmount */ -bool done_closing_sessions(struct ceph_mds_client *mdsc) +static bool done_closing_sessions(struct ceph_mds_client *mdsc) { int i, n = 0; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 4bb239921db..a50ca0e3947 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -20,7 +20,7 @@ * * mdsc->snap_rwsem * - * inode->i_lock + * ci->i_ceph_lock * mdsc->snap_flush_lock * mdsc->cap_delay_lock * diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index e2643719133..a559c80f127 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -446,7 +446,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) return; } - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); used = __ceph_caps_used(ci); dirty = __ceph_caps_dirty(ci); @@ -528,7 +528,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) kfree(capsnap); } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } /* @@ -537,7 +537,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) * * If capsnap can now be flushed, add to snap_flush list, and return 1. * - * Caller must hold i_lock. + * Caller must hold i_ceph_lock. */ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap) @@ -739,9 +739,9 @@ static void flush_snaps(struct ceph_mds_client *mdsc) inode = &ci->vfs_inode; ihold(inode); spin_unlock(&mdsc->snap_flush_lock); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); __ceph_flush_snaps(ci, &session, 0); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); iput(inode); spin_lock(&mdsc->snap_flush_lock); } @@ -847,7 +847,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, continue; ci = ceph_inode(inode); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (!ci->i_snap_realm) goto skip_inode; /* @@ -876,7 +876,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, oldrealm = ci->i_snap_realm; ci->i_snap_realm = realm; spin_unlock(&realm->inodes_with_caps_lock); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); ceph_get_snap_realm(mdsc, realm); ceph_put_snap_realm(mdsc, oldrealm); @@ -885,7 +885,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, continue; skip_inode: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); iput(inode); } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 88bacaf385d..b48f15f101a 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -114,6 +114,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait) enum { Opt_wsize, Opt_rsize, + Opt_rasize, Opt_caps_wanted_delay_min, Opt_caps_wanted_delay_max, Opt_cap_release_safety, @@ -136,6 +137,7 @@ enum { static match_table_t fsopt_tokens = { {Opt_wsize, "wsize=%d"}, {Opt_rsize, "rsize=%d"}, + {Opt_rasize, "rasize=%d"}, {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, {Opt_cap_release_safety, "cap_release_safety=%d"}, @@ -196,6 +198,9 @@ static int parse_fsopt_token(char *c, void *private) case Opt_rsize: fsopt->rsize = intval; break; + case Opt_rasize: + fsopt->rasize = intval; + break; case Opt_caps_wanted_delay_min: fsopt->caps_wanted_delay_min = intval; break; @@ -289,28 +294,29 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name); - fsopt->sb_flags = flags; - fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; + fsopt->sb_flags = flags; + fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; - fsopt->rsize = CEPH_RSIZE_DEFAULT; - fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); + fsopt->rsize = CEPH_RSIZE_DEFAULT; + fsopt->rasize = CEPH_RASIZE_DEFAULT; + fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; - fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; - fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; - fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; - fsopt->congestion_kb = default_congestion_kb(); - - /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ - err = -EINVAL; - if (!dev_name) - goto out; - *path = strstr(dev_name, ":/"); - if (*path == NULL) { - pr_err("device name is missing path (no :/ in %s)\n", - dev_name); - goto out; - } + fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; + fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; + fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; + fsopt->congestion_kb = default_congestion_kb(); + + /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ + err = -EINVAL; + if (!dev_name) + goto out; + *path = strstr(dev_name, ":/"); + if (*path == NULL) { + pr_err("device name is missing path (no :/ in %s)\n", + dev_name); + goto out; + } dev_name_end = *path; dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); @@ -376,6 +382,8 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) seq_printf(m, ",wsize=%d", fsopt->wsize); if (fsopt->rsize != CEPH_RSIZE_DEFAULT) seq_printf(m, ",rsize=%d", fsopt->rsize); + if (fsopt->rasize != CEPH_RASIZE_DEFAULT) + seq_printf(m, ",rasize=%d", fsopt->rasize); if (fsopt->congestion_kb != default_congestion_kb()) seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) @@ -418,24 +426,27 @@ static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg) /* * create a new fs client */ -struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, +static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, struct ceph_options *opt) { struct ceph_fs_client *fsc; + const unsigned supported_features = + CEPH_FEATURE_FLOCK | + CEPH_FEATURE_DIRLAYOUTHASH; + const unsigned required_features = 0; int err = -ENOMEM; fsc = kzalloc(sizeof(*fsc), GFP_KERNEL); if (!fsc) return ERR_PTR(-ENOMEM); - fsc->client = ceph_create_client(opt, fsc); + fsc->client = ceph_create_client(opt, fsc, supported_features, + required_features); if (IS_ERR(fsc->client)) { err = PTR_ERR(fsc->client); goto fail; } fsc->client->extra_mon_dispatch = extra_mon_dispatch; - fsc->client->supported_features |= CEPH_FEATURE_FLOCK | - CEPH_FEATURE_DIRLAYOUTHASH; fsc->client->monc.want_mdsmap = 1; fsc->mount_options = fsopt; @@ -491,7 +502,7 @@ fail: return ERR_PTR(err); } -void destroy_fs_client(struct ceph_fs_client *fsc) +static void destroy_fs_client(struct ceph_fs_client *fsc) { dout("destroy_fs_client %p\n", fsc); @@ -627,10 +638,12 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, if (err == 0) { dout("open_root_inode success\n"); if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT && - fsc->sb->s_root == NULL) + fsc->sb->s_root == NULL) { root = d_alloc_root(req->r_target_inode); - else + ceph_init_dentry(root); + } else { root = d_obtain_alias(req->r_target_inode); + } req->r_target_inode = NULL; dout("open_root_inode success, root dentry is %p\n", root); } else { @@ -774,10 +787,10 @@ static int ceph_register_bdi(struct super_block *sb, { int err; - /* set ra_pages based on rsize mount option? */ - if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE) + /* set ra_pages based on rasize mount option? */ + if (fsc->mount_options->rasize >= PAGE_CACHE_SIZE) fsc->backing_dev_info.ra_pages = - (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) + (fsc->mount_options->rasize + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT; else fsc->backing_dev_info.ra_pages = diff --git a/fs/ceph/super.h b/fs/ceph/super.h index a23eed526f0..edcbf3774a5 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -36,7 +36,8 @@ #define ceph_test_mount_opt(fsc, opt) \ (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) -#define CEPH_RSIZE_DEFAULT (512*1024) /* readahead */ +#define CEPH_RSIZE_DEFAULT 0 /* max read size */ +#define CEPH_RASIZE_DEFAULT (8192*1024) /* readahead */ #define CEPH_MAX_READDIR_DEFAULT 1024 #define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) #define CEPH_SNAPDIRNAME_DEFAULT ".snap" @@ -45,8 +46,9 @@ struct ceph_mount_options { int flags; int sb_flags; - int wsize; - int rsize; /* max readahead */ + int wsize; /* max write size */ + int rsize; /* max read size */ + int rasize; /* max readahead */ int congestion_kb; /* max writeback in flight */ int caps_wanted_delay_min, caps_wanted_delay_max; int cap_release_safety; @@ -201,6 +203,7 @@ struct ceph_inode_xattr { * Ceph dentry state */ struct ceph_dentry_info { + unsigned long flags; struct ceph_mds_session *lease_session; u32 lease_gen, lease_shared_gen; u32 lease_seq; @@ -211,6 +214,18 @@ struct ceph_dentry_info { u64 offset; }; +/* + * dentry flags + * + * The locking for D_COMPLETE is a bit odd: + * - we can clear it at almost any time (see ceph_d_prune) + * - it is only meaningful if: + * - we hold dir inode i_ceph_lock + * - we hold dir FILE_SHARED caps + * - the dentry D_COMPLETE is set + */ +#define CEPH_D_COMPLETE 1 /* if set, d_u.d_subdirs is complete directory */ + struct ceph_inode_xattrs_info { /* * (still encoded) xattr blob. we avoid the overhead of parsing @@ -235,6 +250,8 @@ struct ceph_inode_xattrs_info { struct ceph_inode_info { struct ceph_vino i_vino; /* ceph ino + snap */ + spinlock_t i_ceph_lock; + u64 i_version; u32 i_time_warp_seq; @@ -249,14 +266,14 @@ struct ceph_inode_info { struct timespec i_rctime; u64 i_rbytes, i_rfiles, i_rsubdirs; u64 i_files, i_subdirs; - u64 i_max_offset; /* largest readdir offset, set with I_COMPLETE */ + u64 i_max_offset; /* largest readdir offset, set with D_COMPLETE */ struct rb_root i_fragtree; struct mutex i_fragtree_mutex; struct ceph_inode_xattrs_info i_xattrs; - /* capabilities. protected _both_ by i_lock and cap->session's + /* capabilities. protected _both_ by i_ceph_lock and cap->session's * s_mutex. */ struct rb_root i_caps; /* cap list */ struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ @@ -344,9 +361,10 @@ static inline struct ceph_vino ceph_vino(struct inode *inode) * x86_64+ino32 64 32 * x86_64 64 64 */ -static inline u32 ceph_ino_to_ino32(ino_t ino) +static inline u32 ceph_ino_to_ino32(__u64 vino) { - ino ^= ino >> (sizeof(ino) * 8 - 32); + u32 ino = vino & 0xffffffff; + ino ^= vino >> 32; if (!ino) ino = 1; return ino; @@ -357,11 +375,11 @@ static inline u32 ceph_ino_to_ino32(ino_t ino) */ static inline ino_t ceph_vino_to_ino(struct ceph_vino vino) { - ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */ #if BITS_PER_LONG == 32 - ino = ceph_ino_to_ino32(ino); + return ceph_ino_to_ino32(vino.ino); +#else + return (ino_t)vino.ino; #endif - return ino; } /* @@ -413,7 +431,6 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, /* * Ceph inode. */ -#define CEPH_I_COMPLETE 1 /* we have complete directory cached */ #define CEPH_I_NODELAY 4 /* do not delay cap release */ #define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ #define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ @@ -422,18 +439,18 @@ static inline void ceph_i_clear(struct inode *inode, unsigned mask) { struct ceph_inode_info *ci = ceph_inode(inode); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); ci->i_ceph_flags &= ~mask; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } static inline void ceph_i_set(struct inode *inode, unsigned mask) { struct ceph_inode_info *ci = ceph_inode(inode); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); ci->i_ceph_flags |= mask; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } static inline bool ceph_i_test(struct inode *inode, unsigned mask) @@ -441,9 +458,9 @@ static inline bool ceph_i_test(struct inode *inode, unsigned mask) struct ceph_inode_info *ci = ceph_inode(inode); bool r; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); r = (ci->i_ceph_flags & mask) == mask; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return r; } @@ -471,6 +488,13 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) } /* + * set/clear directory D_COMPLETE flag + */ +void ceph_dir_set_complete(struct inode *inode); +void ceph_dir_clear_complete(struct inode *inode); +bool ceph_dir_test_complete(struct inode *inode); + +/* * caps helpers */ static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci) @@ -486,9 +510,9 @@ extern int __ceph_caps_issued_other(struct ceph_inode_info *ci, static inline int ceph_caps_issued(struct ceph_inode_info *ci) { int issued; - spin_lock(&ci->vfs_inode.i_lock); + spin_lock(&ci->i_ceph_lock); issued = __ceph_caps_issued(ci, NULL); - spin_unlock(&ci->vfs_inode.i_lock); + spin_unlock(&ci->i_ceph_lock); return issued; } @@ -496,9 +520,9 @@ static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) { int r; - spin_lock(&ci->vfs_inode.i_lock); + spin_lock(&ci->i_ceph_lock); r = __ceph_caps_issued_mask(ci, mask, touch); - spin_unlock(&ci->vfs_inode.i_lock); + spin_unlock(&ci->i_ceph_lock); return r; } @@ -721,10 +745,9 @@ extern int ceph_add_cap(struct inode *inode, extern void __ceph_remove_cap(struct ceph_cap *cap); static inline void ceph_remove_cap(struct ceph_cap *cap) { - struct inode *inode = &cap->ci->vfs_inode; - spin_lock(&inode->i_lock); + spin_lock(&cap->ci->i_ceph_lock); __ceph_remove_cap(cap); - spin_unlock(&inode->i_lock); + spin_unlock(&cap->ci->i_ceph_lock); } extern void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 96c6739a028..a5e36e4488a 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -343,8 +343,8 @@ void __ceph_destroy_xattrs(struct ceph_inode_info *ci) } static int __build_xattrs(struct inode *inode) - __releases(inode->i_lock) - __acquires(inode->i_lock) + __releases(ci->i_ceph_lock) + __acquires(ci->i_ceph_lock) { u32 namelen; u32 numattr = 0; @@ -372,7 +372,7 @@ start: end = p + ci->i_xattrs.blob->vec.iov_len; ceph_decode_32_safe(&p, end, numattr, bad); xattr_version = ci->i_xattrs.version; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *), GFP_NOFS); @@ -387,7 +387,7 @@ start: goto bad_lock; } - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (ci->i_xattrs.version != xattr_version) { /* lost a race, retry */ for (i = 0; i < numattr; i++) @@ -418,7 +418,7 @@ start: return err; bad_lock: - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); bad: if (xattrs) { for (i = 0; i < numattr; i++) @@ -512,7 +512,7 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, if (vxattrs) vxattr = ceph_match_vxattr(vxattrs, name); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); dout("getxattr %p ver=%lld index_ver=%lld\n", inode, ci->i_xattrs.version, ci->i_xattrs.index_version); @@ -520,14 +520,14 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { goto get_xattr; } else { - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); /* get xattrs from mds (if we don't already have them) */ err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); if (err) return err; } - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (vxattr && vxattr->readonly) { err = vxattr->getxattr_cb(ci, value, size); @@ -558,7 +558,7 @@ get_xattr: memcpy(value, xattr->val, xattr->val_len); out: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return err; } @@ -573,7 +573,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) u32 len; int i; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); dout("listxattr %p ver=%lld index_ver=%lld\n", inode, ci->i_xattrs.version, ci->i_xattrs.index_version); @@ -581,13 +581,13 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { goto list_xattr; } else { - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); if (err) return err; } - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); err = __build_xattrs(inode); if (err < 0) @@ -619,7 +619,7 @@ list_xattr: } out: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return err; } @@ -739,7 +739,7 @@ int ceph_setxattr(struct dentry *dentry, const char *name, if (!xattr) goto out; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); retry: issued = __ceph_caps_issued(ci, NULL); if (!(issued & CEPH_CAP_XATTR_EXCL)) @@ -752,12 +752,12 @@ retry: required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { struct ceph_buffer *blob = NULL; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); dout(" preaallocating new blob size=%d\n", required_blob_size); blob = ceph_buffer_new(required_blob_size, GFP_NOFS); if (!blob) goto out; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (ci->i_xattrs.prealloc_blob) ceph_buffer_put(ci->i_xattrs.prealloc_blob); ci->i_xattrs.prealloc_blob = blob; @@ -770,13 +770,13 @@ retry: dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); ci->i_xattrs.dirty = true; inode->i_ctime = CURRENT_TIME; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); return err; do_sync: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); err = ceph_sync_setxattr(dentry, name, value, size, flags); out: kfree(newname); @@ -833,7 +833,7 @@ int ceph_removexattr(struct dentry *dentry, const char *name) return -EOPNOTSUPP; } - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); __build_xattrs(inode); issued = __ceph_caps_issued(ci, NULL); dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); @@ -846,12 +846,12 @@ int ceph_removexattr(struct dentry *dentry, const char *name) ci->i_xattrs.dirty = true; inode->i_ctime = CURRENT_TIME; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); return err; do_sync: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); err = ceph_send_removexattr(dentry, name); return err; } diff --git a/fs/cifs/README b/fs/cifs/README index c5c2c5e5f0f..895da1dc155 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -745,4 +745,18 @@ installed and something like the following lines should be added to the create cifs.spnego * * /usr/local/sbin/cifs.upcall %k create dns_resolver * * /usr/local/sbin/cifs.upcall %k +CIFS kernel module parameters +============================= +These module parameters can be specified or modified either during the time of +module loading or during the runtime by using the interface + /proc/module/cifs/parameters/<param> + +i.e. echo "value" > /sys/module/cifs/parameters/<param> + +1. echo_retries - The number of echo attempts before giving up and + reconnecting to the server. The default is 5. The value 0 + means never reconnect. + +2. enable_oplocks - Enable or disable oplocks. Oplocks are enabled by default. + [Y/y/1]. To disable use any of [N/n/0]. diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 6d40656e1e2..84e8c072470 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -511,7 +511,7 @@ static const struct file_operations cifsFYI_proc_fops = { static int cifs_oplock_proc_show(struct seq_file *m, void *v) { - seq_printf(m, "%d\n", oplockEnabled); + seq_printf(m, "%d\n", enable_oplocks); return 0; } @@ -526,13 +526,16 @@ static ssize_t cifs_oplock_proc_write(struct file *file, char c; int rc; + printk(KERN_WARNING "CIFS: The /proc/fs/cifs/OplockEnabled interface " + "will be removed in kernel version 3.4. Please migrate to " + "using the 'enable_oplocks' module parameter in cifs.ko.\n"); rc = get_user(c, buffer); if (rc) return rc; if (c == '0' || c == 'n' || c == 'N') - oplockEnabled = 0; + enable_oplocks = false; else if (c == '1' || c == 'y' || c == 'Y') - oplockEnabled = 1; + enable_oplocks = true; return count; } diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 7260e11e21f..500d6585927 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -43,6 +43,8 @@ #define CIFS_MOUNT_STRICT_IO 0x40000 /* strict cache mode */ #define CIFS_MOUNT_RWPIDFORWARD 0x80000 /* use pid forwarding for rw */ #define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of MS_POSIXACL in mnt_cifs_flags */ +#define CIFS_MOUNT_CIFS_BACKUPUID 0x200000 /* backup intent bit for a user */ +#define CIFS_MOUNT_CIFS_BACKUPGID 0x400000 /* backup intent bit for a group */ struct cifs_sb_info { struct rb_root tlink_tree; @@ -55,6 +57,8 @@ struct cifs_sb_info { atomic_t active; uid_t mnt_uid; gid_t mnt_gid; + uid_t mnt_backupuid; + gid_t mnt_backupgid; mode_t mnt_file_mode; mode_t mnt_dir_mode; unsigned int mnt_cifs_flags; diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index d0f59faefb7..72ddf23ef6f 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -91,9 +91,76 @@ cifs_idmap_shrinker(struct shrinker *shrink, struct shrink_control *sc) shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); spin_unlock(&sidgidlock); + root = &siduidtree; + spin_lock(&uidsidlock); + shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); + spin_unlock(&uidsidlock); + + root = &sidgidtree; + spin_lock(&gidsidlock); + shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); + spin_unlock(&gidsidlock); + return nr_rem; } +static void +sid_rb_insert(struct rb_root *root, unsigned long cid, + struct cifs_sid_id **psidid, char *typestr) +{ + char *strptr; + struct rb_node *node = root->rb_node; + struct rb_node *parent = NULL; + struct rb_node **linkto = &(root->rb_node); + struct cifs_sid_id *lsidid; + + while (node) { + lsidid = rb_entry(node, struct cifs_sid_id, rbnode); + parent = node; + if (cid > lsidid->id) { + linkto = &(node->rb_left); + node = node->rb_left; + } + if (cid < lsidid->id) { + linkto = &(node->rb_right); + node = node->rb_right; + } + } + + (*psidid)->id = cid; + (*psidid)->time = jiffies - (SID_MAP_RETRY + 1); + (*psidid)->refcount = 0; + + sprintf((*psidid)->sidstr, "%s", typestr); + strptr = (*psidid)->sidstr + strlen((*psidid)->sidstr); + sprintf(strptr, "%ld", cid); + + clear_bit(SID_ID_PENDING, &(*psidid)->state); + clear_bit(SID_ID_MAPPED, &(*psidid)->state); + + rb_link_node(&(*psidid)->rbnode, parent, linkto); + rb_insert_color(&(*psidid)->rbnode, root); +} + +static struct cifs_sid_id * +sid_rb_search(struct rb_root *root, unsigned long cid) +{ + struct rb_node *node = root->rb_node; + struct cifs_sid_id *lsidid; + + while (node) { + lsidid = rb_entry(node, struct cifs_sid_id, rbnode); + if (cid > lsidid->id) + node = node->rb_left; + else if (cid < lsidid->id) + node = node->rb_right; + else /* node found */ + return lsidid; + } + + return NULL; +} + static struct shrinker cifs_shrinker = { .shrink = cifs_idmap_shrinker, .seeks = DEFAULT_SEEKS, @@ -110,6 +177,7 @@ cifs_idmap_key_instantiate(struct key *key, const void *data, size_t datalen) memcpy(payload, data, datalen); key->payload.data = payload; + key->datalen = datalen; return 0; } @@ -224,6 +292,120 @@ sidid_pending_wait(void *unused) } static int +id_to_sid(unsigned long cid, uint sidtype, struct cifs_sid *ssid) +{ + int rc = 0; + struct key *sidkey; + const struct cred *saved_cred; + struct cifs_sid *lsid; + struct cifs_sid_id *psidid, *npsidid; + struct rb_root *cidtree; + spinlock_t *cidlock; + + if (sidtype == SIDOWNER) { + cidlock = &siduidlock; + cidtree = &uidtree; + } else if (sidtype == SIDGROUP) { + cidlock = &sidgidlock; + cidtree = &gidtree; + } else + return -EINVAL; + + spin_lock(cidlock); + psidid = sid_rb_search(cidtree, cid); + + if (!psidid) { /* node does not exist, allocate one & attempt adding */ + spin_unlock(cidlock); + npsidid = kzalloc(sizeof(struct cifs_sid_id), GFP_KERNEL); + if (!npsidid) + return -ENOMEM; + + npsidid->sidstr = kmalloc(SIDLEN, GFP_KERNEL); + if (!npsidid->sidstr) { + kfree(npsidid); + return -ENOMEM; + } + + spin_lock(cidlock); + psidid = sid_rb_search(cidtree, cid); + if (psidid) { /* node happened to get inserted meanwhile */ + ++psidid->refcount; + spin_unlock(cidlock); + kfree(npsidid->sidstr); + kfree(npsidid); + } else { + psidid = npsidid; + sid_rb_insert(cidtree, cid, &psidid, + sidtype == SIDOWNER ? "oi:" : "gi:"); + ++psidid->refcount; + spin_unlock(cidlock); + } + } else { + ++psidid->refcount; + spin_unlock(cidlock); + } + + /* + * If we are here, it is safe to access psidid and its fields + * since a reference was taken earlier while holding the spinlock. + * A reference on the node is put without holding the spinlock + * and it is OK to do so in this case, shrinker will not erase + * this node until all references are put and we do not access + * any fields of the node after a reference is put . + */ + if (test_bit(SID_ID_MAPPED, &psidid->state)) { + memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid)); + psidid->time = jiffies; /* update ts for accessing */ + goto id_sid_out; + } + + if (time_after(psidid->time + SID_MAP_RETRY, jiffies)) { + rc = -EINVAL; + goto id_sid_out; + } + + if (!test_and_set_bit(SID_ID_PENDING, &psidid->state)) { + saved_cred = override_creds(root_cred); + sidkey = request_key(&cifs_idmap_key_type, psidid->sidstr, ""); + if (IS_ERR(sidkey)) { + rc = -EINVAL; + cFYI(1, "%s: Can't map and id to a SID", __func__); + } else { + lsid = (struct cifs_sid *)sidkey->payload.data; + memcpy(&psidid->sid, lsid, + sidkey->datalen < sizeof(struct cifs_sid) ? + sidkey->datalen : sizeof(struct cifs_sid)); + memcpy(ssid, &psidid->sid, + sidkey->datalen < sizeof(struct cifs_sid) ? + sidkey->datalen : sizeof(struct cifs_sid)); + set_bit(SID_ID_MAPPED, &psidid->state); + key_put(sidkey); + kfree(psidid->sidstr); + } + psidid->time = jiffies; /* update ts for accessing */ + revert_creds(saved_cred); + clear_bit(SID_ID_PENDING, &psidid->state); + wake_up_bit(&psidid->state, SID_ID_PENDING); + } else { + rc = wait_on_bit(&psidid->state, SID_ID_PENDING, + sidid_pending_wait, TASK_INTERRUPTIBLE); + if (rc) { + cFYI(1, "%s: sidid_pending_wait interrupted %d", + __func__, rc); + --psidid->refcount; + return rc; + } + if (test_bit(SID_ID_MAPPED, &psidid->state)) + memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid)); + else + rc = -EINVAL; + } +id_sid_out: + --psidid->refcount; + return rc; +} + +static int sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, struct cifs_fattr *fattr, uint sidtype) { @@ -383,6 +565,10 @@ init_cifs_idmap(void) spin_lock_init(&sidgidlock); gidtree = RB_ROOT; + spin_lock_init(&uidsidlock); + siduidtree = RB_ROOT; + spin_lock_init(&gidsidlock); + sidgidtree = RB_ROOT; register_shrinker(&cifs_shrinker); cFYI(1, "cifs idmap keyring: %d\n", key_serial(keyring)); @@ -422,6 +608,18 @@ cifs_destroy_idmaptrees(void) while ((node = rb_first(root))) rb_erase(node, root); spin_unlock(&sidgidlock); + + root = &siduidtree; + spin_lock(&uidsidlock); + while ((node = rb_first(root))) + rb_erase(node, root); + spin_unlock(&uidsidlock); + + root = &sidgidtree; + spin_lock(&gidsidlock); + while ((node = rb_first(root))) + rb_erase(node, root); + spin_unlock(&gidsidlock); } /* if the two SIDs (roughly equivalent to a UUID for a user or group) are @@ -706,7 +904,7 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, acl_size = sizeof(struct cifs_acl); num_aces = le32_to_cpu(pdacl->num_aces); - if (num_aces > 0) { + if (num_aces > 0) { umode_t user_mask = S_IRWXU; umode_t group_mask = S_IRWXG; umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO; @@ -868,52 +1066,82 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb, else cFYI(1, "no ACL"); /* BB grant all or default perms? */ -/* cifscred->uid = owner_sid_ptr->rid; - cifscred->gid = group_sid_ptr->rid; - memcpy((void *)(&(cifscred->osid)), (void *)owner_sid_ptr, - sizeof(struct cifs_sid)); - memcpy((void *)(&(cifscred->gsid)), (void *)group_sid_ptr, - sizeof(struct cifs_sid)); */ - return rc; } - /* Convert permission bits from mode to equivalent CIFS ACL */ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, - struct inode *inode, __u64 nmode) + __u32 secdesclen, __u64 nmode, uid_t uid, gid_t gid, int *aclflag) { int rc = 0; __u32 dacloffset; __u32 ndacloffset; __u32 sidsoffset; struct cifs_sid *owner_sid_ptr, *group_sid_ptr; + struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr; struct cifs_acl *dacl_ptr = NULL; /* no need for SACL ptr */ struct cifs_acl *ndacl_ptr = NULL; /* no need for SACL ptr */ - if ((inode == NULL) || (pntsd == NULL) || (pnntsd == NULL)) - return -EIO; - - owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + + if (nmode != NO_CHANGE_64) { /* chmod */ + owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + le32_to_cpu(pntsd->osidoffset)); - group_sid_ptr = (struct cifs_sid *)((char *)pntsd + + group_sid_ptr = (struct cifs_sid *)((char *)pntsd + le32_to_cpu(pntsd->gsidoffset)); - - dacloffset = le32_to_cpu(pntsd->dacloffset); - dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); - - ndacloffset = sizeof(struct cifs_ntsd); - ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset); - ndacl_ptr->revision = dacl_ptr->revision; - ndacl_ptr->size = 0; - ndacl_ptr->num_aces = 0; - - rc = set_chmod_dacl(ndacl_ptr, owner_sid_ptr, group_sid_ptr, nmode); - - sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size); - - /* copy security descriptor control portion and owner and group sid */ - copy_sec_desc(pntsd, pnntsd, sidsoffset); + dacloffset = le32_to_cpu(pntsd->dacloffset); + dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); + ndacloffset = sizeof(struct cifs_ntsd); + ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset); + ndacl_ptr->revision = dacl_ptr->revision; + ndacl_ptr->size = 0; + ndacl_ptr->num_aces = 0; + + rc = set_chmod_dacl(ndacl_ptr, owner_sid_ptr, group_sid_ptr, + nmode); + sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size); + /* copy sec desc control portion & owner and group sids */ + copy_sec_desc(pntsd, pnntsd, sidsoffset); + *aclflag = CIFS_ACL_DACL; + } else { + memcpy(pnntsd, pntsd, secdesclen); + if (uid != NO_CHANGE_32) { /* chown */ + owner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + + le32_to_cpu(pnntsd->osidoffset)); + nowner_sid_ptr = kmalloc(sizeof(struct cifs_sid), + GFP_KERNEL); + if (!nowner_sid_ptr) + return -ENOMEM; + rc = id_to_sid(uid, SIDOWNER, nowner_sid_ptr); + if (rc) { + cFYI(1, "%s: Mapping error %d for owner id %d", + __func__, rc, uid); + kfree(nowner_sid_ptr); + return rc; + } + memcpy(owner_sid_ptr, nowner_sid_ptr, + sizeof(struct cifs_sid)); + kfree(nowner_sid_ptr); + *aclflag = CIFS_ACL_OWNER; + } + if (gid != NO_CHANGE_32) { /* chgrp */ + group_sid_ptr = (struct cifs_sid *)((char *)pnntsd + + le32_to_cpu(pnntsd->gsidoffset)); + ngroup_sid_ptr = kmalloc(sizeof(struct cifs_sid), + GFP_KERNEL); + if (!ngroup_sid_ptr) + return -ENOMEM; + rc = id_to_sid(gid, SIDGROUP, ngroup_sid_ptr); + if (rc) { + cFYI(1, "%s: Mapping error %d for group id %d", + __func__, rc, gid); + kfree(ngroup_sid_ptr); + return rc; + } + memcpy(group_sid_ptr, ngroup_sid_ptr, + sizeof(struct cifs_sid)); + kfree(ngroup_sid_ptr); + *aclflag = CIFS_ACL_GROUP; + } + } return rc; } @@ -945,7 +1173,7 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, { struct cifs_ntsd *pntsd = NULL; int oplock = 0; - int xid, rc; + int xid, rc, create_options = 0; __u16 fid; struct cifs_tcon *tcon; struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); @@ -956,9 +1184,12 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, tcon = tlink_tcon(tlink); xid = GetXid(); - rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, 0, - &fid, &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + + rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, + create_options, &fid, &oplock, NULL, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (!rc) { rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen); CIFSSMBClose(xid, tcon, fid); @@ -991,13 +1222,15 @@ struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, return pntsd; } -static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path, - struct cifs_ntsd *pnntsd, u32 acllen) + /* Set an ACL on the server */ +int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, + struct inode *inode, const char *path, int aclflag) { int oplock = 0; - int xid, rc; + int xid, rc, access_flags, create_options = 0; __u16 fid; struct cifs_tcon *tcon; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) @@ -1006,15 +1239,23 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path, tcon = tlink_tcon(tlink); xid = GetXid(); - rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, WRITE_DAC, 0, - &fid, &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + + if (aclflag == CIFS_ACL_OWNER || aclflag == CIFS_ACL_GROUP) + access_flags = WRITE_OWNER; + else + access_flags = WRITE_DAC; + + rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, access_flags, + create_options, &fid, &oplock, NULL, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc) { cERROR(1, "Unable to open file to set ACL"); goto out; } - rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen); + rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen, aclflag); cFYI(DBG2, "SetCIFSACL rc = %d", rc); CIFSSMBClose(xid, tcon, fid); @@ -1024,17 +1265,6 @@ out: return rc; } -/* Set an ACL on the server */ -int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, - struct inode *inode, const char *path) -{ - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - - cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode); - - return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen); -} - /* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */ int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, @@ -1066,9 +1296,12 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, } /* Convert mode bits to an ACL so we can update the ACL on the server */ -int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode) +int +id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, + uid_t uid, gid_t gid) { int rc = 0; + int aclflag = CIFS_ACL_DACL; /* default flag to set */ __u32 secdesclen = 0; struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */ struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ @@ -1098,13 +1331,15 @@ int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode) return -ENOMEM; } - rc = build_sec_desc(pntsd, pnntsd, inode, nmode); + rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid, + &aclflag); cFYI(DBG2, "build_sec_desc rc: %d", rc); if (!rc) { /* Set the security descriptor */ - rc = set_cifs_acl(pnntsd, secdesclen, inode, path); + rc = set_cifs_acl(pnntsd, secdesclen, inode, + path, aclflag); cFYI(DBG2, "set_cifs_acl rc: %d", rc); } diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 30acd22147e..5d9b9acc5fc 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -37,83 +37,8 @@ * the sequence number before this function is called. Also, this function * should be called with the server->srv_mutex held. */ -static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, - struct TCP_Server_Info *server, char *signature) -{ - int rc; - - if (cifs_pdu == NULL || signature == NULL || server == NULL) - return -EINVAL; - - if (!server->secmech.sdescmd5) { - cERROR(1, "%s: Can't generate signature\n", __func__); - return -1; - } - - rc = crypto_shash_init(&server->secmech.sdescmd5->shash); - if (rc) { - cERROR(1, "%s: Could not init md5\n", __func__); - return rc; - } - - rc = crypto_shash_update(&server->secmech.sdescmd5->shash, - server->session_key.response, server->session_key.len); - if (rc) { - cERROR(1, "%s: Could not update with response\n", __func__); - return rc; - } - - rc = crypto_shash_update(&server->secmech.sdescmd5->shash, - cifs_pdu->Protocol, be32_to_cpu(cifs_pdu->smb_buf_length)); - if (rc) { - cERROR(1, "%s: Could not update with payload\n", __func__); - return rc; - } - - rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature); - if (rc) - cERROR(1, "%s: Could not generate md5 hash\n", __func__); - - return rc; -} - -/* must be called with server->srv_mutex held */ -int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, - __u32 *pexpected_response_sequence_number) -{ - int rc = 0; - char smb_signature[20]; - - if ((cifs_pdu == NULL) || (server == NULL)) - return -EINVAL; - - if (!(cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) || - server->tcpStatus == CifsNeedNegotiate) - return rc; - - if (!server->session_estab) { - strncpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8); - return rc; - } - - cifs_pdu->Signature.Sequence.SequenceNumber = - cpu_to_le32(server->sequence_number); - cifs_pdu->Signature.Sequence.Reserved = 0; - - *pexpected_response_sequence_number = server->sequence_number++; - server->sequence_number++; - - rc = cifs_calculate_signature(cifs_pdu, server, smb_signature); - if (rc) - memset(cifs_pdu->Signature.SecuritySignature, 0, 8); - else - memcpy(cifs_pdu->Signature.SecuritySignature, smb_signature, 8); - - return rc; -} - -static int cifs_calc_signature2(const struct kvec *iov, int n_vec, - struct TCP_Server_Info *server, char *signature) +static int cifs_calc_signature(const struct kvec *iov, int n_vec, + struct TCP_Server_Info *server, char *signature) { int i; int rc; @@ -179,7 +104,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, { int rc = 0; char smb_signature[20]; - struct smb_hdr *cifs_pdu = iov[0].iov_base; + struct smb_hdr *cifs_pdu = (struct smb_hdr *)iov[0].iov_base; if ((cifs_pdu == NULL) || (server == NULL)) return -EINVAL; @@ -189,7 +114,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, return rc; if (!server->session_estab) { - strncpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8); + memcpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8); return rc; } @@ -200,7 +125,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, *pexpected_response_sequence_number = server->sequence_number++; server->sequence_number++; - rc = cifs_calc_signature2(iov, n_vec, server, smb_signature); + rc = cifs_calc_signature(iov, n_vec, server, smb_signature); if (rc) memset(cifs_pdu->Signature.SecuritySignature, 0, 8); else @@ -209,13 +134,27 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, return rc; } -int cifs_verify_signature(struct smb_hdr *cifs_pdu, +/* must be called with server->srv_mutex held */ +int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, + __u32 *pexpected_response_sequence_number) +{ + struct kvec iov; + + iov.iov_base = cifs_pdu; + iov.iov_len = be32_to_cpu(cifs_pdu->smb_buf_length) + 4; + + return cifs_sign_smb2(&iov, 1, server, + pexpected_response_sequence_number); +} + +int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov, struct TCP_Server_Info *server, __u32 expected_sequence_number) { unsigned int rc; char server_response_sig[8]; char what_we_think_sig_should_be[20]; + struct smb_hdr *cifs_pdu = (struct smb_hdr *)iov[0].iov_base; if (cifs_pdu == NULL || server == NULL) return -EINVAL; @@ -247,8 +186,8 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu, cifs_pdu->Signature.Sequence.Reserved = 0; mutex_lock(&server->srv_mutex); - rc = cifs_calculate_signature(cifs_pdu, server, - what_we_think_sig_should_be); + rc = cifs_calc_signature(iov, nr_iov, server, + what_we_think_sig_should_be); mutex_unlock(&server->srv_mutex); if (rc) @@ -265,7 +204,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu, } /* first calculate 24 bytes ntlm response and then 16 byte session key */ -int setup_ntlm_response(struct cifs_ses *ses) +int setup_ntlm_response(struct cifs_ses *ses, const struct nls_table *nls_cp) { int rc = 0; unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE; @@ -282,14 +221,14 @@ int setup_ntlm_response(struct cifs_ses *ses) ses->auth_key.len = temp_len; rc = SMBNTencrypt(ses->password, ses->server->cryptkey, - ses->auth_key.response + CIFS_SESS_KEY_SIZE); + ses->auth_key.response + CIFS_SESS_KEY_SIZE, nls_cp); if (rc) { cFYI(1, "%s Can't generate NTLM response, error: %d", __func__, rc); return rc; } - rc = E_md4hash(ses->password, temp_key); + rc = E_md4hash(ses->password, temp_key, nls_cp); if (rc) { cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); return rc; @@ -465,7 +404,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, } /* calculate md4 hash of password */ - E_md4hash(ses->password, nt_hash); + E_md4hash(ses->password, nt_hash, nls_cp); rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash, CIFS_NTHASH_SIZE); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 54b8f1e7da9..8f1fe324162 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -53,7 +53,7 @@ int cifsFYI = 0; int cifsERROR = 1; int traceSMB = 0; -unsigned int oplockEnabled = 1; +bool enable_oplocks = true; unsigned int linuxExtEnabled = 1; unsigned int lookupCacheEnabled = 1; unsigned int multiuser_mount = 0; @@ -74,7 +74,7 @@ module_param(cifs_min_small, int, 0); MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 " "Range: 2 to 256"); unsigned int cifs_max_pending = CIFS_MAX_REQ; -module_param(cifs_max_pending, int, 0); +module_param(cifs_max_pending, int, 0444); MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. " "Default: 50 Range: 2 to 256"); unsigned short echo_retries = 5; @@ -82,6 +82,10 @@ module_param(echo_retries, ushort, 0644); MODULE_PARM_DESC(echo_retries, "Number of echo attempts before giving up and " "reconnecting server. Default: 5. 0 means " "never reconnect."); +module_param(enable_oplocks, bool, 0644); +MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks (bool). Default:" + "y/Y/1"); + extern mempool_t *cifs_sm_req_poolp; extern mempool_t *cifs_req_poolp; extern mempool_t *cifs_mid_poolp; @@ -132,12 +136,12 @@ cifs_read_super(struct super_block *sb) else sb->s_d_op = &cifs_dentry_ops; -#ifdef CIFS_NFSD_EXPORT +#ifdef CONFIG_CIFS_NFSD_EXPORT if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { cFYI(1, "export ops supported"); sb->s_export_op = &cifs_export_ops; } -#endif /* CIFS_NFSD_EXPORT */ +#endif /* CONFIG_CIFS_NFSD_EXPORT */ return 0; @@ -432,6 +436,12 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m) seq_printf(s, ",mfsymlinks"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) seq_printf(s, ",fsc"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC) + seq_printf(s, ",nostrictsync"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) + seq_printf(s, ",noperm"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) + seq_printf(s, ",strictcache"); seq_printf(s, ",rsize=%d", cifs_sb->rsize); seq_printf(s, ",wsize=%d", cifs_sb->wsize); @@ -530,7 +540,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) char *full_path = NULL; char *s, *p; char sep; - int xid; full_path = cifs_build_path_to_root(vol, cifs_sb, cifs_sb_master_tcon(cifs_sb)); @@ -539,7 +548,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) cFYI(1, "Get root dentry for %s", full_path); - xid = GetXid(); sep = CIFS_DIR_SEP(cifs_sb); dentry = dget(sb->s_root); p = s = full_path; @@ -570,7 +578,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) dput(dentry); dentry = child; } while (!IS_ERR(dentry)); - _FreeXid(xid); kfree(full_path); return dentry; } @@ -723,7 +730,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin) if (rc < 0) return (loff_t)rc; } - return generic_file_llseek_unlocked(file, offset, origin); + return generic_file_llseek(file, offset, origin); } static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) @@ -942,7 +949,8 @@ cifs_init_once(void *inode) struct cifsInodeInfo *cifsi = inode; inode_init_once(&cifsi->vfs_inode); - INIT_LIST_HEAD(&cifsi->lockList); + INIT_LIST_HEAD(&cifsi->llist); + mutex_init(&cifsi->lock_mutex); } static int diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 95da8027983..30ff56005d8 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -121,9 +121,9 @@ extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t); extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); -#ifdef CIFS_NFSD_EXPORT +#ifdef CONFIG_CIFS_NFSD_EXPORT extern const struct export_operations cifs_export_ops; -#endif /* CIFS_NFSD_EXPORT */ +#endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "1.75" +#define CIFS_VERSION "1.76" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 95dad9d14cf..8238aa13e01 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -167,6 +167,8 @@ struct smb_vol { uid_t cred_uid; uid_t linux_uid; gid_t linux_gid; + uid_t backupuid; + gid_t backupgid; mode_t file_mode; mode_t dir_mode; unsigned secFlg; @@ -179,6 +181,8 @@ struct smb_vol { bool noperm:1; bool no_psx_acl:1; /* set if posix acl support should be disabled */ bool cifs_acl:1; + bool backupuid_specified; /* mount option backupuid is specified */ + bool backupgid_specified; /* mount option backupgid is specified */ bool no_xattr:1; /* set if xattr (EA) support should be disabled*/ bool server_ino:1; /* use inode numbers from server ie UniqueId */ bool direct_io:1; @@ -219,7 +223,8 @@ struct smb_vol { CIFS_MOUNT_OVERR_GID | CIFS_MOUNT_DYNPERM | \ CIFS_MOUNT_NOPOSIXBRL | CIFS_MOUNT_NOSSYNC | \ CIFS_MOUNT_FSCACHE | CIFS_MOUNT_MF_SYMLINKS | \ - CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO) + CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO | \ + CIFS_MOUNT_CIFS_BACKUPUID | CIFS_MOUNT_CIFS_BACKUPGID) #define CIFS_MS_MASK (MS_RDONLY | MS_MANDLOCK | MS_NOEXEC | MS_NOSUID | \ MS_NODEV | MS_SYNCHRONOUS) @@ -286,7 +291,13 @@ struct TCP_Server_Info { bool sec_kerberosu2u; /* supports U2U Kerberos */ bool sec_kerberos; /* supports plain Kerberos */ bool sec_mskerberos; /* supports legacy MS Kerberos */ + bool large_buf; /* is current buffer large? */ struct delayed_work echo; /* echo ping workqueue job */ + struct kvec *iov; /* reusable kvec array for receives */ + unsigned int nr_iov; /* number of kvecs in array */ + char *smallbuf; /* pointer to current "small" buffer */ + char *bigbuf; /* pointer to current "big" buffer */ + unsigned int total_read; /* total amount of data read in this pass */ #ifdef CONFIG_CIFS_FSCACHE struct fscache_cookie *fscache; /* client index cache cookie */ #endif @@ -485,9 +496,13 @@ extern struct cifs_tcon *cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb); */ struct cifsLockInfo { struct list_head llist; /* pointer to next cifsLockInfo */ + struct list_head blist; /* pointer to locks blocked on this */ + wait_queue_head_t block_q; __u64 offset; __u64 length; + __u32 pid; __u8 type; + __u16 netfid; }; /* @@ -520,8 +535,6 @@ struct cifsFileInfo { struct dentry *dentry; unsigned int f_flags; struct tcon_link *tlink; - struct mutex lock_mutex; - struct list_head llist; /* list of byte range locks we have. */ bool invalidHandle:1; /* file closed via session abend */ bool oplock_break_cancelled:1; int count; /* refcount protected by cifs_file_list_lock */ @@ -554,7 +567,9 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file); */ struct cifsInodeInfo { - struct list_head lockList; + struct list_head llist; /* brlocks for this inode */ + bool can_cache_brlcks; + struct mutex lock_mutex; /* protect two fields above */ /* BB add in lists for dirty pages i.e. write caching info for oplock */ struct list_head openFileList; __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */ @@ -643,8 +658,24 @@ static inline void cifs_stats_bytes_read(struct cifs_tcon *tcon, struct mid_q_entry; /* - * This is the prototype for the mid callback function. When creating one, - * take special care to avoid deadlocks. Things to bear in mind: + * This is the prototype for the mid receive function. This function is for + * receiving the rest of the SMB frame, starting with the WordCount (which is + * just after the MID in struct smb_hdr). Note: + * + * - This will be called by cifsd, with no locks held. + * - The mid will still be on the pending_mid_q. + * - mid->resp_buf will point to the current buffer. + * + * Returns zero on a successful receive, or an error. The receive state in + * the TCP_Server_Info will also be updated. + */ +typedef int (mid_receive_t)(struct TCP_Server_Info *server, + struct mid_q_entry *mid); + +/* + * This is the prototype for the mid callback function. This is called once the + * mid has been received off of the socket. When creating one, take special + * care to avoid deadlocks. Things to bear in mind: * * - it will be called by cifsd, with no locks held * - the mid will be removed from any lists @@ -662,9 +693,10 @@ struct mid_q_entry { unsigned long when_sent; /* time when smb send finished */ unsigned long when_received; /* when demux complete (taken off wire) */ #endif + mid_receive_t *receive; /* call receive callback */ mid_callback_t *callback; /* call completion callback */ void *callback_data; /* general purpose pointer for callback */ - struct smb_hdr *resp_buf; /* response buffer */ + struct smb_hdr *resp_buf; /* pointer to received SMB header */ int midState; /* wish this were enum but can not pass to wait_event */ __u8 command; /* smb command code */ bool largeBuf:1; /* if valid response, is pointer to large buf */ @@ -964,7 +996,8 @@ GLOBAL_EXTERN unsigned int multiuser_mount; /* if enabled allows new sessions to be established on existing mount if we have the uid/password or Kerberos credential or equivalent for current user */ -GLOBAL_EXTERN unsigned int oplockEnabled; +/* enable or disable oplocks */ +GLOBAL_EXTERN bool enable_oplocks; GLOBAL_EXTERN unsigned int lookupCacheEnabled; GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent with more secure ntlmssp2 challenge/resp */ @@ -978,10 +1011,16 @@ GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/ /* reconnect after this many failed echo attempts */ GLOBAL_EXTERN unsigned short echo_retries; +#ifdef CONFIG_CIFS_ACL GLOBAL_EXTERN struct rb_root uidtree; GLOBAL_EXTERN struct rb_root gidtree; GLOBAL_EXTERN spinlock_t siduidlock; GLOBAL_EXTERN spinlock_t sidgidlock; +GLOBAL_EXTERN struct rb_root siduidtree; +GLOBAL_EXTERN struct rb_root sidgidtree; +GLOBAL_EXTERN spinlock_t uidsidlock; +GLOBAL_EXTERN spinlock_t gidsidlock; +#endif /* CONFIG_CIFS_ACL */ void cifs_oplock_break(struct work_struct *work); diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index de3aa285de0..3fb03e2c8e8 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -1089,9 +1089,7 @@ typedef struct smb_com_read_rsp { __le16 DataLengthHigh; __u64 Reserved2; __u16 ByteCount; - __u8 Pad; /* BB check for whether padded to DWORD - boundary and optimum performance here */ - char Data[1]; + /* read response data immediately follows */ } __attribute__((packed)) READ_RSP; typedef struct locking_andx_range { @@ -1913,6 +1911,10 @@ typedef struct whoami_rsp_data { /* Query level 0x202 */ /* SETFSInfo Levels */ #define SMB_SET_CIFS_UNIX_INFO 0x200 +/* level 0x203 is defined above in list of QFS info levels */ +/* #define SMB_REQUEST_TRANSPORT_ENCRYPTION 0x203 */ + +/* Level 0x200 request structure follows */ typedef struct smb_com_transaction2_setfsi_req { struct smb_hdr hdr; /* wct = 15 */ __le16 TotalParameterCount; @@ -1940,13 +1942,39 @@ typedef struct smb_com_transaction2_setfsi_req { __le64 ClientUnixCap; /* Data end */ } __attribute__((packed)) TRANSACTION2_SETFSI_REQ; +/* level 0x203 request structure follows */ +typedef struct smb_com_transaction2_setfs_enc_req { + struct smb_hdr hdr; /* wct = 15 */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __le16 MaxParameterCount; + __le16 MaxDataCount; + __u8 MaxSetupCount; + __u8 Reserved; + __le16 Flags; + __le32 Timeout; + __u16 Reserved2; + __le16 ParameterCount; /* 4 */ + __le16 ParameterOffset; + __le16 DataCount; /* 12 */ + __le16 DataOffset; + __u8 SetupCount; /* one */ + __u8 Reserved3; + __le16 SubCommand; /* TRANS2_SET_FS_INFORMATION */ + __le16 ByteCount; + __u8 Pad; + __u16 Reserved4; /* Parameters start. */ + __le16 InformationLevel;/* Parameters end. */ + /* NTLMSSP Blob, Data start. */ +} __attribute__((packed)) TRANSACTION2_SETFSI_ENC_REQ; + +/* response for setfsinfo levels 0x200 and 0x203 */ typedef struct smb_com_transaction2_setfsi_rsp { struct smb_hdr hdr; /* wct = 10 */ struct trans2_resp t2; __u16 ByteCount; } __attribute__((packed)) TRANSACTION2_SETFSI_RSP; - typedef struct smb_com_transaction2_get_dfs_refer_req { struct smb_hdr hdr; /* wct = 15 */ __le16 TotalParameterCount; @@ -2098,13 +2126,13 @@ typedef struct { #define CIFS_UNIX_PROXY_CAP 0x00000400 /* Proxy cap: 0xACE ioctl and QFS PROXY call */ #ifdef CONFIG_CIFS_POSIX -/* Can not set pathnames cap yet until we send new posix create SMB since - otherwise server can treat such handles opened with older ntcreatex - (by a new client which knows how to send posix path ops) - as non-posix handles (can affect write behavior with byte range locks. - We can add back in POSIX_PATH_OPS cap when Posix Create/Mkdir finished */ +/* presumably don't need the 0x20 POSIX_PATH_OPS_CAP since we never send + LockingX instead of posix locking call on unix sess (and we do not expect + LockingX to use different (ie Windows) semantics than posix locking on + the same session (if WINE needs to do this later, we can add this cap + back in later */ /* #define CIFS_UNIX_CAP_MASK 0x000000fb */ -#define CIFS_UNIX_CAP_MASK 0x000000db +#define CIFS_UNIX_CAP_MASK 0x000003db #else #define CIFS_UNIX_CAP_MASK 0x00000013 #endif /* CONFIG_CIFS_POSIX */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 8df28e925e5..6f4e243e0f6 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -69,8 +69,9 @@ extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server); extern void DeleteMidQEntry(struct mid_q_entry *midEntry); extern int cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, - unsigned int nvec, mid_callback_t *callback, - void *cbdata, bool ignore_pend); + unsigned int nvec, mid_receive_t *receive, + mid_callback_t *callback, void *cbdata, + bool ignore_pend); extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *, struct smb_hdr * /* input */ , struct smb_hdr * /* out */ , @@ -90,6 +91,7 @@ extern int SendReceiveBlockingLock(const unsigned int xid, extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length); extern bool is_valid_oplock_break(struct smb_hdr *smb, struct TCP_Server_Info *); +extern bool backup_cred(struct cifs_sb_info *); extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, unsigned int bytes_written); @@ -145,12 +147,19 @@ extern int cifs_get_inode_info_unix(struct inode **pinode, extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, struct inode *inode, const char *path, const __u16 *pfid); -extern int mode_to_cifs_acl(struct inode *inode, const char *path, __u64); +extern int id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64, + uid_t, gid_t); extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *, const char *, u32 *); extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, - const char *); + const char *, int); +extern void dequeue_mid(struct mid_q_entry *mid, bool malformed); +extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, + unsigned int to_read); +extern int cifs_readv_from_socket(struct TCP_Server_Info *server, + struct kvec *iov_orig, unsigned int nr_segs, + unsigned int to_read); extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, struct cifs_sb_info *cifs_sb); extern int cifs_match_super(struct super_block *, void *); @@ -359,14 +368,17 @@ extern int CIFSGetSrvInodeNumber(const int xid, struct cifs_tcon *tcon, const struct nls_table *nls_codepage, int remap_special_chars); +extern int cifs_lockv(const int xid, struct cifs_tcon *tcon, const __u16 netfid, + const __u8 lock_type, const __u32 num_unlock, + const __u32 num_lock, LOCKING_ANDX_RANGE *buf); extern int CIFSSMBLock(const int xid, struct cifs_tcon *tcon, - const __u16 netfid, const __u64 len, + const __u16 netfid, const __u32 netpid, const __u64 len, const __u64 offset, const __u32 numUnlock, const __u32 numLock, const __u8 lockType, const bool waitFlag, const __u8 oplock_level); extern int CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon, - const __u16 smb_file_id, const int get_flag, - const __u64 len, struct file_lock *, + const __u16 smb_file_id, const __u32 netpid, + const int get_flag, const __u64 len, struct file_lock *, const __u16 lock_type, const bool waitFlag); extern int CIFSSMBTDis(const int xid, struct cifs_tcon *tcon); extern int CIFSSMBEcho(struct TCP_Server_Info *server); @@ -380,11 +392,12 @@ extern void tconInfoFree(struct cifs_tcon *); extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *); extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *, __u32 *); -extern int cifs_verify_signature(struct smb_hdr *, +extern int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov, struct TCP_Server_Info *server, __u32 expected_sequence_number); -extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *); -extern int setup_ntlm_response(struct cifs_ses *); +extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *, + const struct nls_table *); +extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *); extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *); extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *); extern void cifs_crypto_shash_release(struct TCP_Server_Info *); @@ -419,7 +432,7 @@ extern int CIFSSMBSetEA(const int xid, struct cifs_tcon *tcon, extern int CIFSSMBGetCIFSACL(const int xid, struct cifs_tcon *tcon, __u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen); extern int CIFSSMBSetCIFSACL(const int, struct cifs_tcon *, __u16, - struct cifs_ntsd *, __u32); + struct cifs_ntsd *, __u32, int); extern int CIFSSMBGetPosixACL(const int xid, struct cifs_tcon *tcon, const unsigned char *searchName, char *acl_inf, const int buflen, const int acl_type, @@ -436,10 +449,29 @@ extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr, const unsigned char *path, struct cifs_sb_info *cifs_sb, int xid); extern int mdfour(unsigned char *, unsigned char *, int); -extern int E_md4hash(const unsigned char *passwd, unsigned char *p16); +extern int E_md4hash(const unsigned char *passwd, unsigned char *p16, + const struct nls_table *codepage); extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24); +/* asynchronous read support */ +struct cifs_readdata { + struct cifsFileInfo *cfile; + struct address_space *mapping; + __u64 offset; + unsigned int bytes; + pid_t pid; + int result; + struct list_head pages; + struct work_struct work; + unsigned int nr_iov; + struct kvec iov[1]; +}; + +struct cifs_readdata *cifs_readdata_alloc(unsigned int nr_pages); +void cifs_readdata_free(struct cifs_readdata *rdata); +int cifs_async_readv(struct cifs_readdata *rdata); + /* asynchronous write support */ struct cifs_writedata { struct kref refcount; diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index a80f7bd97b9..6600aa2d2ef 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -33,6 +33,8 @@ #include <linux/slab.h> #include <linux/posix_acl_xattr.h> #include <linux/pagemap.h> +#include <linux/swap.h> +#include <linux/task_io_accounting_ops.h> #include <asm/uaccess.h> #include "cifspdu.h" #include "cifsglob.h" @@ -40,6 +42,7 @@ #include "cifsproto.h" #include "cifs_unicode.h" #include "cifs_debug.h" +#include "fscache.h" #ifdef CONFIG_CIFS_POSIX static struct { @@ -83,6 +86,9 @@ static struct { #endif /* CONFIG_CIFS_WEAK_PW_HASH */ #endif /* CIFS_POSIX */ +/* Forward declarations */ +static void cifs_readv_complete(struct work_struct *work); + /* Mark as invalid, all open files on tree connections since they were closed when session to server was lost */ static void mark_open_files_invalid(struct cifs_tcon *pTcon) @@ -453,8 +459,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses) } server->sec_mode = (__u8)le16_to_cpu(rsp->SecurityMode); server->maxReq = le16_to_cpu(rsp->MaxMpxCount); - server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize), - (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); + server->maxBuf = le16_to_cpu(rsp->MaxBufSize); server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs); /* even though we do not use raw we might as well set this accurately, in case we ever find a need for it */ @@ -561,8 +566,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses) little endian */ server->maxReq = le16_to_cpu(pSMBr->MaxMpxCount); /* probably no need to store and check maxvcs */ - server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize), - (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); + server->maxBuf = le32_to_cpu(pSMBr->MaxBufferSize); server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); cFYI(DBG2, "Max buf = %d", ses->server->maxBuf); server->capabilities = le32_to_cpu(pSMBr->Capabilities); @@ -739,7 +743,8 @@ CIFSSMBEcho(struct TCP_Server_Info *server) iov.iov_base = smb; iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4; - rc = cifs_call_async(server, &iov, 1, cifs_echo_callback, server, true); + rc = cifs_call_async(server, &iov, 1, NULL, cifs_echo_callback, + server, true); if (rc) cFYI(1, "Echo request failed: %d", rc); @@ -1376,6 +1381,359 @@ openRetry: return rc; } +struct cifs_readdata * +cifs_readdata_alloc(unsigned int nr_pages) +{ + struct cifs_readdata *rdata; + + /* readdata + 1 kvec for each page */ + rdata = kzalloc(sizeof(*rdata) + + sizeof(struct kvec) * nr_pages, GFP_KERNEL); + if (rdata != NULL) { + INIT_WORK(&rdata->work, cifs_readv_complete); + INIT_LIST_HEAD(&rdata->pages); + } + return rdata; +} + +void +cifs_readdata_free(struct cifs_readdata *rdata) +{ + cifsFileInfo_put(rdata->cfile); + kfree(rdata); +} + +/* + * Discard any remaining data in the current SMB. To do this, we borrow the + * current bigbuf. + */ +static int +cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) +{ + READ_RSP *rsp = (READ_RSP *)server->smallbuf; + unsigned int rfclen = be32_to_cpu(rsp->hdr.smb_buf_length); + int remaining = rfclen + 4 - server->total_read; + struct cifs_readdata *rdata = mid->callback_data; + + while (remaining > 0) { + int length; + + length = cifs_read_from_socket(server, server->bigbuf, + min_t(unsigned int, remaining, + CIFSMaxBufSize + MAX_CIFS_HDR_SIZE)); + if (length < 0) + return length; + server->total_read += length; + remaining -= length; + } + + dequeue_mid(mid, rdata->result); + return 0; +} + +static int +cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) +{ + int length, len; + unsigned int data_offset, remaining, data_len; + struct cifs_readdata *rdata = mid->callback_data; + READ_RSP *rsp = (READ_RSP *)server->smallbuf; + unsigned int rfclen = be32_to_cpu(rsp->hdr.smb_buf_length) + 4; + u64 eof; + pgoff_t eof_index; + struct page *page, *tpage; + + cFYI(1, "%s: mid=%u offset=%llu bytes=%u", __func__, + mid->mid, rdata->offset, rdata->bytes); + + /* + * read the rest of READ_RSP header (sans Data array), or whatever we + * can if there's not enough data. At this point, we've read down to + * the Mid. + */ + len = min_t(unsigned int, rfclen, sizeof(*rsp)) - + sizeof(struct smb_hdr) + 1; + + rdata->iov[0].iov_base = server->smallbuf + sizeof(struct smb_hdr) - 1; + rdata->iov[0].iov_len = len; + + length = cifs_readv_from_socket(server, rdata->iov, 1, len); + if (length < 0) + return length; + server->total_read += length; + + /* Was the SMB read successful? */ + rdata->result = map_smb_to_linux_error(&rsp->hdr, false); + if (rdata->result != 0) { + cFYI(1, "%s: server returned error %d", __func__, + rdata->result); + return cifs_readv_discard(server, mid); + } + + /* Is there enough to get to the rest of the READ_RSP header? */ + if (server->total_read < sizeof(READ_RSP)) { + cFYI(1, "%s: server returned short header. got=%u expected=%zu", + __func__, server->total_read, sizeof(READ_RSP)); + rdata->result = -EIO; + return cifs_readv_discard(server, mid); + } + + data_offset = le16_to_cpu(rsp->DataOffset) + 4; + if (data_offset < server->total_read) { + /* + * win2k8 sometimes sends an offset of 0 when the read + * is beyond the EOF. Treat it as if the data starts just after + * the header. + */ + cFYI(1, "%s: data offset (%u) inside read response header", + __func__, data_offset); + data_offset = server->total_read; + } else if (data_offset > MAX_CIFS_SMALL_BUFFER_SIZE) { + /* data_offset is beyond the end of smallbuf */ + cFYI(1, "%s: data offset (%u) beyond end of smallbuf", + __func__, data_offset); + rdata->result = -EIO; + return cifs_readv_discard(server, mid); + } + + cFYI(1, "%s: total_read=%u data_offset=%u", __func__, + server->total_read, data_offset); + + len = data_offset - server->total_read; + if (len > 0) { + /* read any junk before data into the rest of smallbuf */ + rdata->iov[0].iov_base = server->smallbuf + server->total_read; + rdata->iov[0].iov_len = len; + length = cifs_readv_from_socket(server, rdata->iov, 1, len); + if (length < 0) + return length; + server->total_read += length; + } + + /* set up first iov for signature check */ + rdata->iov[0].iov_base = server->smallbuf; + rdata->iov[0].iov_len = server->total_read; + cFYI(1, "0: iov_base=%p iov_len=%zu", + rdata->iov[0].iov_base, rdata->iov[0].iov_len); + + /* how much data is in the response? */ + data_len = le16_to_cpu(rsp->DataLengthHigh) << 16; + data_len += le16_to_cpu(rsp->DataLength); + if (data_offset + data_len > rfclen) { + /* data_len is corrupt -- discard frame */ + rdata->result = -EIO; + return cifs_readv_discard(server, mid); + } + + /* marshal up the page array */ + len = 0; + remaining = data_len; + rdata->nr_iov = 1; + + /* determine the eof that the server (probably) has */ + eof = CIFS_I(rdata->mapping->host)->server_eof; + eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0; + cFYI(1, "eof=%llu eof_index=%lu", eof, eof_index); + + list_for_each_entry_safe(page, tpage, &rdata->pages, lru) { + if (remaining >= PAGE_CACHE_SIZE) { + /* enough data to fill the page */ + rdata->iov[rdata->nr_iov].iov_base = kmap(page); + rdata->iov[rdata->nr_iov].iov_len = PAGE_CACHE_SIZE; + cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu", + rdata->nr_iov, page->index, + rdata->iov[rdata->nr_iov].iov_base, + rdata->iov[rdata->nr_iov].iov_len); + ++rdata->nr_iov; + len += PAGE_CACHE_SIZE; + remaining -= PAGE_CACHE_SIZE; + } else if (remaining > 0) { + /* enough for partial page, fill and zero the rest */ + rdata->iov[rdata->nr_iov].iov_base = kmap(page); + rdata->iov[rdata->nr_iov].iov_len = remaining; + cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu", + rdata->nr_iov, page->index, + rdata->iov[rdata->nr_iov].iov_base, + rdata->iov[rdata->nr_iov].iov_len); + memset(rdata->iov[rdata->nr_iov].iov_base + remaining, + '\0', PAGE_CACHE_SIZE - remaining); + ++rdata->nr_iov; + len += remaining; + remaining = 0; + } else if (page->index > eof_index) { + /* + * The VFS will not try to do readahead past the + * i_size, but it's possible that we have outstanding + * writes with gaps in the middle and the i_size hasn't + * caught up yet. Populate those with zeroed out pages + * to prevent the VFS from repeatedly attempting to + * fill them until the writes are flushed. + */ + zero_user(page, 0, PAGE_CACHE_SIZE); + list_del(&page->lru); + lru_cache_add_file(page); + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); + } else { + /* no need to hold page hostage */ + list_del(&page->lru); + lru_cache_add_file(page); + unlock_page(page); + page_cache_release(page); + } + } + + /* issue the read if we have any iovecs left to fill */ + if (rdata->nr_iov > 1) { + length = cifs_readv_from_socket(server, &rdata->iov[1], + rdata->nr_iov - 1, len); + if (length < 0) + return length; + server->total_read += length; + } else { + length = 0; + } + + rdata->bytes = length; + + cFYI(1, "total_read=%u rfclen=%u remaining=%u", server->total_read, + rfclen, remaining); + + /* discard anything left over */ + if (server->total_read < rfclen) + return cifs_readv_discard(server, mid); + + dequeue_mid(mid, false); + return length; +} + +static void +cifs_readv_complete(struct work_struct *work) +{ + struct cifs_readdata *rdata = container_of(work, + struct cifs_readdata, work); + struct page *page, *tpage; + + list_for_each_entry_safe(page, tpage, &rdata->pages, lru) { + list_del(&page->lru); + lru_cache_add_file(page); + + if (rdata->result == 0) { + kunmap(page); + flush_dcache_page(page); + SetPageUptodate(page); + } + + unlock_page(page); + + if (rdata->result == 0) + cifs_readpage_to_fscache(rdata->mapping->host, page); + + page_cache_release(page); + } + cifs_readdata_free(rdata); +} + +static void +cifs_readv_callback(struct mid_q_entry *mid) +{ + struct cifs_readdata *rdata = mid->callback_data; + struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; + + cFYI(1, "%s: mid=%u state=%d result=%d bytes=%u", __func__, + mid->mid, mid->midState, rdata->result, rdata->bytes); + + switch (mid->midState) { + case MID_RESPONSE_RECEIVED: + /* result already set, check signature */ + if (server->sec_mode & + (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { + if (cifs_verify_signature(rdata->iov, rdata->nr_iov, + server, mid->sequence_number + 1)) + cERROR(1, "Unexpected SMB signature"); + } + /* FIXME: should this be counted toward the initiating task? */ + task_io_account_read(rdata->bytes); + cifs_stats_bytes_read(tcon, rdata->bytes); + break; + case MID_REQUEST_SUBMITTED: + case MID_RETRY_NEEDED: + rdata->result = -EAGAIN; + break; + default: + rdata->result = -EIO; + } + + queue_work(system_nrt_wq, &rdata->work); + DeleteMidQEntry(mid); + atomic_dec(&server->inFlight); + wake_up(&server->request_q); +} + +/* cifs_async_readv - send an async write, and set up mid to handle result */ +int +cifs_async_readv(struct cifs_readdata *rdata) +{ + int rc; + READ_REQ *smb = NULL; + int wct; + struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink); + + cFYI(1, "%s: offset=%llu bytes=%u", __func__, + rdata->offset, rdata->bytes); + + if (tcon->ses->capabilities & CAP_LARGE_FILES) + wct = 12; + else { + wct = 10; /* old style read */ + if ((rdata->offset >> 32) > 0) { + /* can not handle this big offset for old */ + return -EIO; + } + } + + rc = small_smb_init(SMB_COM_READ_ANDX, wct, tcon, (void **)&smb); + if (rc) + return rc; + + smb->hdr.Pid = cpu_to_le16((__u16)rdata->pid); + smb->hdr.PidHigh = cpu_to_le16((__u16)(rdata->pid >> 16)); + + smb->AndXCommand = 0xFF; /* none */ + smb->Fid = rdata->cfile->netfid; + smb->OffsetLow = cpu_to_le32(rdata->offset & 0xFFFFFFFF); + if (wct == 12) + smb->OffsetHigh = cpu_to_le32(rdata->offset >> 32); + smb->Remaining = 0; + smb->MaxCount = cpu_to_le16(rdata->bytes & 0xFFFF); + smb->MaxCountHigh = cpu_to_le32(rdata->bytes >> 16); + if (wct == 12) + smb->ByteCount = 0; + else { + /* old style read */ + struct smb_com_readx_req *smbr = + (struct smb_com_readx_req *)smb; + smbr->ByteCount = 0; + } + + /* 4 for RFC1001 length + 1 for BCC */ + rdata->iov[0].iov_base = smb; + rdata->iov[0].iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4; + + rc = cifs_call_async(tcon->ses->server, rdata->iov, 1, + cifs_readv_receive, cifs_readv_callback, + rdata, false); + + if (rc == 0) + cifs_stats_inc(&tcon->num_reads); + + cifs_small_buf_release(smb); + return rc; +} + int CIFSSMBRead(const int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes, char **buf, int *pbuf_type) @@ -1836,7 +2194,7 @@ cifs_async_writev(struct cifs_writedata *wdata) kref_get(&wdata->refcount); rc = cifs_call_async(tcon->ses->server, iov, wdata->nr_pages + 1, - cifs_writev_callback, wdata, false); + NULL, cifs_writev_callback, wdata, false); if (rc == 0) cifs_stats_inc(&tcon->num_writes); @@ -1962,10 +2320,50 @@ CIFSSMBWrite2(const int xid, struct cifs_io_parms *io_parms, return rc; } +int cifs_lockv(const int xid, struct cifs_tcon *tcon, const __u16 netfid, + const __u8 lock_type, const __u32 num_unlock, + const __u32 num_lock, LOCKING_ANDX_RANGE *buf) +{ + int rc = 0; + LOCK_REQ *pSMB = NULL; + struct kvec iov[2]; + int resp_buf_type; + __u16 count; + + cFYI(1, "cifs_lockv num lock %d num unlock %d", num_lock, num_unlock); + + rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB); + if (rc) + return rc; + + pSMB->Timeout = 0; + pSMB->NumberOfLocks = cpu_to_le16(num_lock); + pSMB->NumberOfUnlocks = cpu_to_le16(num_unlock); + pSMB->LockType = lock_type; + pSMB->AndXCommand = 0xFF; /* none */ + pSMB->Fid = netfid; /* netfid stays le */ + + count = (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE); + inc_rfc1001_len(pSMB, count); + pSMB->ByteCount = cpu_to_le16(count); + + iov[0].iov_base = (char *)pSMB; + iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4 - + (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE); + iov[1].iov_base = (char *)buf; + iov[1].iov_len = (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE); + + cifs_stats_inc(&tcon->num_locks); + rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, CIFS_NO_RESP); + if (rc) + cFYI(1, "Send error in cifs_lockv = %d", rc); + + return rc; +} int CIFSSMBLock(const int xid, struct cifs_tcon *tcon, - const __u16 smb_file_id, const __u64 len, + const __u16 smb_file_id, const __u32 netpid, const __u64 len, const __u64 offset, const __u32 numUnlock, const __u32 numLock, const __u8 lockType, const bool waitFlag, const __u8 oplock_level) @@ -2001,7 +2399,7 @@ CIFSSMBLock(const int xid, struct cifs_tcon *tcon, pSMB->Fid = smb_file_id; /* netfid stays le */ if ((numLock != 0) || (numUnlock != 0)) { - pSMB->Locks[0].Pid = cpu_to_le16(current->tgid); + pSMB->Locks[0].Pid = cpu_to_le16(netpid); /* BB where to store pid high? */ pSMB->Locks[0].LengthLow = cpu_to_le32((u32)len); pSMB->Locks[0].LengthHigh = cpu_to_le32((u32)(len>>32)); @@ -2035,9 +2433,9 @@ CIFSSMBLock(const int xid, struct cifs_tcon *tcon, int CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon, - const __u16 smb_file_id, const int get_flag, const __u64 len, - struct file_lock *pLockData, const __u16 lock_type, - const bool waitFlag) + const __u16 smb_file_id, const __u32 netpid, const int get_flag, + const __u64 len, struct file_lock *pLockData, + const __u16 lock_type, const bool waitFlag) { struct smb_com_transaction2_sfi_req *pSMB = NULL; struct smb_com_transaction2_sfi_rsp *pSMBr = NULL; @@ -2095,7 +2493,7 @@ CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon, } else pSMB->Timeout = 0; - parm_data->pid = cpu_to_le32(current->tgid); + parm_data->pid = cpu_to_le32(netpid); parm_data->start = cpu_to_le64(pLockData->fl_start); parm_data->length = cpu_to_le64(len); /* normalize negative numbers */ @@ -2812,8 +3210,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifs_tcon *tcon, pSMB->TotalDataCount = 0; pSMB->MaxParameterCount = cpu_to_le32(2); /* BB find exact data count max from sess structure BB */ - pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - - MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); + pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00); pSMB->MaxSetupCount = 4; pSMB->Reserved = 0; pSMB->ParameterOffset = 0; @@ -3306,8 +3703,7 @@ smb_init_nttransact(const __u16 sub_command, const int setup_count, pSMB->Reserved = 0; pSMB->TotalParameterCount = cpu_to_le32(parm_len); pSMB->TotalDataCount = 0; - pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - - MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); + pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00); pSMB->ParameterCount = pSMB->TotalParameterCount; pSMB->DataCount = pSMB->TotalDataCount; temp_offset = offsetof(struct smb_com_ntransact_req, Parms) + @@ -3467,7 +3863,7 @@ qsec_out: int CIFSSMBSetCIFSACL(const int xid, struct cifs_tcon *tcon, __u16 fid, - struct cifs_ntsd *pntsd, __u32 acllen) + struct cifs_ntsd *pntsd, __u32 acllen, int aclflag) { __u16 byte_count, param_count, data_count, param_offset, data_offset; int rc = 0; @@ -3504,7 +3900,7 @@ setCifsAclRetry: pSMB->Fid = fid; /* file handle always le */ pSMB->Reserved2 = 0; - pSMB->AclFlags = cpu_to_le32(CIFS_ACL_DACL); + pSMB->AclFlags = cpu_to_le32(aclflag); if (pntsd && acllen) { memcpy((char *) &pSMBr->hdr.Protocol + data_offset, @@ -3977,8 +4373,7 @@ findFirstRetry: params = 12 + name_len /* includes null */ ; pSMB->TotalDataCount = 0; /* no EAs */ pSMB->MaxParameterCount = cpu_to_le16(10); - pSMB->MaxDataCount = cpu_to_le16((tcon->ses->server->maxBuf - - MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); + pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize & 0xFFFFFF00); pSMB->MaxSetupCount = 0; pSMB->Reserved = 0; pSMB->Flags = 0; @@ -4052,8 +4447,7 @@ findFirstRetry: psrch_inf->index_of_last_entry = 2 /* skip . and .. */ + psrch_inf->entries_in_buffer; lnoff = le16_to_cpu(parms->LastNameOffset); - if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE < - lnoff) { + if (CIFSMaxBufSize < lnoff) { cERROR(1, "ignoring corrupt resume name"); psrch_inf->last_entry = NULL; return rc; @@ -4097,9 +4491,7 @@ int CIFSFindNext(const int xid, struct cifs_tcon *tcon, byte_count = 0; pSMB->TotalDataCount = 0; /* no EAs */ pSMB->MaxParameterCount = cpu_to_le16(8); - pSMB->MaxDataCount = - cpu_to_le16((tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) & - 0xFFFFFF00); + pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize & 0xFFFFFF00); pSMB->MaxSetupCount = 0; pSMB->Reserved = 0; pSMB->Flags = 0; @@ -4181,8 +4573,7 @@ int CIFSFindNext(const int xid, struct cifs_tcon *tcon, psrch_inf->index_of_last_entry += psrch_inf->entries_in_buffer; lnoff = le16_to_cpu(parms->LastNameOffset); - if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE < - lnoff) { + if (CIFSMaxBufSize < lnoff) { cERROR(1, "ignoring corrupt resume name"); psrch_inf->last_entry = NULL; return rc; @@ -5840,7 +6231,7 @@ QAllEAsRetry: if (ea_name) { if (ea_name_len == name_len && - strncmp(ea_name, temp_ptr, name_len) == 0) { + memcmp(ea_name, temp_ptr, name_len) == 0) { temp_ptr += name_len + 1; rc = value_len; if (buf_size == 0) @@ -6035,12 +6426,7 @@ int CIFSSMBNotify(const int xid, struct cifs_tcon *tcon, pSMB->TotalParameterCount = 0 ; pSMB->TotalDataCount = 0; pSMB->MaxParameterCount = cpu_to_le32(2); - /* BB find exact data count max from sess structure BB */ - pSMB->MaxDataCount = 0; /* same in little endian or be */ -/* BB VERIFY verify which is correct for above BB */ - pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - - MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); - + pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00); pSMB->MaxSetupCount = 4; pSMB->Reserved = 0; pSMB->ParameterOffset = 0; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 71beb020197..f3670cf7258 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -37,6 +37,7 @@ #include <asm/uaccess.h> #include <asm/processor.h> #include <linux/inet.h> +#include <linux/module.h> #include <net/ipv6.h> #include "cifspdu.h" #include "cifsglob.h" @@ -181,7 +182,7 @@ cifs_reconnect(struct TCP_Server_Info *server) -EINVAL = invalid transact2 */ -static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize) +static int check2ndT2(struct smb_hdr *pSMB) { struct smb_t2_rsp *pSMBt; int remaining; @@ -214,9 +215,9 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize) cFYI(1, "missing %d bytes from transact2, check next response", remaining); - if (total_data_size > maxBufSize) { + if (total_data_size > CIFSMaxBufSize) { cERROR(1, "TotalDataSize %d is over maximum buffer %d", - total_data_size, maxBufSize); + total_data_size, CIFSMaxBufSize); return -EINVAL; } return remaining; @@ -281,7 +282,7 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB) byte_count = be32_to_cpu(pTargetSMB->smb_buf_length); byte_count += total_in_buf2; /* don't allow buffer to overflow */ - if (byte_count > CIFSMaxBufSize) + if (byte_count > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) return -ENOBUFS; pTargetSMB->smb_buf_length = cpu_to_be32(byte_count); @@ -320,27 +321,24 @@ requeue_echo: } static bool -allocate_buffers(char **bigbuf, char **smallbuf, unsigned int size, - bool is_large_buf) +allocate_buffers(struct TCP_Server_Info *server) { - char *bbuf = *bigbuf, *sbuf = *smallbuf; - - if (bbuf == NULL) { - bbuf = (char *)cifs_buf_get(); - if (!bbuf) { + if (!server->bigbuf) { + server->bigbuf = (char *)cifs_buf_get(); + if (!server->bigbuf) { cERROR(1, "No memory for large SMB response"); msleep(3000); /* retry will check if exiting */ return false; } - } else if (is_large_buf) { + } else if (server->large_buf) { /* we are reusing a dirty large buf, clear its start */ - memset(bbuf, 0, size); + memset(server->bigbuf, 0, sizeof(struct smb_hdr)); } - if (sbuf == NULL) { - sbuf = (char *)cifs_small_buf_get(); - if (!sbuf) { + if (!server->smallbuf) { + server->smallbuf = (char *)cifs_small_buf_get(); + if (!server->smallbuf) { cERROR(1, "No memory for SMB response"); msleep(1000); /* retry will check if exiting */ @@ -349,36 +347,118 @@ allocate_buffers(char **bigbuf, char **smallbuf, unsigned int size, /* beginning of smb buffer is cleared in our buf_get */ } else { /* if existing small buf clear beginning */ - memset(sbuf, 0, size); + memset(server->smallbuf, 0, sizeof(struct smb_hdr)); } - *bigbuf = bbuf; - *smallbuf = sbuf; - return true; } -static int -read_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg, - struct kvec *iov, unsigned int to_read, - unsigned int *ptotal_read, bool is_header_read) +static bool +server_unresponsive(struct TCP_Server_Info *server) +{ + if (echo_retries > 0 && server->tcpStatus == CifsGood && + time_after(jiffies, server->lstrp + + (echo_retries * SMB_ECHO_INTERVAL))) { + cERROR(1, "Server %s has not responded in %d seconds. " + "Reconnecting...", server->hostname, + (echo_retries * SMB_ECHO_INTERVAL / HZ)); + cifs_reconnect(server); + wake_up(&server->response_q); + return true; + } + + return false; +} + +/* + * kvec_array_init - clone a kvec array, and advance into it + * @new: pointer to memory for cloned array + * @iov: pointer to original array + * @nr_segs: number of members in original array + * @bytes: number of bytes to advance into the cloned array + * + * This function will copy the array provided in iov to a section of memory + * and advance the specified number of bytes into the new array. It returns + * the number of segments in the new array. "new" must be at least as big as + * the original iov array. + */ +static unsigned int +kvec_array_init(struct kvec *new, struct kvec *iov, unsigned int nr_segs, + size_t bytes) +{ + size_t base = 0; + + while (bytes || !iov->iov_len) { + int copy = min(bytes, iov->iov_len); + + bytes -= copy; + base += copy; + if (iov->iov_len == base) { + iov++; + nr_segs--; + base = 0; + } + } + memcpy(new, iov, sizeof(*iov) * nr_segs); + new->iov_base += base; + new->iov_len -= base; + return nr_segs; +} + +static struct kvec * +get_server_iovec(struct TCP_Server_Info *server, unsigned int nr_segs) { - int length, rc = 0; - unsigned int total_read; - char *buf = iov->iov_base; + struct kvec *new_iov; + + if (server->iov && nr_segs <= server->nr_iov) + return server->iov; + + /* not big enough -- allocate a new one and release the old */ + new_iov = kmalloc(sizeof(*new_iov) * nr_segs, GFP_NOFS); + if (new_iov) { + kfree(server->iov); + server->iov = new_iov; + server->nr_iov = nr_segs; + } + return new_iov; +} + +int +cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig, + unsigned int nr_segs, unsigned int to_read) +{ + int length = 0; + int total_read; + unsigned int segs; + struct msghdr smb_msg; + struct kvec *iov; + + iov = get_server_iovec(server, nr_segs); + if (!iov) + return -ENOMEM; + + smb_msg.msg_control = NULL; + smb_msg.msg_controllen = 0; + + for (total_read = 0; to_read; total_read += length, to_read -= length) { + try_to_freeze(); + + if (server_unresponsive(server)) { + total_read = -EAGAIN; + break; + } + + segs = kvec_array_init(iov, iov_orig, nr_segs, total_read); + + length = kernel_recvmsg(server->ssocket, &smb_msg, + iov, segs, to_read, 0); - for (total_read = 0; total_read < to_read; total_read += length) { - length = kernel_recvmsg(server->ssocket, smb_msg, iov, 1, - to_read - total_read, 0); if (server->tcpStatus == CifsExiting) { - /* then will exit */ - rc = 2; + total_read = -ESHUTDOWN; break; } else if (server->tcpStatus == CifsNeedReconnect) { cifs_reconnect(server); - /* Reconnect wakes up rspns q */ - /* Now we will reread sock */ - rc = 1; + total_read = -EAGAIN; break; } else if (length == -ERESTARTSYS || length == -EAGAIN || @@ -390,56 +470,54 @@ read_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg, */ usleep_range(1000, 2000); length = 0; - if (!is_header_read) - continue; - /* Special handling for header read */ - if (total_read) { - iov->iov_base = (to_read - total_read) + - buf; - iov->iov_len = to_read - total_read; - smb_msg->msg_control = NULL; - smb_msg->msg_controllen = 0; - rc = 3; - } else - rc = 1; - break; + continue; } else if (length <= 0) { - cERROR(1, "Received no data, expecting %d", - to_read - total_read); + cFYI(1, "Received no data or error: expecting %d " + "got %d", to_read, length); cifs_reconnect(server); - rc = 1; + total_read = -EAGAIN; break; } } + return total_read; +} - *ptotal_read = total_read; - return rc; +int +cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, + unsigned int to_read) +{ + struct kvec iov; + + iov.iov_base = buf; + iov.iov_len = to_read; + + return cifs_readv_from_socket(server, &iov, 1, to_read); } static bool -check_rfc1002_header(struct TCP_Server_Info *server, char *buf) +is_smb_response(struct TCP_Server_Info *server, unsigned char type) { - char temp = *buf; - unsigned int pdu_length = be32_to_cpu( - ((struct smb_hdr *)buf)->smb_buf_length); - /* * The first byte big endian of the length field, * is actually not part of the length but the type * with the most common, zero, as regular data. */ - if (temp == (char) RFC1002_SESSION_KEEP_ALIVE) { - return false; - } else if (temp == (char)RFC1002_POSITIVE_SESSION_RESPONSE) { - cFYI(1, "Good RFC 1002 session rsp"); - return false; - } else if (temp == (char)RFC1002_NEGATIVE_SESSION_RESPONSE) { + switch (type) { + case RFC1002_SESSION_MESSAGE: + /* Regular SMB response */ + return true; + case RFC1002_SESSION_KEEP_ALIVE: + cFYI(1, "RFC 1002 session keep alive"); + break; + case RFC1002_POSITIVE_SESSION_RESPONSE: + cFYI(1, "RFC 1002 positive session response"); + break; + case RFC1002_NEGATIVE_SESSION_RESPONSE: /* * We get this from Windows 98 instead of an error on * SMB negprot response. */ - cFYI(1, "Negative RFC1002 Session Response Error 0x%x)", - pdu_length); + cFYI(1, "RFC 1002 negative session response"); /* give server a second to clean up */ msleep(1000); /* @@ -448,87 +526,89 @@ check_rfc1002_header(struct TCP_Server_Info *server, char *buf) * is since we do not begin with RFC1001 session * initialize frame). */ - cifs_set_port((struct sockaddr *) - &server->dstaddr, CIFS_PORT); + cifs_set_port((struct sockaddr *)&server->dstaddr, CIFS_PORT); cifs_reconnect(server); wake_up(&server->response_q); - return false; - } else if (temp != (char) 0) { - cERROR(1, "Unknown RFC 1002 frame"); - cifs_dump_mem(" Received Data: ", buf, 4); - cifs_reconnect(server); - return false; - } - - /* else we have an SMB response */ - if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) || - (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) { - cERROR(1, "Invalid size SMB length %d pdu_length %d", - 4, pdu_length+4); + break; + default: + cERROR(1, "RFC 1002 unknown response type 0x%x", type); cifs_reconnect(server); - wake_up(&server->response_q); - return false; } - return true; + return false; } static struct mid_q_entry * -find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf, - int *length, bool is_large_buf, bool *is_multi_rsp, char **bigbuf) +find_mid(struct TCP_Server_Info *server, struct smb_hdr *buf) { - struct mid_q_entry *mid = NULL, *tmp_mid, *ret = NULL; + struct mid_q_entry *mid; spin_lock(&GlobalMid_Lock); - list_for_each_entry_safe(mid, tmp_mid, &server->pending_mid_q, qhead) { - if (mid->mid != buf->Mid || - mid->midState != MID_REQUEST_SUBMITTED || - mid->command != buf->Command) - continue; - - if (*length == 0 && check2ndT2(buf, server->maxBuf) > 0) { - /* We have a multipart transact2 resp */ - *is_multi_rsp = true; - if (mid->resp_buf) { - /* merge response - fix up 1st*/ - *length = coalesce_t2(buf, mid->resp_buf); - if (*length > 0) { - *length = 0; - mid->multiRsp = true; - break; - } - /* All parts received or packet is malformed. */ - mid->multiEnd = true; - goto multi_t2_fnd; - } - if (!is_large_buf) { - /*FIXME: switch to already allocated largebuf?*/ - cERROR(1, "1st trans2 resp needs bigbuf"); - } else { - /* Have first buffer */ - mid->resp_buf = buf; - mid->largeBuf = true; - *bigbuf = NULL; - } - break; + list_for_each_entry(mid, &server->pending_mid_q, qhead) { + if (mid->mid == buf->Mid && + mid->midState == MID_REQUEST_SUBMITTED && + mid->command == buf->Command) { + spin_unlock(&GlobalMid_Lock); + return mid; } - mid->resp_buf = buf; - mid->largeBuf = is_large_buf; -multi_t2_fnd: - if (*length == 0) - mid->midState = MID_RESPONSE_RECEIVED; - else - mid->midState = MID_RESPONSE_MALFORMED; + } + spin_unlock(&GlobalMid_Lock); + return NULL; +} + +void +dequeue_mid(struct mid_q_entry *mid, bool malformed) +{ #ifdef CONFIG_CIFS_STATS2 - mid->when_received = jiffies; + mid->when_received = jiffies; #endif - list_del_init(&mid->qhead); - ret = mid; - break; - } + spin_lock(&GlobalMid_Lock); + if (!malformed) + mid->midState = MID_RESPONSE_RECEIVED; + else + mid->midState = MID_RESPONSE_MALFORMED; + list_del_init(&mid->qhead); spin_unlock(&GlobalMid_Lock); +} - return ret; +static void +handle_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server, + struct smb_hdr *buf, int malformed) +{ + if (malformed == 0 && check2ndT2(buf) > 0) { + mid->multiRsp = true; + if (mid->resp_buf) { + /* merge response - fix up 1st*/ + malformed = coalesce_t2(buf, mid->resp_buf); + if (malformed > 0) + return; + + /* All parts received or packet is malformed. */ + mid->multiEnd = true; + return dequeue_mid(mid, malformed); + } + if (!server->large_buf) { + /*FIXME: switch to already allocated largebuf?*/ + cERROR(1, "1st trans2 resp needs bigbuf"); + } else { + /* Have first buffer */ + mid->resp_buf = buf; + mid->largeBuf = true; + server->bigbuf = NULL; + } + return; + } + mid->resp_buf = buf; + mid->largeBuf = server->large_buf; + /* Was previous buf put in mpx struct for multi-rsp? */ + if (!mid->multiRsp) { + /* smb buffer will be freed by user thread */ + if (server->large_buf) + server->bigbuf = NULL; + else + server->smallbuf = NULL; + } + dequeue_mid(mid, malformed); } static void clean_demultiplex_info(struct TCP_Server_Info *server) @@ -618,6 +698,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) } kfree(server->hostname); + kfree(server->iov); kfree(server); length = atomic_dec_return(&tcpSesAllocCount); @@ -627,20 +708,70 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) } static int +standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) +{ + int length; + char *buf = server->smallbuf; + struct smb_hdr *smb_buffer = (struct smb_hdr *)buf; + unsigned int pdu_length = be32_to_cpu(smb_buffer->smb_buf_length); + + /* make sure this will fit in a large buffer */ + if (pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { + cERROR(1, "SMB response too long (%u bytes)", + pdu_length); + cifs_reconnect(server); + wake_up(&server->response_q); + return -EAGAIN; + } + + /* switch to large buffer if too big for a small one */ + if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) { + server->large_buf = true; + memcpy(server->bigbuf, server->smallbuf, server->total_read); + buf = server->bigbuf; + smb_buffer = (struct smb_hdr *)buf; + } + + /* now read the rest */ + length = cifs_read_from_socket(server, + buf + sizeof(struct smb_hdr) - 1, + pdu_length - sizeof(struct smb_hdr) + 1 + 4); + if (length < 0) + return length; + server->total_read += length; + + dump_smb(smb_buffer, server->total_read); + + /* + * We know that we received enough to get to the MID as we + * checked the pdu_length earlier. Now check to see + * if the rest of the header is OK. We borrow the length + * var for the rest of the loop to avoid a new stack var. + * + * 48 bytes is enough to display the header and a little bit + * into the payload for debugging purposes. + */ + length = checkSMB(smb_buffer, smb_buffer->Mid, server->total_read); + if (length != 0) + cifs_dump_mem("Bad SMB: ", buf, + min_t(unsigned int, server->total_read, 48)); + + if (mid) + handle_mid(mid, server, smb_buffer, length); + + return length; +} + +static int cifs_demultiplex_thread(void *p) { int length; struct TCP_Server_Info *server = p; - unsigned int pdu_length, total_read; - char *buf = NULL, *bigbuf = NULL, *smallbuf = NULL; + unsigned int pdu_length; + char *buf = NULL; struct smb_hdr *smb_buffer = NULL; - struct msghdr smb_msg; - struct kvec iov; struct task_struct *task_to_wake = NULL; struct mid_q_entry *mid_entry; - bool isLargeBuf = false; - bool isMultiRsp = false; - int rc; current->flags |= PF_MEMALLOC; cFYI(1, "Demultiplex PID: %d", task_pid_nr(current)); @@ -655,111 +786,65 @@ cifs_demultiplex_thread(void *p) if (try_to_freeze()) continue; - if (!allocate_buffers(&bigbuf, &smallbuf, - sizeof(struct smb_hdr), isLargeBuf)) + if (!allocate_buffers(server)) continue; - isLargeBuf = false; - isMultiRsp = false; - smb_buffer = (struct smb_hdr *)smallbuf; - buf = smallbuf; - iov.iov_base = buf; - iov.iov_len = 4; - smb_msg.msg_control = NULL; - smb_msg.msg_controllen = 0; + server->large_buf = false; + smb_buffer = (struct smb_hdr *)server->smallbuf; + buf = server->smallbuf; pdu_length = 4; /* enough to get RFC1001 header */ -incomplete_rcv: - if (echo_retries > 0 && server->tcpStatus == CifsGood && - time_after(jiffies, server->lstrp + - (echo_retries * SMB_ECHO_INTERVAL))) { - cERROR(1, "Server %s has not responded in %d seconds. " - "Reconnecting...", server->hostname, - (echo_retries * SMB_ECHO_INTERVAL / HZ)); - cifs_reconnect(server); - wake_up(&server->response_q); - continue; - } - - rc = read_from_socket(server, &smb_msg, &iov, pdu_length, - &total_read, true /* header read */); - if (rc == 3) - goto incomplete_rcv; - else if (rc == 2) - break; - else if (rc == 1) + length = cifs_read_from_socket(server, buf, pdu_length); + if (length < 0) continue; + server->total_read = length; /* * The right amount was read from socket - 4 bytes, * so we can now interpret the length field. */ - - /* - * Note that RFC 1001 length is big endian on the wire, - * but we convert it here so it is always manipulated - * as host byte order. - */ pdu_length = be32_to_cpu(smb_buffer->smb_buf_length); - cFYI(1, "rfc1002 length 0x%x", pdu_length+4); - if (!check_rfc1002_header(server, buf)) + cFYI(1, "RFC1002 header 0x%x", pdu_length); + if (!is_smb_response(server, buf[0])) continue; - /* else length ok */ - if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) { - isLargeBuf = true; - memcpy(bigbuf, smallbuf, 4); - smb_buffer = (struct smb_hdr *)bigbuf; - buf = bigbuf; + /* make sure we have enough to get to the MID */ + if (pdu_length < sizeof(struct smb_hdr) - 1 - 4) { + cERROR(1, "SMB response too short (%u bytes)", + pdu_length); + cifs_reconnect(server); + wake_up(&server->response_q); + continue; } - iov.iov_base = 4 + buf; - iov.iov_len = pdu_length; - rc = read_from_socket(server, &smb_msg, &iov, pdu_length, - &total_read, false); - if (rc == 2) - break; - else if (rc == 1) + /* read down to the MID */ + length = cifs_read_from_socket(server, buf + 4, + sizeof(struct smb_hdr) - 1 - 4); + if (length < 0) continue; + server->total_read += length; - total_read += 4; /* account for rfc1002 hdr */ + mid_entry = find_mid(server, smb_buffer); - dump_smb(smb_buffer, total_read); + if (!mid_entry || !mid_entry->receive) + length = standard_receive3(server, mid_entry); + else + length = mid_entry->receive(server, mid_entry); - /* - * We know that we received enough to get to the MID as we - * checked the pdu_length earlier. Now check to see - * if the rest of the header is OK. We borrow the length - * var for the rest of the loop to avoid a new stack var. - * - * 48 bytes is enough to display the header and a little bit - * into the payload for debugging purposes. - */ - length = checkSMB(smb_buffer, smb_buffer->Mid, total_read); - if (length != 0) - cifs_dump_mem("Bad SMB: ", buf, - min_t(unsigned int, total_read, 48)); + if (length < 0) + continue; - server->lstrp = jiffies; + if (server->large_buf) { + buf = server->bigbuf; + smb_buffer = (struct smb_hdr *)buf; + } - mid_entry = find_cifs_mid(server, smb_buffer, &length, - isLargeBuf, &isMultiRsp, &bigbuf); + server->lstrp = jiffies; if (mid_entry != NULL) { - mid_entry->callback(mid_entry); - /* Was previous buf put in mpx struct for multi-rsp? */ - if (!isMultiRsp) { - /* smb buffer will be freed by user thread */ - if (isLargeBuf) - bigbuf = NULL; - else - smallbuf = NULL; - } - } else if (length != 0) { - /* response sanity checks failed */ - continue; - } else if (!is_valid_oplock_break(smb_buffer, server) && - !isMultiRsp) { + if (!mid_entry->multiRsp || mid_entry->multiEnd) + mid_entry->callback(mid_entry); + } else if (!is_valid_oplock_break(smb_buffer, server)) { cERROR(1, "No task to wake, unknown frame received! " "NumMids %d", atomic_read(&midCount)); cifs_dump_mem("Received Data is: ", buf, @@ -773,9 +858,9 @@ incomplete_rcv: } /* end while !EXITING */ /* buffer usually freed in free_mid - need to free it here on exit */ - cifs_buf_release(bigbuf); - if (smallbuf) /* no sense logging a debug message if NULL */ - cifs_small_buf_release(smallbuf); + cifs_buf_release(server->bigbuf); + if (server->smallbuf) /* no sense logging a debug message if NULL */ + cifs_small_buf_release(server->smallbuf); task_to_wake = xchg(&server->tsk, NULL); clean_demultiplex_info(server); @@ -827,6 +912,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, { char *value, *data, *end; char *mountdata_copy = NULL, *options; + int err; unsigned int temp_len, i, j; char separator[2]; short int override_uid = -1; @@ -883,6 +969,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, cFYI(1, "Null separator not allowed"); } } + vol->backupuid_specified = false; /* no backup intent for a user */ + vol->backupgid_specified = false; /* no backup intent for a group */ while ((data = strsep(&options, separator)) != NULL) { if (!*data) @@ -1442,6 +1530,22 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->mfsymlinks = true; } else if (strnicmp(data, "multiuser", 8) == 0) { vol->multiuser = true; + } else if (!strnicmp(data, "backupuid", 9) && value && *value) { + err = kstrtouint(value, 0, &vol->backupuid); + if (err < 0) { + cERROR(1, "%s: Invalid backupuid value", + __func__); + goto cifs_parse_mount_err; + } + vol->backupuid_specified = true; + } else if (!strnicmp(data, "backupgid", 9) && value && *value) { + err = kstrtouint(value, 0, &vol->backupgid); + if (err < 0) { + cERROR(1, "%s: Invalid backupgid value", + __func__); + goto cifs_parse_mount_err; + } + vol->backupgid_specified = true; } else printk(KERN_WARNING "CIFS: Unknown mount option %s\n", data); @@ -2018,7 +2122,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) warned_on_ntlm = true; cERROR(1, "default security mechanism requested. The default " "security mechanism will be upgraded from ntlm to " - "ntlmv2 in kernel release 3.2"); + "ntlmv2 in kernel release 3.3"); } ses->overrideSecFlg = volume_info->secFlg; @@ -2209,16 +2313,16 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data) (new->mnt_cifs_flags & CIFS_MOUNT_MASK)) return 0; - if (old->rsize != new->rsize) - return 0; - /* - * We want to share sb only if we don't specify wsize or specified wsize - * is greater or equal than existing one. + * We want to share sb only if we don't specify an r/wsize or + * specified r/wsize is greater than or equal to existing one. */ if (new->wsize && new->wsize < old->wsize) return 0; + if (new->rsize && new->rsize < old->rsize) + return 0; + if (old->mnt_uid != new->mnt_uid || old->mnt_gid != new->mnt_gid) return 0; @@ -2656,14 +2760,6 @@ void reset_cifs_unix_caps(int xid, struct cifs_tcon *tcon, CIFS_MOUNT_POSIX_PATHS; } - if (cifs_sb && (cifs_sb->rsize > 127 * 1024)) { - if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) { - cifs_sb->rsize = 127 * 1024; - cFYI(DBG2, "larger reads not supported by srv"); - } - } - - cFYI(1, "Negotiate caps 0x%x", (int)cap); #ifdef CONFIG_CIFS_DEBUG2 if (cap & CIFS_UNIX_FCNTL_CAP) @@ -2708,31 +2804,19 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, spin_lock_init(&cifs_sb->tlink_tree_lock); cifs_sb->tlink_tree = RB_ROOT; - if (pvolume_info->rsize > CIFSMaxBufSize) { - cERROR(1, "rsize %d too large, using MaxBufSize", - pvolume_info->rsize); - cifs_sb->rsize = CIFSMaxBufSize; - } else if ((pvolume_info->rsize) && - (pvolume_info->rsize <= CIFSMaxBufSize)) - cifs_sb->rsize = pvolume_info->rsize; - else /* default */ - cifs_sb->rsize = CIFSMaxBufSize; - - if (cifs_sb->rsize < 2048) { - cifs_sb->rsize = 2048; - /* Windows ME may prefer this */ - cFYI(1, "readsize set to minimum: 2048"); - } - /* - * Temporarily set wsize for matching superblock. If we end up using - * new sb then cifs_negotiate_wsize will later negotiate it downward - * if needed. + * Temporarily set r/wsize for matching superblock. If we end up using + * new sb then client will later negotiate it downward if needed. */ + cifs_sb->rsize = pvolume_info->rsize; cifs_sb->wsize = pvolume_info->wsize; cifs_sb->mnt_uid = pvolume_info->linux_uid; cifs_sb->mnt_gid = pvolume_info->linux_gid; + if (pvolume_info->backupuid_specified) + cifs_sb->mnt_backupuid = pvolume_info->backupuid; + if (pvolume_info->backupgid_specified) + cifs_sb->mnt_backupgid = pvolume_info->backupgid; cifs_sb->mnt_file_mode = pvolume_info->file_mode; cifs_sb->mnt_dir_mode = pvolume_info->dir_mode; cFYI(1, "file mode: 0x%x dir mode: 0x%x", @@ -2763,6 +2847,10 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_RWPIDFORWARD; if (pvolume_info->cifs_acl) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL; + if (pvolume_info->backupuid_specified) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPUID; + if (pvolume_info->backupgid_specified) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPGID; if (pvolume_info->override_uid) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID; if (pvolume_info->override_gid) @@ -2795,29 +2883,41 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, } /* - * When the server supports very large writes via POSIX extensions, we can - * allow up to 2^24-1, minus the size of a WRITE_AND_X header, not including - * the RFC1001 length. + * When the server supports very large reads and writes via POSIX extensions, + * we can allow up to 2^24-1, minus the size of a READ/WRITE_AND_X header, not + * including the RFC1001 length. * * Note that this might make for "interesting" allocation problems during * writeback however as we have to allocate an array of pointers for the * pages. A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096. + * + * For reads, there is a similar problem as we need to allocate an array + * of kvecs to handle the receive, though that should only need to be done + * once. */ #define CIFS_MAX_WSIZE ((1<<24) - 1 - sizeof(WRITE_REQ) + 4) +#define CIFS_MAX_RSIZE ((1<<24) - sizeof(READ_RSP) + 4) /* - * When the server doesn't allow large posix writes, only allow a wsize of - * 128k minus the size of the WRITE_AND_X header. That allows for a write up - * to the maximum size described by RFC1002. + * When the server doesn't allow large posix writes, only allow a rsize/wsize + * of 2^17-1 minus the size of the call header. That allows for a read or + * write up to the maximum size described by RFC1002. */ -#define CIFS_MAX_RFC1002_WSIZE (128 * 1024 - sizeof(WRITE_REQ) + 4) +#define CIFS_MAX_RFC1002_WSIZE ((1<<17) - 1 - sizeof(WRITE_REQ) + 4) +#define CIFS_MAX_RFC1002_RSIZE ((1<<17) - 1 - sizeof(READ_RSP) + 4) /* * The default wsize is 1M. find_get_pages seems to return a maximum of 256 * pages in a single call. With PAGE_CACHE_SIZE == 4k, this means we can fill * a single wsize request with a single call. */ -#define CIFS_DEFAULT_WSIZE (1024 * 1024) +#define CIFS_DEFAULT_IOSIZE (1024 * 1024) + +/* + * Windows only supports a max of 60k reads. Default to that when posix + * extensions aren't in force. + */ +#define CIFS_DEFAULT_NON_POSIX_RSIZE (60 * 1024) static unsigned int cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) @@ -2825,7 +2925,7 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability); struct TCP_Server_Info *server = tcon->ses->server; unsigned int wsize = pvolume_info->wsize ? pvolume_info->wsize : - CIFS_DEFAULT_WSIZE; + CIFS_DEFAULT_IOSIZE; /* can server support 24-bit write sizes? (via UNIX extensions) */ if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP)) @@ -2848,6 +2948,50 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) return wsize; } +static unsigned int +cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) +{ + __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability); + struct TCP_Server_Info *server = tcon->ses->server; + unsigned int rsize, defsize; + + /* + * Set default value... + * + * HACK alert! Ancient servers have very small buffers. Even though + * MS-CIFS indicates that servers are only limited by the client's + * bufsize for reads, testing against win98se shows that it throws + * INVALID_PARAMETER errors if you try to request too large a read. + * + * If the server advertises a MaxBufferSize of less than one page, + * assume that it also can't satisfy reads larger than that either. + * + * FIXME: Is there a better heuristic for this? + */ + if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_READ_CAP)) + defsize = CIFS_DEFAULT_IOSIZE; + else if (server->capabilities & CAP_LARGE_READ_X) + defsize = CIFS_DEFAULT_NON_POSIX_RSIZE; + else if (server->maxBuf >= PAGE_CACHE_SIZE) + defsize = CIFSMaxBufSize; + else + defsize = server->maxBuf - sizeof(READ_RSP); + + rsize = pvolume_info->rsize ? pvolume_info->rsize : defsize; + + /* + * no CAP_LARGE_READ_X? Then MS-CIFS states that we must limit this to + * the client's MaxBufferSize. + */ + if (!(server->capabilities & CAP_LARGE_READ_X)) + rsize = min_t(unsigned int, CIFSMaxBufSize, rsize); + + /* hard limit of CIFS_MAX_RSIZE */ + rsize = min_t(unsigned int, rsize, CIFS_MAX_RSIZE); + + return rsize; +} + static int is_path_accessible(int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path) @@ -2877,9 +3021,9 @@ cleanup_volume_info_contents(struct smb_vol *volume_info) { kfree(volume_info->username); kzfree(volume_info->password); - kfree(volume_info->UNC); if (volume_info->UNCip != volume_info->UNC + 2) kfree(volume_info->UNCip); + kfree(volume_info->UNC); kfree(volume_info->domainname); kfree(volume_info->iocharset); kfree(volume_info->prepath); @@ -3041,6 +3185,22 @@ cifs_get_volume_info(char *mount_data, const char *devname) return volume_info; } +/* make sure ra_pages is a multiple of rsize */ +static inline unsigned int +cifs_ra_pages(struct cifs_sb_info *cifs_sb) +{ + unsigned int reads; + unsigned int rsize_pages = cifs_sb->rsize / PAGE_CACHE_SIZE; + + if (rsize_pages >= default_backing_dev_info.ra_pages) + return default_backing_dev_info.ra_pages; + else if (rsize_pages == 0) + return rsize_pages; + + reads = default_backing_dev_info.ra_pages / rsize_pages; + return reads * rsize_pages; +} + int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) { @@ -3059,8 +3219,6 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) if (rc) return rc; - cifs_sb->bdi.ra_pages = default_backing_dev_info.ra_pages; - #ifdef CONFIG_CIFS_DFS_UPCALL try_mount_again: /* cleanup activities if we're chasing a referral */ @@ -3125,15 +3283,11 @@ try_mount_again: CIFSSMBQFSAttributeInfo(xid, tcon); } - if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) { - cifs_sb->rsize = 1024 * 127; - cFYI(DBG2, "no very large read support, rsize now 127K"); - } - if (!(tcon->ses->capabilities & CAP_LARGE_READ_X)) - cifs_sb->rsize = min(cifs_sb->rsize, - (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE)); - cifs_sb->wsize = cifs_negotiate_wsize(tcon, volume_info); + cifs_sb->rsize = cifs_negotiate_rsize(tcon, volume_info); + + /* tune readahead according to rsize */ + cifs_sb->bdi.ra_pages = cifs_ra_pages(cifs_sb); remote_path_check: #ifdef CONFIG_CIFS_DFS_UPCALL @@ -3301,7 +3455,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses, else #endif /* CIFS_WEAK_PW_HASH */ rc = SMBNTencrypt(tcon->password, ses->server->cryptkey, - bcc_ptr); + bcc_ptr, nls_codepage); bcc_ptr += CIFS_AUTH_RESP_SIZE; if (ses->capabilities & CAP_UNICODE) { diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 72d448bf96c..d7eeb9d3ed6 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -171,7 +171,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, } tcon = tlink_tcon(tlink); - if (oplockEnabled) + if (enable_oplocks) oplock = REQ_OPLOCK; if (nd) @@ -244,6 +244,9 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, if (!tcon->unix_ext && (mode & S_IWUGO) == 0) create_options |= CREATE_OPTION_READONLY; + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + if (tcon->ses->capabilities & CAP_NT_SMBS) rc = CIFSSMBOpen(xid, tcon, full_path, disposition, desiredAccess, create_options, @@ -357,6 +360,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, { int rc = -EPERM; int xid; + int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL; struct cifs_sb_info *cifs_sb; struct tcon_link *tlink; struct cifs_tcon *pTcon; @@ -431,9 +435,11 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, return rc; } - /* FIXME: would WRITE_OWNER | WRITE_DAC be better? */ + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE, - GENERIC_WRITE, CREATE_NOT_DIR | CREATE_OPTION_SPECIAL, + GENERIC_WRITE, create_options, &fileHandle, &oplock, buf, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc) @@ -642,8 +648,16 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd) if (direntry->d_inode) { if (cifs_revalidate_dentry(direntry)) return 0; - else + else { + /* + * Forcibly invalidate automounting directory inodes + * (remote DFS directories) so to have them + * instantiated again for automount + */ + if (IS_AUTOMOUNT(direntry->d_inode)) + return 0; return 1; + } } /* diff --git a/fs/cifs/export.c b/fs/cifs/export.c index 55d87ac5200..9c7ecdccf2f 100644 --- a/fs/cifs/export.c +++ b/fs/cifs/export.c @@ -45,7 +45,7 @@ #include "cifs_debug.h" #include "cifsfs.h" -#ifdef CIFS_NFSD_EXPORT +#ifdef CONFIG_CIFS_NFSD_EXPORT static struct dentry *cifs_get_parent(struct dentry *dentry) { /* BB need to add code here eventually to enable export via NFSD */ @@ -63,5 +63,5 @@ const struct export_operations cifs_export_ops = { .encode_fs = */ }; -#endif /* CIFS_NFSD_EXPORT */ +#endif /* CONFIG_CIFS_NFSD_EXPORT */ diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 9f41a10523a..4dd9283885e 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -32,6 +32,7 @@ #include <linux/delay.h> #include <linux/mount.h> #include <linux/slab.h> +#include <linux/swap.h> #include <asm/div64.h> #include "cifsfs.h" #include "cifspdu.h" @@ -174,6 +175,7 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, int rc; int desiredAccess; int disposition; + int create_options = CREATE_NOT_DIR; FILE_ALL_INFO *buf; desiredAccess = cifs_convert_flags(f_flags); @@ -210,9 +212,12 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, if (!buf) return -ENOMEM; + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + if (tcon->ses->capabilities & CAP_NT_SMBS) rc = CIFSSMBOpen(xid, tcon, full_path, disposition, - desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf, + desiredAccess, create_options, pnetfid, poplock, buf, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); else @@ -258,8 +263,6 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file, pCifsFile->invalidHandle = false; pCifsFile->tlink = cifs_get_tlink(tlink); mutex_init(&pCifsFile->fh_mutex); - mutex_init(&pCifsFile->lock_mutex); - INIT_LIST_HEAD(&pCifsFile->llist); INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break); spin_lock(&cifs_file_list_lock); @@ -272,11 +275,14 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file, spin_unlock(&cifs_file_list_lock); cifs_set_oplock_level(pCifsInode, oplock); + pCifsInode->can_cache_brlcks = pCifsInode->clientCanCacheAll; file->private_data = pCifsFile; return pCifsFile; } +static void cifs_del_lock_waiters(struct cifsLockInfo *lock); + /* * Release a reference on the file private data. This may involve closing * the filehandle out on the server. Must be called without holding @@ -327,12 +333,15 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) /* Delete any outstanding lock records. We'll lose them when the file * is closed anyway. */ - mutex_lock(&cifs_file->lock_mutex); - list_for_each_entry_safe(li, tmp, &cifs_file->llist, llist) { + mutex_lock(&cifsi->lock_mutex); + list_for_each_entry_safe(li, tmp, &cifsi->llist, llist) { + if (li->netfid != cifs_file->netfid) + continue; list_del(&li->llist); + cifs_del_lock_waiters(li); kfree(li); } - mutex_unlock(&cifs_file->lock_mutex); + mutex_unlock(&cifsi->lock_mutex); cifs_put_tlink(cifs_file->tlink); dput(cifs_file->dentry); @@ -371,7 +380,7 @@ int cifs_open(struct inode *inode, struct file *file) cFYI(1, "inode = 0x%p file flags are 0x%x for %s", inode, file->f_flags, full_path); - if (oplockEnabled) + if (enable_oplocks) oplock = REQ_OPLOCK; else oplock = 0; @@ -465,6 +474,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush) char *full_path = NULL; int desiredAccess; int disposition = FILE_OPEN; + int create_options = CREATE_NOT_DIR; __u16 netfid; xid = GetXid(); @@ -495,7 +505,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush) cFYI(1, "inode = 0x%p file flags 0x%x for %s", inode, pCifsFile->f_flags, full_path); - if (oplockEnabled) + if (enable_oplocks) oplock = REQ_OPLOCK; else oplock = 0; @@ -524,6 +534,9 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush) desiredAccess = cifs_convert_flags(pCifsFile->f_flags); + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + /* Can not refresh inode by passing in file_info buf to be returned by SMBOpen and then calling get_inode_info with returned buf since file might have write behind data that needs to be flushed @@ -531,7 +544,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush) that inode was not dirty locally we could do this */ rc = CIFSSMBOpen(xid, tcon, full_path, disposition, desiredAccess, - CREATE_NOT_DIR, &netfid, &oplock, NULL, + create_options, &netfid, &oplock, NULL, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc) { @@ -631,219 +644,713 @@ int cifs_closedir(struct inode *inode, struct file *file) return rc; } -static int store_file_lock(struct cifsFileInfo *fid, __u64 len, - __u64 offset, __u8 lockType) +static struct cifsLockInfo * +cifs_lock_init(__u64 offset, __u64 length, __u8 type, __u16 netfid) { - struct cifsLockInfo *li = + struct cifsLockInfo *lock = kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL); - if (li == NULL) - return -ENOMEM; - li->offset = offset; - li->length = len; - li->type = lockType; - mutex_lock(&fid->lock_mutex); - list_add(&li->llist, &fid->llist); - mutex_unlock(&fid->lock_mutex); - return 0; + if (!lock) + return lock; + lock->offset = offset; + lock->length = length; + lock->type = type; + lock->netfid = netfid; + lock->pid = current->tgid; + INIT_LIST_HEAD(&lock->blist); + init_waitqueue_head(&lock->block_q); + return lock; } -int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) +static void +cifs_del_lock_waiters(struct cifsLockInfo *lock) { - int rc, xid; - __u32 numLock = 0; - __u32 numUnlock = 0; - __u64 length; - bool wait_flag = false; - struct cifs_sb_info *cifs_sb; + struct cifsLockInfo *li, *tmp; + list_for_each_entry_safe(li, tmp, &lock->blist, blist) { + list_del_init(&li->blist); + wake_up(&li->block_q); + } +} + +static bool +__cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset, + __u64 length, __u8 type, __u16 netfid, + struct cifsLockInfo **conf_lock) +{ + struct cifsLockInfo *li, *tmp; + + list_for_each_entry_safe(li, tmp, &cinode->llist, llist) { + if (offset + length <= li->offset || + offset >= li->offset + li->length) + continue; + else if ((type & LOCKING_ANDX_SHARED_LOCK) && + ((netfid == li->netfid && current->tgid == li->pid) || + type == li->type)) + continue; + else { + *conf_lock = li; + return true; + } + } + return false; +} + +static bool +cifs_find_lock_conflict(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock, + struct cifsLockInfo **conf_lock) +{ + return __cifs_find_lock_conflict(cinode, lock->offset, lock->length, + lock->type, lock->netfid, conf_lock); +} + +/* + * Check if there is another lock that prevents us to set the lock (mandatory + * style). If such a lock exists, update the flock structure with its + * properties. Otherwise, set the flock type to F_UNLCK if we can cache brlocks + * or leave it the same if we can't. Returns 0 if we don't need to request to + * the server or 1 otherwise. + */ +static int +cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, + __u8 type, __u16 netfid, struct file_lock *flock) +{ + int rc = 0; + struct cifsLockInfo *conf_lock; + bool exist; + + mutex_lock(&cinode->lock_mutex); + + exist = __cifs_find_lock_conflict(cinode, offset, length, type, netfid, + &conf_lock); + if (exist) { + flock->fl_start = conf_lock->offset; + flock->fl_end = conf_lock->offset + conf_lock->length - 1; + flock->fl_pid = conf_lock->pid; + if (conf_lock->type & LOCKING_ANDX_SHARED_LOCK) + flock->fl_type = F_RDLCK; + else + flock->fl_type = F_WRLCK; + } else if (!cinode->can_cache_brlcks) + rc = 1; + else + flock->fl_type = F_UNLCK; + + mutex_unlock(&cinode->lock_mutex); + return rc; +} + +static void +cifs_lock_add(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock) +{ + mutex_lock(&cinode->lock_mutex); + list_add_tail(&lock->llist, &cinode->llist); + mutex_unlock(&cinode->lock_mutex); +} + +/* + * Set the byte-range lock (mandatory style). Returns: + * 1) 0, if we set the lock and don't need to request to the server; + * 2) 1, if no locks prevent us but we need to request to the server; + * 3) -EACCESS, if there is a lock that prevents us and wait is false. + */ +static int +cifs_lock_add_if(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock, + bool wait) +{ + struct cifsLockInfo *conf_lock; + bool exist; + int rc = 0; + +try_again: + exist = false; + mutex_lock(&cinode->lock_mutex); + + exist = cifs_find_lock_conflict(cinode, lock, &conf_lock); + if (!exist && cinode->can_cache_brlcks) { + list_add_tail(&lock->llist, &cinode->llist); + mutex_unlock(&cinode->lock_mutex); + return rc; + } + + if (!exist) + rc = 1; + else if (!wait) + rc = -EACCES; + else { + list_add_tail(&lock->blist, &conf_lock->blist); + mutex_unlock(&cinode->lock_mutex); + rc = wait_event_interruptible(lock->block_q, + (lock->blist.prev == &lock->blist) && + (lock->blist.next == &lock->blist)); + if (!rc) + goto try_again; + mutex_lock(&cinode->lock_mutex); + list_del_init(&lock->blist); + } + + mutex_unlock(&cinode->lock_mutex); + return rc; +} + +/* + * Check if there is another lock that prevents us to set the lock (posix + * style). If such a lock exists, update the flock structure with its + * properties. Otherwise, set the flock type to F_UNLCK if we can cache brlocks + * or leave it the same if we can't. Returns 0 if we don't need to request to + * the server or 1 otherwise. + */ +static int +cifs_posix_lock_test(struct file *file, struct file_lock *flock) +{ + int rc = 0; + struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); + unsigned char saved_type = flock->fl_type; + + if ((flock->fl_flags & FL_POSIX) == 0) + return 1; + + mutex_lock(&cinode->lock_mutex); + posix_test_lock(file, flock); + + if (flock->fl_type == F_UNLCK && !cinode->can_cache_brlcks) { + flock->fl_type = saved_type; + rc = 1; + } + + mutex_unlock(&cinode->lock_mutex); + return rc; +} + +/* + * Set the byte-range lock (posix style). Returns: + * 1) 0, if we set the lock and don't need to request to the server; + * 2) 1, if we need to request to the server; + * 3) <0, if the error occurs while setting the lock. + */ +static int +cifs_posix_lock_set(struct file *file, struct file_lock *flock) +{ + struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); + int rc = 1; + + if ((flock->fl_flags & FL_POSIX) == 0) + return rc; + + mutex_lock(&cinode->lock_mutex); + if (!cinode->can_cache_brlcks) { + mutex_unlock(&cinode->lock_mutex); + return rc; + } + rc = posix_lock_file_wait(file, flock); + mutex_unlock(&cinode->lock_mutex); + return rc; +} + +static int +cifs_push_mandatory_locks(struct cifsFileInfo *cfile) +{ + int xid, rc = 0, stored_rc; + struct cifsLockInfo *li, *tmp; struct cifs_tcon *tcon; - __u16 netfid; - __u8 lockType = LOCKING_ANDX_LARGE_FILES; - bool posix_locking = 0; + struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + unsigned int num, max_num; + LOCKING_ANDX_RANGE *buf, *cur; + int types[] = {LOCKING_ANDX_LARGE_FILES, + LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES}; + int i; - length = 1 + pfLock->fl_end - pfLock->fl_start; - rc = -EACCES; xid = GetXid(); + tcon = tlink_tcon(cfile->tlink); - cFYI(1, "Lock parm: 0x%x flockflags: " - "0x%x flocktype: 0x%x start: %lld end: %lld", - cmd, pfLock->fl_flags, pfLock->fl_type, pfLock->fl_start, - pfLock->fl_end); + mutex_lock(&cinode->lock_mutex); + if (!cinode->can_cache_brlcks) { + mutex_unlock(&cinode->lock_mutex); + FreeXid(xid); + return rc; + } - if (pfLock->fl_flags & FL_POSIX) + max_num = (tcon->ses->server->maxBuf - sizeof(struct smb_hdr)) / + sizeof(LOCKING_ANDX_RANGE); + buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); + if (!buf) { + mutex_unlock(&cinode->lock_mutex); + FreeXid(xid); + return rc; + } + + for (i = 0; i < 2; i++) { + cur = buf; + num = 0; + list_for_each_entry_safe(li, tmp, &cinode->llist, llist) { + if (li->type != types[i]) + continue; + cur->Pid = cpu_to_le16(li->pid); + cur->LengthLow = cpu_to_le32((u32)li->length); + cur->LengthHigh = cpu_to_le32((u32)(li->length>>32)); + cur->OffsetLow = cpu_to_le32((u32)li->offset); + cur->OffsetHigh = cpu_to_le32((u32)(li->offset>>32)); + if (++num == max_num) { + stored_rc = cifs_lockv(xid, tcon, cfile->netfid, + li->type, 0, num, buf); + if (stored_rc) + rc = stored_rc; + cur = buf; + num = 0; + } else + cur++; + } + + if (num) { + stored_rc = cifs_lockv(xid, tcon, cfile->netfid, + types[i], 0, num, buf); + if (stored_rc) + rc = stored_rc; + } + } + + cinode->can_cache_brlcks = false; + mutex_unlock(&cinode->lock_mutex); + + kfree(buf); + FreeXid(xid); + return rc; +} + +/* copied from fs/locks.c with a name change */ +#define cifs_for_each_lock(inode, lockp) \ + for (lockp = &inode->i_flock; *lockp != NULL; \ + lockp = &(*lockp)->fl_next) + +static int +cifs_push_posix_locks(struct cifsFileInfo *cfile) +{ + struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct file_lock *flock, **before; + struct cifsLockInfo *lck, *tmp; + int rc = 0, xid, type; + __u64 length; + struct list_head locks_to_send; + + xid = GetXid(); + + mutex_lock(&cinode->lock_mutex); + if (!cinode->can_cache_brlcks) { + mutex_unlock(&cinode->lock_mutex); + FreeXid(xid); + return rc; + } + + INIT_LIST_HEAD(&locks_to_send); + + lock_flocks(); + cifs_for_each_lock(cfile->dentry->d_inode, before) { + flock = *before; + length = 1 + flock->fl_end - flock->fl_start; + if (flock->fl_type == F_RDLCK || flock->fl_type == F_SHLCK) + type = CIFS_RDLCK; + else + type = CIFS_WRLCK; + + lck = cifs_lock_init(flock->fl_start, length, type, + cfile->netfid); + if (!lck) { + rc = -ENOMEM; + goto send_locks; + } + lck->pid = flock->fl_pid; + + list_add_tail(&lck->llist, &locks_to_send); + } + +send_locks: + unlock_flocks(); + + list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) { + struct file_lock tmp_lock; + int stored_rc; + + tmp_lock.fl_start = lck->offset; + stored_rc = CIFSSMBPosixLock(xid, tcon, lck->netfid, lck->pid, + 0, lck->length, &tmp_lock, + lck->type, 0); + if (stored_rc) + rc = stored_rc; + list_del(&lck->llist); + kfree(lck); + } + + cinode->can_cache_brlcks = false; + mutex_unlock(&cinode->lock_mutex); + + FreeXid(xid); + return rc; +} + +static int +cifs_push_locks(struct cifsFileInfo *cfile) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + + if ((tcon->ses->capabilities & CAP_UNIX) && + (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && + ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) + return cifs_push_posix_locks(cfile); + + return cifs_push_mandatory_locks(cfile); +} + +static void +cifs_read_flock(struct file_lock *flock, __u8 *type, int *lock, int *unlock, + bool *wait_flag) +{ + if (flock->fl_flags & FL_POSIX) cFYI(1, "Posix"); - if (pfLock->fl_flags & FL_FLOCK) + if (flock->fl_flags & FL_FLOCK) cFYI(1, "Flock"); - if (pfLock->fl_flags & FL_SLEEP) { + if (flock->fl_flags & FL_SLEEP) { cFYI(1, "Blocking lock"); - wait_flag = true; + *wait_flag = true; } - if (pfLock->fl_flags & FL_ACCESS) + if (flock->fl_flags & FL_ACCESS) cFYI(1, "Process suspended by mandatory locking - " - "not implemented yet"); - if (pfLock->fl_flags & FL_LEASE) + "not implemented yet"); + if (flock->fl_flags & FL_LEASE) cFYI(1, "Lease on file - not implemented yet"); - if (pfLock->fl_flags & + if (flock->fl_flags & (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE))) - cFYI(1, "Unknown lock flags 0x%x", pfLock->fl_flags); + cFYI(1, "Unknown lock flags 0x%x", flock->fl_flags); - if (pfLock->fl_type == F_WRLCK) { + *type = LOCKING_ANDX_LARGE_FILES; + if (flock->fl_type == F_WRLCK) { cFYI(1, "F_WRLCK "); - numLock = 1; - } else if (pfLock->fl_type == F_UNLCK) { + *lock = 1; + } else if (flock->fl_type == F_UNLCK) { cFYI(1, "F_UNLCK"); - numUnlock = 1; - /* Check if unlock includes more than - one lock range */ - } else if (pfLock->fl_type == F_RDLCK) { + *unlock = 1; + /* Check if unlock includes more than one lock range */ + } else if (flock->fl_type == F_RDLCK) { cFYI(1, "F_RDLCK"); - lockType |= LOCKING_ANDX_SHARED_LOCK; - numLock = 1; - } else if (pfLock->fl_type == F_EXLCK) { + *type |= LOCKING_ANDX_SHARED_LOCK; + *lock = 1; + } else if (flock->fl_type == F_EXLCK) { cFYI(1, "F_EXLCK"); - numLock = 1; - } else if (pfLock->fl_type == F_SHLCK) { + *lock = 1; + } else if (flock->fl_type == F_SHLCK) { cFYI(1, "F_SHLCK"); - lockType |= LOCKING_ANDX_SHARED_LOCK; - numLock = 1; + *type |= LOCKING_ANDX_SHARED_LOCK; + *lock = 1; } else cFYI(1, "Unknown type of lock"); +} - cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink); - netfid = ((struct cifsFileInfo *)file->private_data)->netfid; +static int +cifs_getlk(struct file *file, struct file_lock *flock, __u8 type, + bool wait_flag, bool posix_lck, int xid) +{ + int rc = 0; + __u64 length = 1 + flock->fl_end - flock->fl_start; + struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + __u16 netfid = cfile->netfid; - if ((tcon->ses->capabilities & CAP_UNIX) && - (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && - ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) - posix_locking = 1; - /* BB add code here to normalize offset and length to - account for negative length which we can not accept over the - wire */ - if (IS_GETLK(cmd)) { - if (posix_locking) { - int posix_lock_type; - if (lockType & LOCKING_ANDX_SHARED_LOCK) - posix_lock_type = CIFS_RDLCK; - else - posix_lock_type = CIFS_WRLCK; - rc = CIFSSMBPosixLock(xid, tcon, netfid, 1 /* get */, - length, pfLock, posix_lock_type, - wait_flag); - FreeXid(xid); + if (posix_lck) { + int posix_lock_type; + + rc = cifs_posix_lock_test(file, flock); + if (!rc) return rc; - } - /* BB we could chain these into one lock request BB */ - rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start, - 0, 1, lockType, 0 /* wait flag */, 0); - if (rc == 0) { - rc = CIFSSMBLock(xid, tcon, netfid, length, - pfLock->fl_start, 1 /* numUnlock */ , - 0 /* numLock */ , lockType, - 0 /* wait flag */, 0); - pfLock->fl_type = F_UNLCK; - if (rc != 0) - cERROR(1, "Error unlocking previously locked " - "range %d during test of lock", rc); - rc = 0; + if (type & LOCKING_ANDX_SHARED_LOCK) + posix_lock_type = CIFS_RDLCK; + else + posix_lock_type = CIFS_WRLCK; + rc = CIFSSMBPosixLock(xid, tcon, netfid, current->tgid, + 1 /* get */, length, flock, + posix_lock_type, wait_flag); + return rc; + } - } else { - /* if rc == ERR_SHARING_VIOLATION ? */ - rc = 0; + rc = cifs_lock_test(cinode, flock->fl_start, length, type, netfid, + flock); + if (!rc) + return rc; + + /* BB we could chain these into one lock request BB */ + rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, + flock->fl_start, 0, 1, type, 0, 0); + if (rc == 0) { + rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, + length, flock->fl_start, 1, 0, + type, 0, 0); + flock->fl_type = F_UNLCK; + if (rc != 0) + cERROR(1, "Error unlocking previously locked " + "range %d during test of lock", rc); + return 0; + } + + if (type & LOCKING_ANDX_SHARED_LOCK) { + flock->fl_type = F_WRLCK; + return 0; + } + + rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, + flock->fl_start, 0, 1, + type | LOCKING_ANDX_SHARED_LOCK, 0, 0); + if (rc == 0) { + rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, + length, flock->fl_start, 1, 0, + type | LOCKING_ANDX_SHARED_LOCK, + 0, 0); + flock->fl_type = F_RDLCK; + if (rc != 0) + cERROR(1, "Error unlocking previously locked " + "range %d during test of lock", rc); + } else + flock->fl_type = F_WRLCK; + + return 0; +} - if (lockType & LOCKING_ANDX_SHARED_LOCK) { - pfLock->fl_type = F_WRLCK; +static void +cifs_move_llist(struct list_head *source, struct list_head *dest) +{ + struct list_head *li, *tmp; + list_for_each_safe(li, tmp, source) + list_move(li, dest); +} + +static void +cifs_free_llist(struct list_head *llist) +{ + struct cifsLockInfo *li, *tmp; + list_for_each_entry_safe(li, tmp, llist, llist) { + cifs_del_lock_waiters(li); + list_del(&li->llist); + kfree(li); + } +} + +static int +cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid) +{ + int rc = 0, stored_rc; + int types[] = {LOCKING_ANDX_LARGE_FILES, + LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES}; + unsigned int i; + unsigned int max_num, num; + LOCKING_ANDX_RANGE *buf, *cur; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsLockInfo *li, *tmp; + __u64 length = 1 + flock->fl_end - flock->fl_start; + struct list_head tmp_llist; + + INIT_LIST_HEAD(&tmp_llist); + + max_num = (tcon->ses->server->maxBuf - sizeof(struct smb_hdr)) / + sizeof(LOCKING_ANDX_RANGE); + buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + mutex_lock(&cinode->lock_mutex); + for (i = 0; i < 2; i++) { + cur = buf; + num = 0; + list_for_each_entry_safe(li, tmp, &cinode->llist, llist) { + if (flock->fl_start > li->offset || + (flock->fl_start + length) < + (li->offset + li->length)) + continue; + if (current->tgid != li->pid) + continue; + if (cfile->netfid != li->netfid) + continue; + if (types[i] != li->type) + continue; + if (!cinode->can_cache_brlcks) { + cur->Pid = cpu_to_le16(li->pid); + cur->LengthLow = cpu_to_le32((u32)li->length); + cur->LengthHigh = + cpu_to_le32((u32)(li->length>>32)); + cur->OffsetLow = cpu_to_le32((u32)li->offset); + cur->OffsetHigh = + cpu_to_le32((u32)(li->offset>>32)); + /* + * We need to save a lock here to let us add + * it again to the inode list if the unlock + * range request fails on the server. + */ + list_move(&li->llist, &tmp_llist); + if (++num == max_num) { + stored_rc = cifs_lockv(xid, tcon, + cfile->netfid, + li->type, num, + 0, buf); + if (stored_rc) { + /* + * We failed on the unlock range + * request - add all locks from + * the tmp list to the head of + * the inode list. + */ + cifs_move_llist(&tmp_llist, + &cinode->llist); + rc = stored_rc; + } else + /* + * The unlock range request + * succeed - free the tmp list. + */ + cifs_free_llist(&tmp_llist); + cur = buf; + num = 0; + } else + cur++; } else { - rc = CIFSSMBLock(xid, tcon, netfid, length, - pfLock->fl_start, 0, 1, - lockType | LOCKING_ANDX_SHARED_LOCK, - 0 /* wait flag */, 0); - if (rc == 0) { - rc = CIFSSMBLock(xid, tcon, netfid, - length, pfLock->fl_start, 1, 0, - lockType | - LOCKING_ANDX_SHARED_LOCK, - 0 /* wait flag */, 0); - pfLock->fl_type = F_RDLCK; - if (rc != 0) - cERROR(1, "Error unlocking " - "previously locked range %d " - "during test of lock", rc); - rc = 0; - } else { - pfLock->fl_type = F_WRLCK; - rc = 0; - } + /* + * We can cache brlock requests - simply remove + * a lock from the inode list. + */ + list_del(&li->llist); + cifs_del_lock_waiters(li); + kfree(li); } } - - FreeXid(xid); - return rc; + if (num) { + stored_rc = cifs_lockv(xid, tcon, cfile->netfid, + types[i], num, 0, buf); + if (stored_rc) { + cifs_move_llist(&tmp_llist, &cinode->llist); + rc = stored_rc; + } else + cifs_free_llist(&tmp_llist); + } } - if (!numLock && !numUnlock) { - /* if no lock or unlock then nothing - to do since we do not know what it is */ - FreeXid(xid); - return -EOPNOTSUPP; - } + mutex_unlock(&cinode->lock_mutex); + kfree(buf); + return rc; +} + +static int +cifs_setlk(struct file *file, struct file_lock *flock, __u8 type, + bool wait_flag, bool posix_lck, int lock, int unlock, int xid) +{ + int rc = 0; + __u64 length = 1 + flock->fl_end - flock->fl_start; + struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); + __u16 netfid = cfile->netfid; - if (posix_locking) { + if (posix_lck) { int posix_lock_type; - if (lockType & LOCKING_ANDX_SHARED_LOCK) + + rc = cifs_posix_lock_set(file, flock); + if (!rc || rc < 0) + return rc; + + if (type & LOCKING_ANDX_SHARED_LOCK) posix_lock_type = CIFS_RDLCK; else posix_lock_type = CIFS_WRLCK; - if (numUnlock == 1) + if (unlock == 1) posix_lock_type = CIFS_UNLCK; - rc = CIFSSMBPosixLock(xid, tcon, netfid, 0 /* set */, - length, pfLock, posix_lock_type, - wait_flag); - } else { - struct cifsFileInfo *fid = file->private_data; + rc = CIFSSMBPosixLock(xid, tcon, netfid, current->tgid, + 0 /* set */, length, flock, + posix_lock_type, wait_flag); + goto out; + } - if (numLock) { - rc = CIFSSMBLock(xid, tcon, netfid, length, - pfLock->fl_start, 0, numLock, lockType, - wait_flag, 0); + if (lock) { + struct cifsLockInfo *lock; - if (rc == 0) { - /* For Windows locks we must store them. */ - rc = store_file_lock(fid, length, - pfLock->fl_start, lockType); - } - } else if (numUnlock) { - /* For each stored lock that this unlock overlaps - completely, unlock it. */ - int stored_rc = 0; - struct cifsLockInfo *li, *tmp; + lock = cifs_lock_init(flock->fl_start, length, type, netfid); + if (!lock) + return -ENOMEM; - rc = 0; - mutex_lock(&fid->lock_mutex); - list_for_each_entry_safe(li, tmp, &fid->llist, llist) { - if (pfLock->fl_start <= li->offset && - (pfLock->fl_start + length) >= - (li->offset + li->length)) { - stored_rc = CIFSSMBLock(xid, tcon, - netfid, li->length, - li->offset, 1, 0, - li->type, false, 0); - if (stored_rc) - rc = stored_rc; - else { - list_del(&li->llist); - kfree(li); - } - } - } - mutex_unlock(&fid->lock_mutex); + rc = cifs_lock_add_if(cinode, lock, wait_flag); + if (rc < 0) + kfree(lock); + if (rc <= 0) + goto out; + + rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, + flock->fl_start, 0, 1, type, wait_flag, 0); + if (rc) { + kfree(lock); + goto out; } + + cifs_lock_add(cinode, lock); + } else if (unlock) + rc = cifs_unlock_range(cfile, flock, xid); + +out: + if (flock->fl_flags & FL_POSIX) + posix_lock_file_wait(file, flock); + return rc; +} + +int cifs_lock(struct file *file, int cmd, struct file_lock *flock) +{ + int rc, xid; + int lock = 0, unlock = 0; + bool wait_flag = false; + bool posix_lck = false; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; + struct cifsInodeInfo *cinode; + struct cifsFileInfo *cfile; + __u16 netfid; + __u8 type; + + rc = -EACCES; + xid = GetXid(); + + cFYI(1, "Lock parm: 0x%x flockflags: 0x%x flocktype: 0x%x start: %lld " + "end: %lld", cmd, flock->fl_flags, flock->fl_type, + flock->fl_start, flock->fl_end); + + cifs_read_flock(flock, &type, &lock, &unlock, &wait_flag); + + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + cfile = (struct cifsFileInfo *)file->private_data; + tcon = tlink_tcon(cfile->tlink); + netfid = cfile->netfid; + cinode = CIFS_I(file->f_path.dentry->d_inode); + + if ((tcon->ses->capabilities & CAP_UNIX) && + (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && + ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) + posix_lck = true; + /* + * BB add code here to normalize offset and length to account for + * negative length which we can not accept over the wire. + */ + if (IS_GETLK(cmd)) { + rc = cifs_getlk(file, flock, type, wait_flag, posix_lck, xid); + FreeXid(xid); + return rc; } - if (pfLock->fl_flags & FL_POSIX) - posix_lock_file_wait(file, pfLock); + if (!lock && !unlock) { + /* + * if no lock or unlock then nothing to do since we do not + * know what it is + */ + FreeXid(xid); + return -EOPNOTSUPP; + } + + rc = cifs_setlk(file, flock, type, wait_flag, posix_lck, lock, unlock, + xid); FreeXid(xid); return rc; } @@ -1714,6 +2221,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, struct smb_com_read_rsp *pSMBr; struct cifs_io_parms io_parms; char *read_data; + unsigned int rsize; __u32 pid; if (!nr_segs) @@ -1726,6 +2234,9 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, xid = GetXid(); cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + /* FIXME: set up handlers for larger reads and/or convert to async */ + rsize = min_t(unsigned int, cifs_sb->rsize, CIFSMaxBufSize); + open_file = file->private_data; pTcon = tlink_tcon(open_file->tlink); @@ -1738,7 +2249,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, cFYI(1, "attempting read on write only file instance"); for (total_read = 0; total_read < len; total_read += bytes_read) { - cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize); + cur_len = min_t(const size_t, len - total_read, rsize); rc = -EAGAIN; read_data = NULL; @@ -1830,6 +2341,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, unsigned int bytes_read = 0; unsigned int total_read; unsigned int current_read_size; + unsigned int rsize; struct cifs_sb_info *cifs_sb; struct cifs_tcon *pTcon; int xid; @@ -1842,6 +2354,9 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, xid = GetXid(); cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + /* FIXME: set up handlers for larger reads and/or convert to async */ + rsize = min_t(unsigned int, cifs_sb->rsize, CIFSMaxBufSize); + if (file->private_data == NULL) { rc = -EBADF; FreeXid(xid); @@ -1861,14 +2376,14 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, for (total_read = 0, current_offset = read_data; read_size > total_read; total_read += bytes_read, current_offset += bytes_read) { - current_read_size = min_t(const int, read_size - total_read, - cifs_sb->rsize); + current_read_size = min_t(uint, read_size - total_read, rsize); + /* For windows me and 9x we do not want to request more than it negotiated since it will refuse the read then */ if ((pTcon->ses) && !(pTcon->ses->capabilities & CAP_LARGE_FILES)) { - current_read_size = min_t(const int, current_read_size, - pTcon->ses->server->maxBuf - 128); + current_read_size = min_t(uint, current_read_size, + CIFSMaxBufSize); } rc = -EAGAIN; while (rc == -EAGAIN) { @@ -1957,82 +2472,24 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma) return rc; } - -static void cifs_copy_cache_pages(struct address_space *mapping, - struct list_head *pages, int bytes_read, char *data) -{ - struct page *page; - char *target; - - while (bytes_read > 0) { - if (list_empty(pages)) - break; - - page = list_entry(pages->prev, struct page, lru); - list_del(&page->lru); - - if (add_to_page_cache_lru(page, mapping, page->index, - GFP_KERNEL)) { - page_cache_release(page); - cFYI(1, "Add page cache failed"); - data += PAGE_CACHE_SIZE; - bytes_read -= PAGE_CACHE_SIZE; - continue; - } - page_cache_release(page); - - target = kmap_atomic(page, KM_USER0); - - if (PAGE_CACHE_SIZE > bytes_read) { - memcpy(target, data, bytes_read); - /* zero the tail end of this partial page */ - memset(target + bytes_read, 0, - PAGE_CACHE_SIZE - bytes_read); - bytes_read = 0; - } else { - memcpy(target, data, PAGE_CACHE_SIZE); - bytes_read -= PAGE_CACHE_SIZE; - } - kunmap_atomic(target, KM_USER0); - - flush_dcache_page(page); - SetPageUptodate(page); - unlock_page(page); - data += PAGE_CACHE_SIZE; - - /* add page to FS-Cache */ - cifs_readpage_to_fscache(mapping->host, page); - } - return; -} - static int cifs_readpages(struct file *file, struct address_space *mapping, struct list_head *page_list, unsigned num_pages) { - int rc = -EACCES; - int xid; - loff_t offset; - struct page *page; - struct cifs_sb_info *cifs_sb; - struct cifs_tcon *pTcon; - unsigned int bytes_read = 0; - unsigned int read_size, i; - char *smb_read_data = NULL; - struct smb_com_read_rsp *pSMBr; - struct cifsFileInfo *open_file; - struct cifs_io_parms io_parms; - int buf_type = CIFS_NO_BUFFER; - __u32 pid; + int rc; + struct list_head tmplist; + struct cifsFileInfo *open_file = file->private_data; + struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + unsigned int rsize = cifs_sb->rsize; + pid_t pid; - xid = GetXid(); - if (file->private_data == NULL) { - rc = -EBADF; - FreeXid(xid); - return rc; - } - open_file = file->private_data; - cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - pTcon = tlink_tcon(open_file->tlink); + /* + * Give up immediately if rsize is too small to read an entire page. + * The VFS will fall back to readpage. We should never reach this + * point however since we set ra_pages to 0 when the rsize is smaller + * than a cache page. + */ + if (unlikely(rsize < PAGE_CACHE_SIZE)) + return 0; /* * Reads as many pages as possible from fscache. Returns -ENOBUFS @@ -2041,125 +2498,127 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list, &num_pages); if (rc == 0) - goto read_complete; + return rc; - cFYI(DBG2, "rpages: num pages %d", num_pages); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; else pid = current->tgid; - for (i = 0; i < num_pages; ) { - unsigned contig_pages; - struct page *tmp_page; - unsigned long expected_index; + rc = 0; + INIT_LIST_HEAD(&tmplist); - if (list_empty(page_list)) - break; + cFYI(1, "%s: file=%p mapping=%p num_pages=%u", __func__, file, + mapping, num_pages); + + /* + * Start with the page at end of list and move it to private + * list. Do the same with any following pages until we hit + * the rsize limit, hit an index discontinuity, or run out of + * pages. Issue the async read and then start the loop again + * until the list is empty. + * + * Note that list order is important. The page_list is in + * the order of declining indexes. When we put the pages in + * the rdata->pages, then we want them in increasing order. + */ + while (!list_empty(page_list)) { + unsigned int bytes = PAGE_CACHE_SIZE; + unsigned int expected_index; + unsigned int nr_pages = 1; + loff_t offset; + struct page *page, *tpage; + struct cifs_readdata *rdata; page = list_entry(page_list->prev, struct page, lru); + + /* + * Lock the page and put it in the cache. Since no one else + * should have access to this page, we're safe to simply set + * PG_locked without checking it first. + */ + __set_page_locked(page); + rc = add_to_page_cache_locked(page, mapping, + page->index, GFP_KERNEL); + + /* give up if we can't stick it in the cache */ + if (rc) { + __clear_page_locked(page); + break; + } + + /* move first page to the tmplist */ offset = (loff_t)page->index << PAGE_CACHE_SHIFT; + list_move_tail(&page->lru, &tmplist); - /* count adjacent pages that we will read into */ - contig_pages = 0; - expected_index = - list_entry(page_list->prev, struct page, lru)->index; - list_for_each_entry_reverse(tmp_page, page_list, lru) { - if (tmp_page->index == expected_index) { - contig_pages++; - expected_index++; - } else + /* now try and add more pages onto the request */ + expected_index = page->index + 1; + list_for_each_entry_safe_reverse(page, tpage, page_list, lru) { + /* discontinuity ? */ + if (page->index != expected_index) break; + + /* would this page push the read over the rsize? */ + if (bytes + PAGE_CACHE_SIZE > rsize) + break; + + __set_page_locked(page); + if (add_to_page_cache_locked(page, mapping, + page->index, GFP_KERNEL)) { + __clear_page_locked(page); + break; + } + list_move_tail(&page->lru, &tmplist); + bytes += PAGE_CACHE_SIZE; + expected_index++; + nr_pages++; } - if (contig_pages + i > num_pages) - contig_pages = num_pages - i; - - /* for reads over a certain size could initiate async - read ahead */ - - read_size = contig_pages * PAGE_CACHE_SIZE; - /* Read size needs to be in multiples of one page */ - read_size = min_t(const unsigned int, read_size, - cifs_sb->rsize & PAGE_CACHE_MASK); - cFYI(DBG2, "rpages: read size 0x%x contiguous pages %d", - read_size, contig_pages); - rc = -EAGAIN; - while (rc == -EAGAIN) { + + rdata = cifs_readdata_alloc(nr_pages); + if (!rdata) { + /* best to give up if we're out of mem */ + list_for_each_entry_safe(page, tpage, &tmplist, lru) { + list_del(&page->lru); + lru_cache_add_file(page); + unlock_page(page); + page_cache_release(page); + } + rc = -ENOMEM; + break; + } + + spin_lock(&cifs_file_list_lock); + cifsFileInfo_get(open_file); + spin_unlock(&cifs_file_list_lock); + rdata->cfile = open_file; + rdata->mapping = mapping; + rdata->offset = offset; + rdata->bytes = bytes; + rdata->pid = pid; + list_splice_init(&tmplist, &rdata->pages); + + do { if (open_file->invalidHandle) { rc = cifs_reopen_file(open_file, true); if (rc != 0) - break; + continue; } - io_parms.netfid = open_file->netfid; - io_parms.pid = pid; - io_parms.tcon = pTcon; - io_parms.offset = offset; - io_parms.length = read_size; - rc = CIFSSMBRead(xid, &io_parms, &bytes_read, - &smb_read_data, &buf_type); - /* BB more RC checks ? */ - if (rc == -EAGAIN) { - if (smb_read_data) { - if (buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(smb_read_data); - else if (buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(smb_read_data); - smb_read_data = NULL; - } - } - } - if ((rc < 0) || (smb_read_data == NULL)) { - cFYI(1, "Read error in readpages: %d", rc); - break; - } else if (bytes_read > 0) { - task_io_account_read(bytes_read); - pSMBr = (struct smb_com_read_rsp *)smb_read_data; - cifs_copy_cache_pages(mapping, page_list, bytes_read, - smb_read_data + 4 /* RFC1001 hdr */ + - le16_to_cpu(pSMBr->DataOffset)); - - i += bytes_read >> PAGE_CACHE_SHIFT; - cifs_stats_bytes_read(pTcon, bytes_read); - if ((bytes_read & PAGE_CACHE_MASK) != bytes_read) { - i++; /* account for partial page */ - - /* server copy of file can have smaller size - than client */ - /* BB do we need to verify this common case ? - this case is ok - if we are at server EOF - we will hit it on next read */ + rc = cifs_async_readv(rdata); + } while (rc == -EAGAIN); - /* break; */ + if (rc != 0) { + list_for_each_entry_safe(page, tpage, &rdata->pages, + lru) { + list_del(&page->lru); + lru_cache_add_file(page); + unlock_page(page); + page_cache_release(page); } - } else { - cFYI(1, "No bytes read (%d) at offset %lld . " - "Cleaning remaining pages from readahead list", - bytes_read, offset); - /* BB turn off caching and do new lookup on - file size at server? */ + cifs_readdata_free(rdata); break; } - if (smb_read_data) { - if (buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(smb_read_data); - else if (buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(smb_read_data); - smb_read_data = NULL; - } - bytes_read = 0; } -/* need to free smb_read_data buf before exit */ - if (smb_read_data) { - if (buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(smb_read_data); - else if (buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(smb_read_data); - smb_read_data = NULL; - } - -read_complete: - FreeXid(xid); return rc; } @@ -2408,6 +2867,10 @@ void cifs_oplock_break(struct work_struct *work) cFYI(1, "Oplock flush inode %p rc %d", inode, rc); } + rc = cifs_push_locks(cfile); + if (rc) + cERROR(1, "Push locks rc = %d", rc); + /* * releasing stale oplock after recent reconnect of smb session using * a now incorrect file handle is not a data integrity issue but do @@ -2415,8 +2878,9 @@ void cifs_oplock_break(struct work_struct *work) * disconnected since oplock already released by the server */ if (!cfile->oplock_break_cancelled) { - rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 0, - 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false, + rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, + current->tgid, 0, 0, 0, 0, + LOCKING_ANDX_OPLOCK_RELEASE, false, cinode->clientCanCacheRead ? 1 : 0); cFYI(1, "Oplock release rc = %d", rc); } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a7b2dcd4a53..e851d5b8931 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -132,7 +132,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) inode->i_mtime = fattr->cf_mtime; inode->i_ctime = fattr->cf_ctime; inode->i_rdev = fattr->cf_rdev; - inode->i_nlink = fattr->cf_nlink; + set_nlink(inode, fattr->cf_nlink); inode->i_uid = fattr->cf_uid; inode->i_gid = fattr->cf_gid; @@ -562,7 +562,16 @@ int cifs_get_file_info(struct file *filp) xid = GetXid(); rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data); - if (rc == -EOPNOTSUPP || rc == -EINVAL) { + switch (rc) { + case 0: + cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false); + break; + case -EREMOTE: + cifs_create_dfs_fattr(&fattr, inode->i_sb); + rc = 0; + break; + case -EOPNOTSUPP: + case -EINVAL: /* * FIXME: legacy server -- fall back to path-based call? * for now, just skip revalidating and mark inode for @@ -570,18 +579,14 @@ int cifs_get_file_info(struct file *filp) */ rc = 0; CIFS_I(inode)->time = 0; + default: goto cgfi_exit; - } else if (rc == -EREMOTE) { - cifs_create_dfs_fattr(&fattr, inode->i_sb); - rc = 0; - } else if (rc) - goto cgfi_exit; + } /* * don't bother with SFU junk here -- just mark inode as needing * revalidation. */ - cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false); fattr.cf_uniqueid = CIFS_I(inode)->uniqueid; fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; cifs_fattr_to_inode(inode, &fattr); @@ -900,7 +905,7 @@ struct inode *cifs_root_iget(struct super_block *sb) if (rc && tcon->ipc) { cFYI(1, "ipc connection - fake read inode"); inode->i_mode |= S_IFDIR; - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_op = &cifs_ipc_inode_ops; inode->i_fop = &simple_dir_operations; inode->i_uid = cifs_sb->mnt_uid; @@ -1362,7 +1367,7 @@ mkdir_get_info: /* setting nlink not necessary except in cases where we * failed to get it from the server or was set bogus */ if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2)) - direntry->d_inode->i_nlink = 2; + set_nlink(direntry->d_inode, 2); mode &= ~current_umask(); /* must turn on setgid bit if parent dir has it */ @@ -2096,6 +2101,8 @@ static int cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) { int xid; + uid_t uid = NO_CHANGE_32; + gid_t gid = NO_CHANGE_32; struct inode *inode = direntry->d_inode; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsInodeInfo *cifsInode = CIFS_I(inode); @@ -2146,13 +2153,25 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) goto cifs_setattr_exit; } - /* - * Without unix extensions we can't send ownership changes to the - * server, so silently ignore them. This is consistent with how - * local DOS/Windows filesystems behave (VFAT, NTFS, etc). With - * CIFSACL support + proper Windows to Unix idmapping, we may be - * able to support this in the future. - */ + if (attrs->ia_valid & ATTR_UID) + uid = attrs->ia_uid; + + if (attrs->ia_valid & ATTR_GID) + gid = attrs->ia_gid; + +#ifdef CONFIG_CIFS_ACL + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { + if (uid != NO_CHANGE_32 || gid != NO_CHANGE_32) { + rc = id_mode_to_cifs_acl(inode, full_path, NO_CHANGE_64, + uid, gid); + if (rc) { + cFYI(1, "%s: Setting id failed with error: %d", + __func__, rc); + goto cifs_setattr_exit; + } + } + } else +#endif /* CONFIG_CIFS_ACL */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) attrs->ia_valid &= ~(ATTR_UID | ATTR_GID); @@ -2161,15 +2180,12 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) attrs->ia_valid &= ~ATTR_MODE; if (attrs->ia_valid & ATTR_MODE) { - cFYI(1, "Mode changed to 0%o", attrs->ia_mode); mode = attrs->ia_mode; - } - - if (attrs->ia_valid & ATTR_MODE) { rc = 0; #ifdef CONFIG_CIFS_ACL if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { - rc = mode_to_cifs_acl(inode, full_path, mode); + rc = id_mode_to_cifs_acl(inode, full_path, mode, + NO_CHANGE_32, NO_CHANGE_32); if (rc) { cFYI(1, "%s: Setting ACL failed with error: %d", __func__, rc); diff --git a/fs/cifs/link.c b/fs/cifs/link.c index db3f18cdf02..6b0e0643439 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -183,14 +183,20 @@ CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str) static int CIFSCreateMFSymLink(const int xid, struct cifs_tcon *tcon, const char *fromName, const char *toName, - const struct nls_table *nls_codepage, int remap) + struct cifs_sb_info *cifs_sb) { int rc; int oplock = 0; + int remap; + int create_options = CREATE_NOT_DIR; __u16 netfid = 0; u8 *buf; unsigned int bytes_written = 0; struct cifs_io_parms io_parms; + struct nls_table *nls_codepage; + + nls_codepage = cifs_sb->local_nls; + remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); if (!buf) @@ -202,8 +208,11 @@ CIFSCreateMFSymLink(const int xid, struct cifs_tcon *tcon, return rc; } + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + rc = CIFSSMBOpen(xid, tcon, fromName, FILE_CREATE, GENERIC_WRITE, - CREATE_NOT_DIR, &netfid, &oplock, NULL, + create_options, &netfid, &oplock, NULL, nls_codepage, remap); if (rc != 0) { kfree(buf); @@ -424,7 +433,7 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode, if (old_file->d_inode) { cifsInode = CIFS_I(old_file->d_inode); if (rc == 0) { - old_file->d_inode->i_nlink++; + inc_nlink(old_file->d_inode); /* BB should we make this contingent on superblock flag NOATIME? */ /* old_file->d_inode->i_ctime = CURRENT_TIME;*/ /* parent dir timestamps will update from srv @@ -559,9 +568,7 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname) /* BB what if DFS and this volume is on different share? BB */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) rc = CIFSCreateMFSymLink(xid, pTcon, full_path, symname, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_sb); else if (pTcon->unix_ext) rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname, cifs_sb->local_nls); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 7c169339259..703ef5c6fdb 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -420,19 +420,22 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid) } int -checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length) +checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int total_read) { - __u32 len = be32_to_cpu(smb->smb_buf_length); + __u32 rfclen = be32_to_cpu(smb->smb_buf_length); __u32 clc_len; /* calculated length */ - cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len); + cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", + total_read, rfclen); - if (length < 2 + sizeof(struct smb_hdr)) { - if ((length >= sizeof(struct smb_hdr) - 1) + /* is this frame too small to even get to a BCC? */ + if (total_read < 2 + sizeof(struct smb_hdr)) { + if ((total_read >= sizeof(struct smb_hdr) - 1) && (smb->Status.CifsError != 0)) { + /* it's an error return */ smb->WordCount = 0; /* some error cases do not return wct and bcc */ return 0; - } else if ((length == sizeof(struct smb_hdr) + 1) && + } else if ((total_read == sizeof(struct smb_hdr) + 1) && (smb->WordCount == 0)) { char *tmp = (char *)smb; /* Need to work around a bug in two servers here */ @@ -452,39 +455,35 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length) } else { cERROR(1, "Length less than smb header size"); } - return 1; - } - if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { - cERROR(1, "smb length greater than MaxBufSize, mid=%d", - smb->Mid); - return 1; + return -EIO; } + /* otherwise, there is enough to get to the BCC */ if (check_smb_hdr(smb, mid)) - return 1; + return -EIO; clc_len = smbCalcSize(smb); - if (4 + len != length) { + if (4 + rfclen != total_read) { cERROR(1, "Length read does not match RFC1001 length %d", - len); - return 1; + rfclen); + return -EIO; } - if (4 + len != clc_len) { + if (4 + rfclen != clc_len) { /* check if bcc wrapped around for large read responses */ - if ((len > 64 * 1024) && (len > clc_len)) { + if ((rfclen > 64 * 1024) && (rfclen > clc_len)) { /* check if lengths match mod 64K */ - if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF)) + if (((4 + rfclen) & 0xFFFF) == (clc_len & 0xFFFF)) return 0; /* bcc wrapped */ } cFYI(1, "Calculated size %u vs length %u mismatch for mid=%u", - clc_len, 4 + len, smb->Mid); + clc_len, 4 + rfclen, smb->Mid); - if (4 + len < clc_len) { + if (4 + rfclen < clc_len) { cERROR(1, "RFC1001 size %u smaller than SMB for mid=%u", - len, smb->Mid); - return 1; - } else if (len > clc_len + 512) { + rfclen, smb->Mid); + return -EIO; + } else if (rfclen > clc_len + 512) { /* * Some servers (Windows XP in particular) send more * data than the lengths in the SMB packet would @@ -495,8 +494,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length) * data to 512 bytes. */ cERROR(1, "RFC1001 size %u more than 512 bytes larger " - "than SMB for mid=%u", len, smb->Mid); - return 1; + "than SMB for mid=%u", rfclen, smb->Mid); + return -EIO; } } return 0; @@ -676,3 +675,18 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) cinode->clientCanCacheRead = false; } } + +bool +backup_cred(struct cifs_sb_info *cifs_sb) +{ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID) { + if (cifs_sb->mnt_backupuid == current_fsuid()) + return true; + } + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID) { + if (in_group_p(cifs_sb->mnt_backupgid)) + return true; + } + + return false; +} diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 5de03ec2014..a090bbe6ee2 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -554,7 +554,10 @@ static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon, rc); return rc; } - cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); + /* FindFirst/Next set last_entry to NULL on malformed reply */ + if (cifsFile->srch_inf.last_entry) + cifs_save_resume_key(cifsFile->srch_inf.last_entry, + cifsFile); } while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) && @@ -562,7 +565,10 @@ static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon, cFYI(1, "calling findnext2"); rc = CIFSFindNext(xid, pTcon, cifsFile->netfid, &cifsFile->srch_inf); - cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); + /* FindFirst/Next set last_entry to NULL on malformed reply */ + if (cifsFile->srch_inf.last_entry) + cifs_save_resume_key(cifsFile->srch_inf.last_entry, + cifsFile); if (rc) return -ENOENT; } diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index d3e619692ee..4ec3ee9d72c 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -124,7 +124,9 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB) /* that we use in next few lines */ /* Note that header is initialized to zero in header_assemble */ pSMB->req.AndXCommand = 0xFF; - pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf); + pSMB->req.MaxBufferSize = cpu_to_le16(min_t(u32, + CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4, + USHRT_MAX)); pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); pSMB->req.VcNumber = get_next_vcnum(ses); @@ -681,7 +683,7 @@ ssetup_ntlmssp_authenticate: cpu_to_le16(CIFS_AUTH_RESP_SIZE); /* calculate ntlm response and session key */ - rc = setup_ntlm_response(ses); + rc = setup_ntlm_response(ses, nls_cp); if (rc) { cERROR(1, "Error %d during NTLM authentication", rc); goto ssetup_exit; diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index 42b9fff4875..80d85088193 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -199,160 +199,36 @@ SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24) return rc; } -/* Routines for Windows NT MD4 Hash functions. */ -static int -_my_wcslen(__u16 *str) -{ - int len = 0; - while (*str++ != 0) - len++; - return len; -} - -/* - * Convert a string into an NT UNICODE string. - * Note that regardless of processor type - * this must be in intel (little-endian) - * format. - */ - -static int -_my_mbstowcs(__u16 *dst, const unsigned char *src, int len) -{ /* BB not a very good conversion routine - change/fix */ - int i; - __u16 val; - - for (i = 0; i < len; i++) { - val = *src; - SSVAL(dst, 0, val); - dst++; - src++; - if (val == 0) - break; - } - return i; -} - /* * Creates the MD4 Hash of the users password in NT UNICODE. */ int -E_md4hash(const unsigned char *passwd, unsigned char *p16) +E_md4hash(const unsigned char *passwd, unsigned char *p16, + const struct nls_table *codepage) { int rc; int len; - __u16 wpwd[129]; + __le16 wpwd[129]; /* Password cannot be longer than 128 characters */ - if (passwd) { - len = strlen((char *) passwd); - if (len > 128) - len = 128; - - /* Password must be converted to NT unicode */ - _my_mbstowcs(wpwd, passwd, len); - } else + if (passwd) /* Password must be converted to NT unicode */ + len = cifs_strtoUCS(wpwd, passwd, 128, codepage); + else { len = 0; + *wpwd = 0; /* Ensure string is null terminated */ + } - wpwd[len] = 0; /* Ensure string is null terminated */ - /* Calculate length in bytes */ - len = _my_wcslen(wpwd) * sizeof(__u16); - - rc = mdfour(p16, (unsigned char *) wpwd, len); - memset(wpwd, 0, 129 * 2); + rc = mdfour(p16, (unsigned char *) wpwd, len * sizeof(__le16)); + memset(wpwd, 0, 129 * sizeof(__le16)); return rc; } -#if 0 /* currently unused */ -/* Does both the NT and LM owfs of a user's password */ -static void -nt_lm_owf_gen(char *pwd, unsigned char nt_p16[16], unsigned char p16[16]) -{ - char passwd[514]; - - memset(passwd, '\0', 514); - if (strlen(pwd) < 513) - strcpy(passwd, pwd); - else - memcpy(passwd, pwd, 512); - /* Calculate the MD4 hash (NT compatible) of the password */ - memset(nt_p16, '\0', 16); - E_md4hash(passwd, nt_p16); - - /* Mangle the passwords into Lanman format */ - passwd[14] = '\0'; -/* strupper(passwd); */ - - /* Calculate the SMB (lanman) hash functions of the password */ - - memset(p16, '\0', 16); - E_P16((unsigned char *) passwd, (unsigned char *) p16); - - /* clear out local copy of user's password (just being paranoid). */ - memset(passwd, '\0', sizeof(passwd)); -} -#endif - -/* Does the NTLMv2 owfs of a user's password */ -#if 0 /* function not needed yet - but will be soon */ -static void -ntv2_owf_gen(const unsigned char owf[16], const char *user_n, - const char *domain_n, unsigned char kr_buf[16], - const struct nls_table *nls_codepage) -{ - wchar_t *user_u; - wchar_t *dom_u; - int user_l, domain_l; - struct HMACMD5Context ctx; - - /* might as well do one alloc to hold both (user_u and dom_u) */ - user_u = kmalloc(2048 * sizeof(wchar_t), GFP_KERNEL); - if (user_u == NULL) - return; - dom_u = user_u + 1024; - - /* push_ucs2(NULL, user_u, user_n, (user_l+1)*2, - STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER); - push_ucs2(NULL, dom_u, domain_n, (domain_l+1)*2, - STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER); */ - - /* BB user and domain may need to be uppercased */ - user_l = cifs_strtoUCS(user_u, user_n, 511, nls_codepage); - domain_l = cifs_strtoUCS(dom_u, domain_n, 511, nls_codepage); - - user_l++; /* trailing null */ - domain_l++; - - hmac_md5_init_limK_to_64(owf, 16, &ctx); - hmac_md5_update((const unsigned char *) user_u, user_l * 2, &ctx); - hmac_md5_update((const unsigned char *) dom_u, domain_l * 2, &ctx); - hmac_md5_final(kr_buf, &ctx); - - kfree(user_u); -} -#endif - -/* Does the des encryption from the FIRST 8 BYTES of the NT or LM MD4 hash. */ -#if 0 /* currently unused */ -static void -NTLMSSPOWFencrypt(unsigned char passwd[8], - unsigned char *ntlmchalresp, unsigned char p24[24]) -{ - unsigned char p21[21]; - - memset(p21, '\0', 21); - memcpy(p21, passwd, 8); - memset(p21 + 8, 0xbd, 8); - - E_P24(p21, ntlmchalresp, p24); -} -#endif - /* Does the NT MD4 hash then des encryption. */ int -SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) +SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24, + const struct nls_table *codepage) { int rc; unsigned char p16[16], p21[21]; @@ -360,7 +236,7 @@ SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) memset(p16, '\0', 16); memset(p21, '\0', 21); - rc = E_md4hash(passwd, p16); + rc = E_md4hash(passwd, p16, codepage); if (rc) { cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); return rc; @@ -369,39 +245,3 @@ SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) rc = E_P24(p21, c8, p24); return rc; } - - -/* Does the md5 encryption from the NT hash for NTLMv2. */ -/* These routines will be needed later */ -#if 0 -static void -SMBOWFencrypt_ntv2(const unsigned char kr[16], - const struct data_blob *srv_chal, - const struct data_blob *cli_chal, unsigned char resp_buf[16]) -{ - struct HMACMD5Context ctx; - - hmac_md5_init_limK_to_64(kr, 16, &ctx); - hmac_md5_update(srv_chal->data, srv_chal->length, &ctx); - hmac_md5_update(cli_chal->data, cli_chal->length, &ctx); - hmac_md5_final(resp_buf, &ctx); -} - -static void -SMBsesskeygen_ntv2(const unsigned char kr[16], - const unsigned char *nt_resp, __u8 sess_key[16]) -{ - struct HMACMD5Context ctx; - - hmac_md5_init_limK_to_64(kr, 16, &ctx); - hmac_md5_update(nt_resp, 16, &ctx); - hmac_md5_final((unsigned char *) sess_key, &ctx); -} - -static void -SMBsesskeygen_ntv1(const unsigned char kr[16], - const unsigned char *nt_resp, __u8 sess_key[16]) -{ - mdfour((unsigned char *) sess_key, (unsigned char *) kr, 16); -} -#endif diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 10ca6b2c26b..0cc9584f588 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -26,6 +26,7 @@ #include <linux/wait.h> #include <linux/net.h> #include <linux/delay.h> +#include <linux/freezer.h> #include <asm/uaccess.h> #include <asm/processor.h> #include <linux/mempool.h> @@ -324,7 +325,7 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ) { int error; - error = wait_event_killable(server->response_q, + error = wait_event_freezekillable(server->response_q, midQ->midState != MID_REQUEST_SUBMITTED); if (error < 0) return -ERESTARTSYS; @@ -339,8 +340,8 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ) */ int cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, - unsigned int nvec, mid_callback_t *callback, void *cbdata, - bool ignore_pend) + unsigned int nvec, mid_receive_t *receive, + mid_callback_t *callback, void *cbdata, bool ignore_pend) { int rc; struct mid_q_entry *mid; @@ -374,6 +375,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, goto out_err; } + mid->receive = receive; mid->callback = callback; mid->callback_data = cbdata; mid->midState = MID_REQUEST_SUBMITTED; @@ -496,13 +498,18 @@ int cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, bool log_error) { - dump_smb(mid->resp_buf, - min_t(u32, 92, be32_to_cpu(mid->resp_buf->smb_buf_length))); + unsigned int len = be32_to_cpu(mid->resp_buf->smb_buf_length) + 4; + + dump_smb(mid->resp_buf, min_t(u32, 92, len)); /* convert the length into a more usable form */ if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { + struct kvec iov; + + iov.iov_base = mid->resp_buf; + iov.iov_len = len; /* FIXME: add code to kill session */ - if (cifs_verify_signature(mid->resp_buf, server, + if (cifs_verify_signature(&iov, 1, server, mid->sequence_number + 1) != 0) cERROR(1, "Unexpected SMB signature"); } diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 2a22fb2989e..45f07c46f3e 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -22,6 +22,7 @@ #include <linux/fs.h> #include <linux/posix_acl_xattr.h> #include <linux/slab.h> +#include <linux/xattr.h> #include "cifsfs.h" #include "cifspdu.h" #include "cifsglob.h" @@ -31,16 +32,8 @@ #define MAX_EA_VALUE_SIZE 65535 #define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib" #define CIFS_XATTR_CIFS_ACL "system.cifs_acl" -#define CIFS_XATTR_USER_PREFIX "user." -#define CIFS_XATTR_SYSTEM_PREFIX "system." -#define CIFS_XATTR_OS2_PREFIX "os2." -#define CIFS_XATTR_SECURITY_PREFIX "security." -#define CIFS_XATTR_TRUSTED_PREFIX "trusted." -#define XATTR_TRUSTED_PREFIX_LEN 8 -#define XATTR_SECURITY_PREFIX_LEN 9 -/* BB need to add server (Samba e.g) support for security and trusted prefix */ - +/* BB need to add server (Samba e.g) support for security and trusted prefix */ int cifs_removexattr(struct dentry *direntry, const char *ea_name) { @@ -76,8 +69,8 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name) } if (ea_name == NULL) { cFYI(1, "Null xattr names not supported"); - } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) - && (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4))) { + } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) + && (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN))) { cFYI(1, "illegal xattr request %s (only user namespace supported)", ea_name); @@ -88,7 +81,7 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto remove_ea_exit; - ea_name += 5; /* skip past user. prefix */ + ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, NULL, (__u16)0, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); @@ -149,21 +142,23 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, if (ea_name == NULL) { cFYI(1, "Null xattr names not supported"); - } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) { + } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) + == 0) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto set_ea_exit; if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) cFYI(1, "attempt to set cifs inode metadata"); - ea_name += 5; /* skip past user. prefix */ + ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, (__u16)value_size, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - } else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) { + } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) + == 0) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto set_ea_exit; - ea_name += 4; /* skip past os2. prefix */ + ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */ rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, (__u16)value_size, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); @@ -178,7 +173,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, #ifdef CONFIG_CIFS_ACL memcpy(pacl, ea_value, value_size); rc = set_cifs_acl(pacl, value_size, - direntry->d_inode, full_path); + direntry->d_inode, full_path, CIFS_ACL_DACL); if (rc == 0) /* force revalidate of the inode */ CIFS_I(direntry->d_inode)->time = 0; kfree(pacl); @@ -269,7 +264,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, /* return alt name if available as pseudo attr */ if (ea_name == NULL) { cFYI(1, "Null xattr names not supported"); - } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) { + } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) + == 0) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto get_ea_exit; @@ -277,15 +273,15 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, cFYI(1, "attempt to query cifs inode metadata"); /* revalidate/getattr then populate from inode */ } /* BB add else when above is implemented */ - ea_name += 5; /* skip past user. prefix */ + ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value, buf_size, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - } else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) { + } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto get_ea_exit; - ea_name += 4; /* skip past os2. prefix */ + ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */ rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value, buf_size, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); @@ -339,10 +335,10 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, cFYI(1, "Query CIFS ACL not supported yet"); #endif /* CONFIG_CIFS_ACL */ } else if (strncmp(ea_name, - CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) { + XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) { cFYI(1, "Trusted xattr namespace not supported yet"); } else if (strncmp(ea_name, - CIFS_XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) { + XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) { cFYI(1, "Security xattr namespace not supported yet"); } else cFYI(1, diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index 2bdbcc11b37..854ace71268 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -104,7 +104,7 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr) if (attr->va_gid != -1) inode->i_gid = (gid_t) attr->va_gid; if (attr->va_nlink != -1) - inode->i_nlink = attr->va_nlink; + set_nlink(inode, attr->va_nlink); if (attr->va_size != -1) inode->i_size = attr->va_size; if (attr->va_size != -1) diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h index 44e17e9c21a..cc0ea9fe5ec 100644 --- a/fs/coda/coda_linux.h +++ b/fs/coda/coda_linux.h @@ -59,12 +59,11 @@ void coda_sysctl_clean(void); #define CODA_ALLOC(ptr, cast, size) do { \ if (size < PAGE_SIZE) \ - ptr = kmalloc((unsigned long) size, GFP_KERNEL); \ + ptr = kzalloc((unsigned long) size, GFP_KERNEL); \ else \ - ptr = (cast)vmalloc((unsigned long) size); \ + ptr = (cast)vzalloc((unsigned long) size); \ if (!ptr) \ printk("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \ - else memset( ptr, 0, size ); \ } while (0) diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 0239433f50c..28e7e135cfa 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -340,7 +340,7 @@ static int coda_rmdir(struct inode *dir, struct dentry *de) if (!error) { /* VFS may delete the child */ if (de->d_inode) - de->d_inode->i_nlink = 0; + clear_nlink(de->d_inode); /* fix the link count of the parent */ coda_dir_drop_nlink(dir); diff --git a/fs/compat.c b/fs/compat.c index 58b1da45989..c98787536bb 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -37,7 +37,6 @@ #include <linux/dirent.h> #include <linux/fsnotify.h> #include <linux/highuid.h> -#include <linux/nfsd/syscall.h> #include <linux/personality.h> #include <linux/rwsem.h> #include <linux/tsacct_kern.h> @@ -247,11 +246,8 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs * __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || __put_user(kbuf->f_frsize, &ubuf->f_frsize) || - __put_user(0, &ubuf->f_spare[0]) || - __put_user(0, &ubuf->f_spare[1]) || - __put_user(0, &ubuf->f_spare[2]) || - __put_user(0, &ubuf->f_spare[3]) || - __put_user(0, &ubuf->f_spare[4])) + __put_user(kbuf->f_flags, &ubuf->f_flags) || + __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare))) return -EFAULT; return 0; } @@ -550,7 +546,7 @@ out: ssize_t compat_rw_copy_check_uvector(int type, const struct compat_iovec __user *uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_pointer, - struct iovec **ret_pointer) + struct iovec **ret_pointer, int check_access) { compat_ssize_t tot_len; struct iovec *iov = *ret_pointer = fast_pointer; @@ -597,7 +593,8 @@ ssize_t compat_rw_copy_check_uvector(int type, } if (len < 0) /* size_t not fitting in compat_ssize_t .. */ goto out; - if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) { + if (check_access && + !access_ok(vrfy_dir(type), compat_ptr(buf), len)) { ret = -EFAULT; goto out; } @@ -1111,7 +1108,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, goto out; tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs, - UIO_FASTIOV, iovstack, &iov); + UIO_FASTIOV, iovstack, &iov, 1); if (tot_len == 0) { ret = 0; goto out; diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index c83f4768eea..9d8715c45f2 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -23,7 +23,8 @@ * * configfs Copyright (C) 2005 Oracle. All rights reserved. * - * Please see Documentation/filesystems/configfs.txt for more information. + * Please see Documentation/filesystems/configfs/configfs.txt for more + * information. */ #undef DEBUG @@ -291,7 +292,7 @@ int __init configfs_inode_init(void) return bdi_init(&configfs_backing_dev_info); } -void __exit configfs_inode_exit(void) +void configfs_inode_exit(void) { bdi_destroy(&configfs_backing_dev_info); } diff --git a/fs/configfs/item.c b/fs/configfs/item.c index 76dc4c3e5d5..50cee7f9110 100644 --- a/fs/configfs/item.c +++ b/fs/configfs/item.c @@ -23,7 +23,7 @@ * * configfs Copyright (C) 2005 Oracle. All rights reserved. * - * Please see the file Documentation/filesystems/configfs.txt for + * Please see the file Documentation/filesystems/configfs/configfs.txt for * critical information about using the config_item interface. */ diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index ecc62178bed..276e15cafd5 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -143,28 +143,26 @@ static int __init configfs_init(void) goto out; config_kobj = kobject_create_and_add("config", kernel_kobj); - if (!config_kobj) { - kmem_cache_destroy(configfs_dir_cachep); - configfs_dir_cachep = NULL; - goto out; - } + if (!config_kobj) + goto out2; + + err = configfs_inode_init(); + if (err) + goto out3; err = register_filesystem(&configfs_fs_type); - if (err) { - printk(KERN_ERR "configfs: Unable to register filesystem!\n"); - kobject_put(config_kobj); - kmem_cache_destroy(configfs_dir_cachep); - configfs_dir_cachep = NULL; - goto out; - } + if (err) + goto out4; - err = configfs_inode_init(); - if (err) { - unregister_filesystem(&configfs_fs_type); - kobject_put(config_kobj); - kmem_cache_destroy(configfs_dir_cachep); - configfs_dir_cachep = NULL; - } + return 0; +out4: + printk(KERN_ERR "configfs: Unable to register filesystem!\n"); + configfs_inode_exit(); +out3: + kobject_put(config_kobj); +out2: + kmem_cache_destroy(configfs_dir_cachep); + configfs_dir_cachep = NULL; out: return err; } diff --git a/fs/dcache.c b/fs/dcache.c index a88948b8bd1..89509b5a090 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -36,6 +36,7 @@ #include <linux/bit_spinlock.h> #include <linux/rculist_bl.h> #include <linux/prefetch.h> +#include <linux/ratelimit.h> #include "internal.h" /* @@ -225,7 +226,7 @@ static void dentry_unlink_inode(struct dentry * dentry) } /* - * dentry_lru_(add|del|move_tail) must be called with d_lock held. + * dentry_lru_(add|del|prune|move_tail) must be called with d_lock held. */ static void dentry_lru_add(struct dentry *dentry) { @@ -245,6 +246,9 @@ static void __dentry_lru_del(struct dentry *dentry) dentry_stat.nr_unused--; } +/* + * Remove a dentry with references from the LRU. + */ static void dentry_lru_del(struct dentry *dentry) { if (!list_empty(&dentry->d_lru)) { @@ -254,6 +258,23 @@ static void dentry_lru_del(struct dentry *dentry) } } +/* + * Remove a dentry that is unreferenced and about to be pruned + * (unhashed and destroyed) from the LRU, and inform the file system. + * This wrapper should be called _prior_ to unhashing a victim dentry. + */ +static void dentry_lru_prune(struct dentry *dentry) +{ + if (!list_empty(&dentry->d_lru)) { + if (dentry->d_flags & DCACHE_OP_PRUNE) + dentry->d_op->d_prune(dentry); + + spin_lock(&dcache_lru_lock); + __dentry_lru_del(dentry); + spin_unlock(&dcache_lru_lock); + } +} + static void dentry_lru_move_tail(struct dentry *dentry) { spin_lock(&dcache_lru_lock); @@ -403,8 +424,12 @@ relock: if (ref) dentry->d_count--; - /* if dentry was on the d_lru list delete it from there */ - dentry_lru_del(dentry); + /* + * if dentry was on the d_lru list delete it from there. + * inform the fs via d_prune that this dentry is about to be + * unhashed and destroyed. + */ + dentry_lru_prune(dentry); /* if it was on the hash then remove it */ __d_drop(dentry); return d_kill(dentry, parent); @@ -522,9 +547,11 @@ int d_invalidate(struct dentry * dentry) * would make it unreachable from the root, * we might still populate it if it was a * working directory or similar). + * We also need to leave mountpoints alone, + * directory or not. */ - if (dentry->d_count > 1) { - if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) { + if (dentry->d_count > 1 && dentry->d_inode) { + if (S_ISDIR(dentry->d_inode->i_mode) || d_mountpoint(dentry)) { spin_unlock(&dentry->d_lock); return -EBUSY; } @@ -854,8 +881,12 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) do { struct inode *inode; - /* detach from the system */ - dentry_lru_del(dentry); + /* + * remove the dentry from the lru, and inform + * the fs that this dentry is about to be + * unhashed and destroyed. + */ + dentry_lru_prune(dentry); __d_shrink(dentry); if (dentry->d_count != 0) { @@ -1283,6 +1314,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) dentry->d_flags |= DCACHE_OP_REVALIDATE; if (op->d_delete) dentry->d_flags |= DCACHE_OP_DELETE; + if (op->d_prune) + dentry->d_flags |= DCACHE_OP_PRUNE; } EXPORT_SYMBOL(d_set_d_op); @@ -2351,8 +2384,16 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) actual = __d_unalias(inode, dentry, alias); } write_sequnlock(&rename_lock); - if (IS_ERR(actual)) + if (IS_ERR(actual)) { + if (PTR_ERR(actual) == -ELOOP) + pr_warn_ratelimited( + "VFS: Lookup of '%s' in %s %s" + " would have caused loop\n", + dentry->d_name.name, + inode->i_sb->s_type->name, + inode->i_sb->s_id); dput(alias); + } goto out_nolock; } } @@ -2398,16 +2439,14 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name) /** * prepend_path - Prepend path string to a buffer * @path: the dentry/vfsmount to report - * @root: root vfsmnt/dentry (may be modified by this function) + * @root: root vfsmnt/dentry * @buffer: pointer to the end of the buffer * @buflen: pointer to buffer length * * Caller holds the rename_lock. - * - * If path is not reachable from the supplied root, then the value of - * root is changed (without modifying refcounts). */ -static int prepend_path(const struct path *path, struct path *root, +static int prepend_path(const struct path *path, + const struct path *root, char **buffer, int *buflen) { struct dentry *dentry = path->dentry; @@ -2442,10 +2481,10 @@ static int prepend_path(const struct path *path, struct path *root, dentry = parent; } -out: if (!error && !slash) error = prepend(buffer, buflen, "/", 1); +out: br_read_unlock(vfsmount_lock); return error; @@ -2459,15 +2498,17 @@ global_root: WARN(1, "Root dentry has weird name <%.*s>\n", (int) dentry->d_name.len, dentry->d_name.name); } - root->mnt = vfsmnt; - root->dentry = dentry; + if (!slash) + error = prepend(buffer, buflen, "/", 1); + if (!error) + error = vfsmnt->mnt_ns ? 1 : 2; goto out; } /** * __d_path - return the path of a dentry * @path: the dentry/vfsmount to report - * @root: root vfsmnt/dentry (may be modified by this function) + * @root: root vfsmnt/dentry * @buf: buffer to return value in * @buflen: buffer length * @@ -2478,10 +2519,10 @@ global_root: * * "buflen" should be positive. * - * If path is not reachable from the supplied root, then the value of - * root is changed (without modifying refcounts). + * If the path is not reachable from the supplied root, return %NULL. */ -char *__d_path(const struct path *path, struct path *root, +char *__d_path(const struct path *path, + const struct path *root, char *buf, int buflen) { char *res = buf + buflen; @@ -2492,7 +2533,28 @@ char *__d_path(const struct path *path, struct path *root, error = prepend_path(path, root, &res, &buflen); write_sequnlock(&rename_lock); - if (error) + if (error < 0) + return ERR_PTR(error); + if (error > 0) + return NULL; + return res; +} + +char *d_absolute_path(const struct path *path, + char *buf, int buflen) +{ + struct path root = {}; + char *res = buf + buflen; + int error; + + prepend(&res, &buflen, "\0", 1); + write_seqlock(&rename_lock); + error = prepend_path(path, &root, &res, &buflen); + write_sequnlock(&rename_lock); + + if (error > 1) + error = -EINVAL; + if (error < 0) return ERR_PTR(error); return res; } @@ -2500,8 +2562,9 @@ char *__d_path(const struct path *path, struct path *root, /* * same as __d_path but appends "(deleted)" for unlinked files. */ -static int path_with_deleted(const struct path *path, struct path *root, - char **buf, int *buflen) +static int path_with_deleted(const struct path *path, + const struct path *root, + char **buf, int *buflen) { prepend(buf, buflen, "\0", 1); if (d_unlinked(path->dentry)) { @@ -2538,7 +2601,6 @@ char *d_path(const struct path *path, char *buf, int buflen) { char *res = buf + buflen; struct path root; - struct path tmp; int error; /* @@ -2553,9 +2615,8 @@ char *d_path(const struct path *path, char *buf, int buflen) get_fs_root(current->fs, &root); write_seqlock(&rename_lock); - tmp = root; - error = path_with_deleted(path, &tmp, &res, &buflen); - if (error) + error = path_with_deleted(path, &root, &res, &buflen); + if (error < 0) res = ERR_PTR(error); write_sequnlock(&rename_lock); path_put(&root); @@ -2576,7 +2637,6 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen) { char *res = buf + buflen; struct path root; - struct path tmp; int error; if (path->dentry->d_op && path->dentry->d_op->d_dname) @@ -2584,9 +2644,8 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen) get_fs_root(current->fs, &root); write_seqlock(&rename_lock); - tmp = root; - error = path_with_deleted(path, &tmp, &res, &buflen); - if (!error && !path_equal(&tmp, &root)) + error = path_with_deleted(path, &root, &res, &buflen); + if (error > 0) error = prepend_unreachable(&res, &buflen); write_sequnlock(&rename_lock); path_put(&root); @@ -2717,19 +2776,18 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) write_seqlock(&rename_lock); if (!d_unlinked(pwd.dentry)) { unsigned long len; - struct path tmp = root; char *cwd = page + PAGE_SIZE; int buflen = PAGE_SIZE; prepend(&cwd, &buflen, "\0", 1); - error = prepend_path(&pwd, &tmp, &cwd, &buflen); + error = prepend_path(&pwd, &root, &cwd, &buflen); write_sequnlock(&rename_lock); - if (error) + if (error < 0) goto out; /* Unreachable from current root */ - if (!path_equal(&tmp, &root)) { + if (error > 0) { error = prepend_unreachable(&cwd, &buflen); if (error) goto out; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index e7a7a2f0732..f3a257d7a98 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -1,5 +1,5 @@ /* - * file.c - part of debugfs, a tiny little debug file system + * inode.c - part of debugfs, a tiny little debug file system * * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2004 IBM Inc. diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 2f27e578d46..d5d5297efe9 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -307,7 +307,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent) inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; - inode->i_nlink = 2; + set_nlink(inode, 2); s->s_root = d_alloc_root(inode); if (s->s_root) @@ -549,7 +549,7 @@ void devpts_pty_kill(struct tty_struct *tty) dentry = d_find_alias(inode); - inode->i_nlink--; + drop_nlink(inode); d_delete(dentry); dput(dentry); /* d_alloc_name() in devpts_pty_new() */ dput(dentry); /* d_find_alias above */ diff --git a/fs/direct-io.c b/fs/direct-io.c index 44a360ca804..d740ab67ff6 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -39,7 +39,7 @@ /* * How many user pages to map in one call to get_user_pages(). This determines - * the size of a structure on the stack. + * the size of a structure in the slab cache */ #define DIO_PAGES 64 @@ -55,13 +55,10 @@ * blocksize. */ -struct dio { - /* BIO submission state */ +/* dio_state only used in the submission path */ + +struct dio_submit { struct bio *bio; /* bio under assembly */ - struct inode *inode; - int rw; - loff_t i_size; /* i_size when submitted */ - int flags; /* doesn't change */ unsigned blkbits; /* doesn't change */ unsigned blkfactor; /* When we're using an alignment which is finer than the filesystem's soft @@ -76,18 +73,17 @@ struct dio { sector_t block_in_file; /* Current offset into the underlying file in dio_block units. */ unsigned blocks_available; /* At block_in_file. changes */ + int reap_counter; /* rate limit reaping */ sector_t final_block_in_request;/* doesn't change */ unsigned first_block_in_page; /* doesn't change, Used only once */ int boundary; /* prev block is at a boundary */ - int reap_counter; /* rate limit reaping */ get_block_t *get_block; /* block mapping function */ - dio_iodone_t *end_io; /* IO completion function */ dio_submit_t *submit_io; /* IO submition function */ + loff_t logical_offset_in_bio; /* current first logical block in bio */ sector_t final_block_in_bio; /* current final block in bio + 1 */ sector_t next_block_for_io; /* next block to be put under IO, in dio_blocks units */ - struct buffer_head map_bh; /* last get_block() result */ /* * Deferred addition of a page to the dio. These variables are @@ -100,18 +96,6 @@ struct dio { sector_t cur_page_block; /* Where it starts */ loff_t cur_page_fs_offset; /* Offset in file */ - /* BIO completion state */ - spinlock_t bio_lock; /* protects BIO fields below */ - unsigned long refcount; /* direct_io_worker() and bios */ - struct bio *bio_list; /* singly linked via bi_private */ - struct task_struct *waiter; /* waiting task (NULL if none) */ - - /* AIO related stuff */ - struct kiocb *iocb; /* kiocb */ - int is_async; /* is IO async ? */ - int io_error; /* IO error in completion path */ - ssize_t result; /* IO result */ - /* * Page fetching state. These variables belong to dio_refill_pages(). */ @@ -125,7 +109,30 @@ struct dio { */ unsigned head; /* next page to process */ unsigned tail; /* last valid page + 1 */ +}; + +/* dio_state communicated between submission path and end_io */ +struct dio { + int flags; /* doesn't change */ + int rw; + struct inode *inode; + loff_t i_size; /* i_size when submitted */ + dio_iodone_t *end_io; /* IO completion function */ + + void *private; /* copy from map_bh.b_private */ + + /* BIO completion state */ + spinlock_t bio_lock; /* protects BIO fields below */ int page_errors; /* errno from get_user_pages() */ + int is_async; /* is IO async ? */ + int io_error; /* IO error in completion path */ + unsigned long refcount; /* direct_io_worker() and bios */ + struct bio *bio_list; /* singly linked via bi_private */ + struct task_struct *waiter; /* waiting task (NULL if none) */ + + /* AIO related stuff */ + struct kiocb *iocb; /* kiocb */ + ssize_t result; /* IO result */ /* * pages[] (and any fields placed after it) are not zeroed out at @@ -133,7 +140,9 @@ struct dio { * wish that they not be zeroed. */ struct page *pages[DIO_PAGES]; /* page buffer */ -}; +} ____cacheline_aligned_in_smp; + +static struct kmem_cache *dio_cache __read_mostly; static void __inode_dio_wait(struct inode *inode) { @@ -182,27 +191,27 @@ EXPORT_SYMBOL_GPL(inode_dio_done); /* * How many pages are in the queue? */ -static inline unsigned dio_pages_present(struct dio *dio) +static inline unsigned dio_pages_present(struct dio_submit *sdio) { - return dio->tail - dio->head; + return sdio->tail - sdio->head; } /* * Go grab and pin some userspace pages. Typically we'll get 64 at a time. */ -static int dio_refill_pages(struct dio *dio) +static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) { int ret; int nr_pages; - nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES); + nr_pages = min(sdio->total_pages - sdio->curr_page, DIO_PAGES); ret = get_user_pages_fast( - dio->curr_user_address, /* Where from? */ + sdio->curr_user_address, /* Where from? */ nr_pages, /* How many pages? */ dio->rw == READ, /* Write to memory? */ &dio->pages[0]); /* Put results here */ - if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) { + if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) { struct page *page = ZERO_PAGE(0); /* * A memory fault, but the filesystem has some outstanding @@ -213,17 +222,17 @@ static int dio_refill_pages(struct dio *dio) dio->page_errors = ret; page_cache_get(page); dio->pages[0] = page; - dio->head = 0; - dio->tail = 1; + sdio->head = 0; + sdio->tail = 1; ret = 0; goto out; } if (ret >= 0) { - dio->curr_user_address += ret * PAGE_SIZE; - dio->curr_page += ret; - dio->head = 0; - dio->tail = ret; + sdio->curr_user_address += ret * PAGE_SIZE; + sdio->curr_page += ret; + sdio->head = 0; + sdio->tail = ret; ret = 0; } out: @@ -236,17 +245,18 @@ out: * decent number of pages, less frequently. To provide nicer use of the * L1 cache. */ -static struct page *dio_get_page(struct dio *dio) +static inline struct page *dio_get_page(struct dio *dio, + struct dio_submit *sdio) { - if (dio_pages_present(dio) == 0) { + if (dio_pages_present(sdio) == 0) { int ret; - ret = dio_refill_pages(dio); + ret = dio_refill_pages(dio, sdio); if (ret) return ERR_PTR(ret); - BUG_ON(dio_pages_present(dio) == 0); + BUG_ON(dio_pages_present(sdio) == 0); } - return dio->pages[dio->head++]; + return dio->pages[sdio->head++]; } /** @@ -292,7 +302,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is if (dio->end_io && dio->result) { dio->end_io(dio->iocb, offset, transferred, - dio->map_bh.b_private, ret, is_async); + dio->private, ret, is_async); } else { if (is_async) aio_complete(dio->iocb, ret, 0); @@ -323,7 +333,7 @@ static void dio_bio_end_aio(struct bio *bio, int error) if (remaining == 0) { dio_complete(dio, dio->iocb->ki_pos, 0, true); - kfree(dio); + kmem_cache_free(dio_cache, dio); } } @@ -367,9 +377,10 @@ void dio_end_io(struct bio *bio, int error) } EXPORT_SYMBOL_GPL(dio_end_io); -static void -dio_bio_alloc(struct dio *dio, struct block_device *bdev, - sector_t first_sector, int nr_vecs) +static inline void +dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, + struct block_device *bdev, + sector_t first_sector, int nr_vecs) { struct bio *bio; @@ -386,8 +397,8 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev, else bio->bi_end_io = dio_bio_end_io; - dio->bio = bio; - dio->logical_offset_in_bio = dio->cur_page_fs_offset; + sdio->bio = bio; + sdio->logical_offset_in_bio = sdio->cur_page_fs_offset; } /* @@ -397,9 +408,9 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev, * * bios hold a dio reference between submit_bio and ->end_io. */ -static void dio_bio_submit(struct dio *dio) +static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) { - struct bio *bio = dio->bio; + struct bio *bio = sdio->bio; unsigned long flags; bio->bi_private = dio; @@ -411,24 +422,24 @@ static void dio_bio_submit(struct dio *dio) if (dio->is_async && dio->rw == READ) bio_set_pages_dirty(bio); - if (dio->submit_io) - dio->submit_io(dio->rw, bio, dio->inode, - dio->logical_offset_in_bio); + if (sdio->submit_io) + sdio->submit_io(dio->rw, bio, dio->inode, + sdio->logical_offset_in_bio); else submit_bio(dio->rw, bio); - dio->bio = NULL; - dio->boundary = 0; - dio->logical_offset_in_bio = 0; + sdio->bio = NULL; + sdio->boundary = 0; + sdio->logical_offset_in_bio = 0; } /* * Release any resources in case of a failure */ -static void dio_cleanup(struct dio *dio) +static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio) { - while (dio_pages_present(dio)) - page_cache_release(dio_get_page(dio)); + while (dio_pages_present(sdio)) + page_cache_release(dio_get_page(dio, sdio)); } /* @@ -518,11 +529,11 @@ static void dio_await_completion(struct dio *dio) * * This also helps to limit the peak amount of pinned userspace memory. */ -static int dio_bio_reap(struct dio *dio) +static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) { int ret = 0; - if (dio->reap_counter++ >= 64) { + if (sdio->reap_counter++ >= 64) { while (dio->bio_list) { unsigned long flags; struct bio *bio; @@ -536,14 +547,14 @@ static int dio_bio_reap(struct dio *dio) if (ret == 0) ret = ret2; } - dio->reap_counter = 0; + sdio->reap_counter = 0; } return ret; } /* * Call into the fs to map some more disk blocks. We record the current number - * of available blocks at dio->blocks_available. These are in units of the + * of available blocks at sdio->blocks_available. These are in units of the * fs blocksize, (1 << inode->i_blkbits). * * The fs is allowed to map lots of blocks at once. If it wants to do that, @@ -564,10 +575,10 @@ static int dio_bio_reap(struct dio *dio) * buffer_mapped(). However the direct-io code will only process holes one * block at a time - it will repeatedly call get_block() as it walks the hole. */ -static int get_more_blocks(struct dio *dio) +static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, + struct buffer_head *map_bh) { int ret; - struct buffer_head *map_bh = &dio->map_bh; sector_t fs_startblk; /* Into file, in filesystem-sized blocks */ unsigned long fs_count; /* Number of filesystem-sized blocks */ unsigned long dio_count;/* Number of dio_block-sized blocks */ @@ -580,11 +591,11 @@ static int get_more_blocks(struct dio *dio) */ ret = dio->page_errors; if (ret == 0) { - BUG_ON(dio->block_in_file >= dio->final_block_in_request); - fs_startblk = dio->block_in_file >> dio->blkfactor; - dio_count = dio->final_block_in_request - dio->block_in_file; - fs_count = dio_count >> dio->blkfactor; - blkmask = (1 << dio->blkfactor) - 1; + BUG_ON(sdio->block_in_file >= sdio->final_block_in_request); + fs_startblk = sdio->block_in_file >> sdio->blkfactor; + dio_count = sdio->final_block_in_request - sdio->block_in_file; + fs_count = dio_count >> sdio->blkfactor; + blkmask = (1 << sdio->blkfactor) - 1; if (dio_count & blkmask) fs_count++; @@ -604,13 +615,16 @@ static int get_more_blocks(struct dio *dio) */ create = dio->rw & WRITE; if (dio->flags & DIO_SKIP_HOLES) { - if (dio->block_in_file < (i_size_read(dio->inode) >> - dio->blkbits)) + if (sdio->block_in_file < (i_size_read(dio->inode) >> + sdio->blkbits)) create = 0; } - ret = (*dio->get_block)(dio->inode, fs_startblk, + ret = (*sdio->get_block)(dio->inode, fs_startblk, map_bh, create); + + /* Store for completion */ + dio->private = map_bh->b_private; } return ret; } @@ -618,20 +632,21 @@ static int get_more_blocks(struct dio *dio) /* * There is no bio. Make one now. */ -static int dio_new_bio(struct dio *dio, sector_t start_sector) +static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio, + sector_t start_sector, struct buffer_head *map_bh) { sector_t sector; int ret, nr_pages; - ret = dio_bio_reap(dio); + ret = dio_bio_reap(dio, sdio); if (ret) goto out; - sector = start_sector << (dio->blkbits - 9); - nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev)); + sector = start_sector << (sdio->blkbits - 9); + nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev)); nr_pages = min(nr_pages, BIO_MAX_PAGES); BUG_ON(nr_pages <= 0); - dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages); - dio->boundary = 0; + dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages); + sdio->boundary = 0; out: return ret; } @@ -643,21 +658,21 @@ out: * * Return zero on success. Non-zero means the caller needs to start a new BIO. */ -static int dio_bio_add_page(struct dio *dio) +static inline int dio_bio_add_page(struct dio_submit *sdio) { int ret; - ret = bio_add_page(dio->bio, dio->cur_page, - dio->cur_page_len, dio->cur_page_offset); - if (ret == dio->cur_page_len) { + ret = bio_add_page(sdio->bio, sdio->cur_page, + sdio->cur_page_len, sdio->cur_page_offset); + if (ret == sdio->cur_page_len) { /* * Decrement count only, if we are done with this page */ - if ((dio->cur_page_len + dio->cur_page_offset) == PAGE_SIZE) - dio->pages_in_io--; - page_cache_get(dio->cur_page); - dio->final_block_in_bio = dio->cur_page_block + - (dio->cur_page_len >> dio->blkbits); + if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE) + sdio->pages_in_io--; + page_cache_get(sdio->cur_page); + sdio->final_block_in_bio = sdio->cur_page_block + + (sdio->cur_page_len >> sdio->blkbits); ret = 0; } else { ret = 1; @@ -675,14 +690,15 @@ static int dio_bio_add_page(struct dio *dio) * The caller of this function is responsible for removing cur_page from the * dio, and for dropping the refcount which came from that presence. */ -static int dio_send_cur_page(struct dio *dio) +static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio, + struct buffer_head *map_bh) { int ret = 0; - if (dio->bio) { - loff_t cur_offset = dio->cur_page_fs_offset; - loff_t bio_next_offset = dio->logical_offset_in_bio + - dio->bio->bi_size; + if (sdio->bio) { + loff_t cur_offset = sdio->cur_page_fs_offset; + loff_t bio_next_offset = sdio->logical_offset_in_bio + + sdio->bio->bi_size; /* * See whether this new request is contiguous with the old. @@ -698,28 +714,28 @@ static int dio_send_cur_page(struct dio *dio) * be the next logical offset in the bio, submit the bio we * have. */ - if (dio->final_block_in_bio != dio->cur_page_block || + if (sdio->final_block_in_bio != sdio->cur_page_block || cur_offset != bio_next_offset) - dio_bio_submit(dio); + dio_bio_submit(dio, sdio); /* * Submit now if the underlying fs is about to perform a * metadata read */ - else if (dio->boundary) - dio_bio_submit(dio); + else if (sdio->boundary) + dio_bio_submit(dio, sdio); } - if (dio->bio == NULL) { - ret = dio_new_bio(dio, dio->cur_page_block); + if (sdio->bio == NULL) { + ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh); if (ret) goto out; } - if (dio_bio_add_page(dio) != 0) { - dio_bio_submit(dio); - ret = dio_new_bio(dio, dio->cur_page_block); + if (dio_bio_add_page(sdio) != 0) { + dio_bio_submit(dio, sdio); + ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh); if (ret == 0) { - ret = dio_bio_add_page(dio); + ret = dio_bio_add_page(sdio); BUG_ON(ret != 0); } } @@ -744,9 +760,10 @@ out: * If that doesn't work out then we put the old page into the bio and add this * page to the dio instead. */ -static int -submit_page_section(struct dio *dio, struct page *page, - unsigned offset, unsigned len, sector_t blocknr) +static inline int +submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, + unsigned offset, unsigned len, sector_t blocknr, + struct buffer_head *map_bh) { int ret = 0; @@ -760,20 +777,20 @@ submit_page_section(struct dio *dio, struct page *page, /* * Can we just grow the current page's presence in the dio? */ - if ( (dio->cur_page == page) && - (dio->cur_page_offset + dio->cur_page_len == offset) && - (dio->cur_page_block + - (dio->cur_page_len >> dio->blkbits) == blocknr)) { - dio->cur_page_len += len; + if (sdio->cur_page == page && + sdio->cur_page_offset + sdio->cur_page_len == offset && + sdio->cur_page_block + + (sdio->cur_page_len >> sdio->blkbits) == blocknr) { + sdio->cur_page_len += len; /* - * If dio->boundary then we want to schedule the IO now to + * If sdio->boundary then we want to schedule the IO now to * avoid metadata seeks. */ - if (dio->boundary) { - ret = dio_send_cur_page(dio); - page_cache_release(dio->cur_page); - dio->cur_page = NULL; + if (sdio->boundary) { + ret = dio_send_cur_page(dio, sdio, map_bh); + page_cache_release(sdio->cur_page); + sdio->cur_page = NULL; } goto out; } @@ -781,20 +798,20 @@ submit_page_section(struct dio *dio, struct page *page, /* * If there's a deferred page already there then send it. */ - if (dio->cur_page) { - ret = dio_send_cur_page(dio); - page_cache_release(dio->cur_page); - dio->cur_page = NULL; + if (sdio->cur_page) { + ret = dio_send_cur_page(dio, sdio, map_bh); + page_cache_release(sdio->cur_page); + sdio->cur_page = NULL; if (ret) goto out; } page_cache_get(page); /* It is in dio */ - dio->cur_page = page; - dio->cur_page_offset = offset; - dio->cur_page_len = len; - dio->cur_page_block = blocknr; - dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits; + sdio->cur_page = page; + sdio->cur_page_offset = offset; + sdio->cur_page_len = len; + sdio->cur_page_block = blocknr; + sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits; out: return ret; } @@ -804,16 +821,16 @@ out: * file blocks. Only called for S_ISREG files - blockdevs do not set * buffer_new */ -static void clean_blockdev_aliases(struct dio *dio) +static void clean_blockdev_aliases(struct dio *dio, struct buffer_head *map_bh) { unsigned i; unsigned nblocks; - nblocks = dio->map_bh.b_size >> dio->inode->i_blkbits; + nblocks = map_bh->b_size >> dio->inode->i_blkbits; for (i = 0; i < nblocks; i++) { - unmap_underlying_metadata(dio->map_bh.b_bdev, - dio->map_bh.b_blocknr + i); + unmap_underlying_metadata(map_bh->b_bdev, + map_bh->b_blocknr + i); } } @@ -826,19 +843,20 @@ static void clean_blockdev_aliases(struct dio *dio) * `end' is zero if we're doing the start of the IO, 1 at the end of the * IO. */ -static void dio_zero_block(struct dio *dio, int end) +static inline void dio_zero_block(struct dio *dio, struct dio_submit *sdio, + int end, struct buffer_head *map_bh) { unsigned dio_blocks_per_fs_block; unsigned this_chunk_blocks; /* In dio_blocks */ unsigned this_chunk_bytes; struct page *page; - dio->start_zero_done = 1; - if (!dio->blkfactor || !buffer_new(&dio->map_bh)) + sdio->start_zero_done = 1; + if (!sdio->blkfactor || !buffer_new(map_bh)) return; - dio_blocks_per_fs_block = 1 << dio->blkfactor; - this_chunk_blocks = dio->block_in_file & (dio_blocks_per_fs_block - 1); + dio_blocks_per_fs_block = 1 << sdio->blkfactor; + this_chunk_blocks = sdio->block_in_file & (dio_blocks_per_fs_block - 1); if (!this_chunk_blocks) return; @@ -850,14 +868,14 @@ static void dio_zero_block(struct dio *dio, int end) if (end) this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks; - this_chunk_bytes = this_chunk_blocks << dio->blkbits; + this_chunk_bytes = this_chunk_blocks << sdio->blkbits; page = ZERO_PAGE(0); - if (submit_page_section(dio, page, 0, this_chunk_bytes, - dio->next_block_for_io)) + if (submit_page_section(dio, sdio, page, 0, this_chunk_bytes, + sdio->next_block_for_io, map_bh)) return; - dio->next_block_for_io += this_chunk_blocks; + sdio->next_block_for_io += this_chunk_blocks; } /* @@ -876,20 +894,20 @@ static void dio_zero_block(struct dio *dio, int end) * it should set b_size to PAGE_SIZE or more inside get_block(). This gives * fine alignment but still allows this function to work in PAGE_SIZE units. */ -static int do_direct_IO(struct dio *dio) +static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, + struct buffer_head *map_bh) { - const unsigned blkbits = dio->blkbits; + const unsigned blkbits = sdio->blkbits; const unsigned blocks_per_page = PAGE_SIZE >> blkbits; struct page *page; unsigned block_in_page; - struct buffer_head *map_bh = &dio->map_bh; int ret = 0; /* The I/O can start at any block offset within the first page */ - block_in_page = dio->first_block_in_page; + block_in_page = sdio->first_block_in_page; - while (dio->block_in_file < dio->final_block_in_request) { - page = dio_get_page(dio); + while (sdio->block_in_file < sdio->final_block_in_request) { + page = dio_get_page(dio, sdio); if (IS_ERR(page)) { ret = PTR_ERR(page); goto out; @@ -901,14 +919,14 @@ static int do_direct_IO(struct dio *dio) unsigned this_chunk_blocks; /* # of blocks */ unsigned u; - if (dio->blocks_available == 0) { + if (sdio->blocks_available == 0) { /* * Need to go and map some more disk */ unsigned long blkmask; unsigned long dio_remainder; - ret = get_more_blocks(dio); + ret = get_more_blocks(dio, sdio, map_bh); if (ret) { page_cache_release(page); goto out; @@ -916,18 +934,18 @@ static int do_direct_IO(struct dio *dio) if (!buffer_mapped(map_bh)) goto do_holes; - dio->blocks_available = - map_bh->b_size >> dio->blkbits; - dio->next_block_for_io = - map_bh->b_blocknr << dio->blkfactor; + sdio->blocks_available = + map_bh->b_size >> sdio->blkbits; + sdio->next_block_for_io = + map_bh->b_blocknr << sdio->blkfactor; if (buffer_new(map_bh)) - clean_blockdev_aliases(dio); + clean_blockdev_aliases(dio, map_bh); - if (!dio->blkfactor) + if (!sdio->blkfactor) goto do_holes; - blkmask = (1 << dio->blkfactor) - 1; - dio_remainder = (dio->block_in_file & blkmask); + blkmask = (1 << sdio->blkfactor) - 1; + dio_remainder = (sdio->block_in_file & blkmask); /* * If we are at the start of IO and that IO @@ -941,8 +959,8 @@ static int do_direct_IO(struct dio *dio) * on-disk */ if (!buffer_new(map_bh)) - dio->next_block_for_io += dio_remainder; - dio->blocks_available -= dio_remainder; + sdio->next_block_for_io += dio_remainder; + sdio->blocks_available -= dio_remainder; } do_holes: /* Handle holes */ @@ -961,7 +979,7 @@ do_holes: */ i_size_aligned = ALIGN(i_size_read(dio->inode), 1 << blkbits); - if (dio->block_in_file >= + if (sdio->block_in_file >= i_size_aligned >> blkbits) { /* We hit eof */ page_cache_release(page); @@ -969,7 +987,7 @@ do_holes: } zero_user(page, block_in_page << blkbits, 1 << blkbits); - dio->block_in_file++; + sdio->block_in_file++; block_in_page++; goto next_block; } @@ -979,38 +997,41 @@ do_holes: * is finer than the underlying fs, go check to see if * we must zero out the start of this block. */ - if (unlikely(dio->blkfactor && !dio->start_zero_done)) - dio_zero_block(dio, 0); + if (unlikely(sdio->blkfactor && !sdio->start_zero_done)) + dio_zero_block(dio, sdio, 0, map_bh); /* * Work out, in this_chunk_blocks, how much disk we * can add to this page */ - this_chunk_blocks = dio->blocks_available; + this_chunk_blocks = sdio->blocks_available; u = (PAGE_SIZE - offset_in_page) >> blkbits; if (this_chunk_blocks > u) this_chunk_blocks = u; - u = dio->final_block_in_request - dio->block_in_file; + u = sdio->final_block_in_request - sdio->block_in_file; if (this_chunk_blocks > u) this_chunk_blocks = u; this_chunk_bytes = this_chunk_blocks << blkbits; BUG_ON(this_chunk_bytes == 0); - dio->boundary = buffer_boundary(map_bh); - ret = submit_page_section(dio, page, offset_in_page, - this_chunk_bytes, dio->next_block_for_io); + sdio->boundary = buffer_boundary(map_bh); + ret = submit_page_section(dio, sdio, page, + offset_in_page, + this_chunk_bytes, + sdio->next_block_for_io, + map_bh); if (ret) { page_cache_release(page); goto out; } - dio->next_block_for_io += this_chunk_blocks; + sdio->next_block_for_io += this_chunk_blocks; - dio->block_in_file += this_chunk_blocks; + sdio->block_in_file += this_chunk_blocks; block_in_page += this_chunk_blocks; - dio->blocks_available -= this_chunk_blocks; + sdio->blocks_available -= this_chunk_blocks; next_block: - BUG_ON(dio->block_in_file > dio->final_block_in_request); - if (dio->block_in_file == dio->final_block_in_request) + BUG_ON(sdio->block_in_file > sdio->final_block_in_request); + if (sdio->block_in_file == sdio->final_block_in_request) break; } @@ -1022,135 +1043,10 @@ out: return ret; } -static ssize_t -direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, - const struct iovec *iov, loff_t offset, unsigned long nr_segs, - unsigned blkbits, get_block_t get_block, dio_iodone_t end_io, - dio_submit_t submit_io, struct dio *dio) +static inline int drop_refcount(struct dio *dio) { - unsigned long user_addr; + int ret2; unsigned long flags; - int seg; - ssize_t ret = 0; - ssize_t ret2; - size_t bytes; - - dio->inode = inode; - dio->rw = rw; - dio->blkbits = blkbits; - dio->blkfactor = inode->i_blkbits - blkbits; - dio->block_in_file = offset >> blkbits; - - dio->get_block = get_block; - dio->end_io = end_io; - dio->submit_io = submit_io; - dio->final_block_in_bio = -1; - dio->next_block_for_io = -1; - - dio->iocb = iocb; - dio->i_size = i_size_read(inode); - - spin_lock_init(&dio->bio_lock); - dio->refcount = 1; - - /* - * In case of non-aligned buffers, we may need 2 more - * pages since we need to zero out first and last block. - */ - if (unlikely(dio->blkfactor)) - dio->pages_in_io = 2; - - for (seg = 0; seg < nr_segs; seg++) { - user_addr = (unsigned long)iov[seg].iov_base; - dio->pages_in_io += - ((user_addr+iov[seg].iov_len +PAGE_SIZE-1)/PAGE_SIZE - - user_addr/PAGE_SIZE); - } - - for (seg = 0; seg < nr_segs; seg++) { - user_addr = (unsigned long)iov[seg].iov_base; - dio->size += bytes = iov[seg].iov_len; - - /* Index into the first page of the first block */ - dio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits; - dio->final_block_in_request = dio->block_in_file + - (bytes >> blkbits); - /* Page fetching state */ - dio->head = 0; - dio->tail = 0; - dio->curr_page = 0; - - dio->total_pages = 0; - if (user_addr & (PAGE_SIZE-1)) { - dio->total_pages++; - bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1)); - } - dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE; - dio->curr_user_address = user_addr; - - ret = do_direct_IO(dio); - - dio->result += iov[seg].iov_len - - ((dio->final_block_in_request - dio->block_in_file) << - blkbits); - - if (ret) { - dio_cleanup(dio); - break; - } - } /* end iovec loop */ - - if (ret == -ENOTBLK) { - /* - * The remaining part of the request will be - * be handled by buffered I/O when we return - */ - ret = 0; - } - /* - * There may be some unwritten disk at the end of a part-written - * fs-block-sized block. Go zero that now. - */ - dio_zero_block(dio, 1); - - if (dio->cur_page) { - ret2 = dio_send_cur_page(dio); - if (ret == 0) - ret = ret2; - page_cache_release(dio->cur_page); - dio->cur_page = NULL; - } - if (dio->bio) - dio_bio_submit(dio); - - /* - * It is possible that, we return short IO due to end of file. - * In that case, we need to release all the pages we got hold on. - */ - dio_cleanup(dio); - - /* - * All block lookups have been performed. For READ requests - * we can let i_mutex go now that its achieved its purpose - * of protecting us from looking up uninitialized blocks. - */ - if (rw == READ && (dio->flags & DIO_LOCKING)) - mutex_unlock(&dio->inode->i_mutex); - - /* - * The only time we want to leave bios in flight is when a successful - * partial aio read or full aio write have been setup. In that case - * bio completion will call aio_complete. The only time it's safe to - * call aio_complete is when we return -EIOCBQUEUED, so we key on that. - * This had *better* be the only place that raises -EIOCBQUEUED. - */ - BUG_ON(ret == -EIOCBQUEUED); - if (dio->is_async && ret == 0 && dio->result && - ((rw & READ) || (dio->result == dio->size))) - ret = -EIOCBQUEUED; - - if (ret != -EIOCBQUEUED) - dio_await_completion(dio); /* * Sync will always be dropping the final ref and completing the @@ -1166,14 +1062,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, spin_lock_irqsave(&dio->bio_lock, flags); ret2 = --dio->refcount; spin_unlock_irqrestore(&dio->bio_lock, flags); - - if (ret2 == 0) { - ret = dio_complete(dio, offset, ret, false); - kfree(dio); - } else - BUG_ON(ret != -EIOCBQUEUED); - - return ret; + return ret2; } /* @@ -1195,6 +1084,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, * expected that filesystem provide exclusion between new direct I/O * and truncates. For DIO_LOCKING filesystems this is done by i_mutex, * but other filesystems need to take care of this on their own. + * + * NOTE: if you pass "sdio" to anything by pointer make sure that function + * is always inlined. Otherwise gcc is unable to split the structure into + * individual fields and will generate much worse code. This is important + * for the whole file. */ ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, @@ -1211,6 +1105,10 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, ssize_t retval = -EINVAL; loff_t end = offset; struct dio *dio; + struct dio_submit sdio = { 0, }; + unsigned long user_addr; + size_t bytes; + struct buffer_head map_bh = { 0, }; if (rw & WRITE) rw = WRITE_ODIRECT; @@ -1244,7 +1142,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, if (rw == READ && end == offset) return 0; - dio = kmalloc(sizeof(*dio), GFP_KERNEL); + dio = kmem_cache_alloc(dio_cache, GFP_KERNEL); retval = -ENOMEM; if (!dio) goto out; @@ -1268,7 +1166,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, end - 1); if (retval) { mutex_unlock(&inode->i_mutex); - kfree(dio); + kmem_cache_free(dio_cache, dio); goto out; } } @@ -1288,11 +1186,141 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) && (end > i_size_read(inode))); - retval = direct_io_worker(rw, iocb, inode, iov, offset, - nr_segs, blkbits, get_block, end_io, - submit_io, dio); + retval = 0; + + dio->inode = inode; + dio->rw = rw; + sdio.blkbits = blkbits; + sdio.blkfactor = inode->i_blkbits - blkbits; + sdio.block_in_file = offset >> blkbits; + + sdio.get_block = get_block; + dio->end_io = end_io; + sdio.submit_io = submit_io; + sdio.final_block_in_bio = -1; + sdio.next_block_for_io = -1; + + dio->iocb = iocb; + dio->i_size = i_size_read(inode); + + spin_lock_init(&dio->bio_lock); + dio->refcount = 1; + + /* + * In case of non-aligned buffers, we may need 2 more + * pages since we need to zero out first and last block. + */ + if (unlikely(sdio.blkfactor)) + sdio.pages_in_io = 2; + + for (seg = 0; seg < nr_segs; seg++) { + user_addr = (unsigned long)iov[seg].iov_base; + sdio.pages_in_io += + ((user_addr + iov[seg].iov_len + PAGE_SIZE-1) / + PAGE_SIZE - user_addr / PAGE_SIZE); + } + + for (seg = 0; seg < nr_segs; seg++) { + user_addr = (unsigned long)iov[seg].iov_base; + sdio.size += bytes = iov[seg].iov_len; + + /* Index into the first page of the first block */ + sdio.first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits; + sdio.final_block_in_request = sdio.block_in_file + + (bytes >> blkbits); + /* Page fetching state */ + sdio.head = 0; + sdio.tail = 0; + sdio.curr_page = 0; + + sdio.total_pages = 0; + if (user_addr & (PAGE_SIZE-1)) { + sdio.total_pages++; + bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1)); + } + sdio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE; + sdio.curr_user_address = user_addr; + + retval = do_direct_IO(dio, &sdio, &map_bh); + + dio->result += iov[seg].iov_len - + ((sdio.final_block_in_request - sdio.block_in_file) << + blkbits); + + if (retval) { + dio_cleanup(dio, &sdio); + break; + } + } /* end iovec loop */ + + if (retval == -ENOTBLK) { + /* + * The remaining part of the request will be + * be handled by buffered I/O when we return + */ + retval = 0; + } + /* + * There may be some unwritten disk at the end of a part-written + * fs-block-sized block. Go zero that now. + */ + dio_zero_block(dio, &sdio, 1, &map_bh); + + if (sdio.cur_page) { + ssize_t ret2; + + ret2 = dio_send_cur_page(dio, &sdio, &map_bh); + if (retval == 0) + retval = ret2; + page_cache_release(sdio.cur_page); + sdio.cur_page = NULL; + } + if (sdio.bio) + dio_bio_submit(dio, &sdio); + + /* + * It is possible that, we return short IO due to end of file. + * In that case, we need to release all the pages we got hold on. + */ + dio_cleanup(dio, &sdio); + + /* + * All block lookups have been performed. For READ requests + * we can let i_mutex go now that its achieved its purpose + * of protecting us from looking up uninitialized blocks. + */ + if (rw == READ && (dio->flags & DIO_LOCKING)) + mutex_unlock(&dio->inode->i_mutex); + + /* + * The only time we want to leave bios in flight is when a successful + * partial aio read or full aio write have been setup. In that case + * bio completion will call aio_complete. The only time it's safe to + * call aio_complete is when we return -EIOCBQUEUED, so we key on that. + * This had *better* be the only place that raises -EIOCBQUEUED. + */ + BUG_ON(retval == -EIOCBQUEUED); + if (dio->is_async && retval == 0 && dio->result && + ((rw & READ) || (dio->result == sdio.size))) + retval = -EIOCBQUEUED; + + if (retval != -EIOCBQUEUED) + dio_await_completion(dio); + + if (drop_refcount(dio) == 0) { + retval = dio_complete(dio, offset, retval, false); + kmem_cache_free(dio_cache, dio); + } else + BUG_ON(retval != -EIOCBQUEUED); out: return retval; } EXPORT_SYMBOL(__blockdev_direct_IO); + +static __init int dio_init(void) +{ + dio_cache = KMEM_CACHE(dio, SLAB_PANIC); + return 0; +} +module_init(dio_init) diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 58609bde3b9..2a834255c75 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -967,7 +967,7 @@ static void ecryptfs_set_default_crypt_stat_vals( /** * ecryptfs_new_file_context - * @ecryptfs_dentry: The eCryptfs dentry + * @ecryptfs_inode: The eCryptfs inode * * If the crypto context for the file has not yet been established, * this is where we do that. Establishing a new crypto context @@ -984,13 +984,13 @@ static void ecryptfs_set_default_crypt_stat_vals( * * Returns zero on success; non-zero otherwise */ -int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry) +int ecryptfs_new_file_context(struct inode *ecryptfs_inode) { struct ecryptfs_crypt_stat *crypt_stat = - &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; + &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = &ecryptfs_superblock_to_private( - ecryptfs_dentry->d_sb)->mount_crypt_stat; + ecryptfs_inode->i_sb)->mount_crypt_stat; int cipher_name_len; int rc = 0; @@ -1299,12 +1299,12 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t max, } static int -ecryptfs_write_metadata_to_contents(struct dentry *ecryptfs_dentry, +ecryptfs_write_metadata_to_contents(struct inode *ecryptfs_inode, char *virt, size_t virt_len) { int rc; - rc = ecryptfs_write_lower(ecryptfs_dentry->d_inode, virt, + rc = ecryptfs_write_lower(ecryptfs_inode, virt, 0, virt_len); if (rc < 0) printk(KERN_ERR "%s: Error attempting to write header " @@ -1338,7 +1338,8 @@ static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask, /** * ecryptfs_write_metadata - * @ecryptfs_dentry: The eCryptfs dentry + * @ecryptfs_dentry: The eCryptfs dentry, which should be negative + * @ecryptfs_inode: The newly created eCryptfs inode * * Write the file headers out. This will likely involve a userspace * callout, in which the session key is encrypted with one or more @@ -1348,10 +1349,11 @@ static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask, * * Returns zero on success; non-zero on error */ -int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) +int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry, + struct inode *ecryptfs_inode) { struct ecryptfs_crypt_stat *crypt_stat = - &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; + &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; unsigned int order; char *virt; size_t virt_len; @@ -1391,7 +1393,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt, size); else - rc = ecryptfs_write_metadata_to_contents(ecryptfs_dentry, virt, + rc = ecryptfs_write_metadata_to_contents(ecryptfs_inode, virt, virt_len); if (rc) { printk(KERN_ERR "%s: Error writing metadata out to lower file; " @@ -1943,7 +1945,7 @@ static unsigned char *portable_filename_chars = ("-.0123456789ABCD" /* We could either offset on every reverse map or just pad some 0x00's * at the front here */ -static const unsigned char filename_rev_map[] = { +static const unsigned char filename_rev_map[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */ @@ -1959,7 +1961,7 @@ static const unsigned char filename_rev_map[] = { 0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */ 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */ 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */ - 0x3D, 0x3E, 0x3F + 0x3D, 0x3E, 0x3F /* 123 - 255 initialized to 0x00 */ }; /** diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index b36c5572b3f..a9f29b12fbf 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -514,7 +514,7 @@ ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt) #define ecryptfs_printk(type, fmt, arg...) \ __ecryptfs_printk(type "%s: " fmt, __func__, ## arg); -__attribute__ ((format(printf, 1, 2))) +__printf(1, 2) void __ecryptfs_printk(const char *fmt, ...); extern const struct file_operations ecryptfs_main_fops; @@ -584,9 +584,10 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat); int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode); int ecryptfs_encrypt_page(struct page *page); int ecryptfs_decrypt_page(struct page *page); -int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry); +int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry, + struct inode *ecryptfs_inode); int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry); -int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry); +int ecryptfs_new_file_context(struct inode *ecryptfs_inode); void ecryptfs_write_crypt_stat_flags(char *page_virt, struct ecryptfs_crypt_stat *crypt_stat, size_t *written); diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index c6ac98cf9ba..d3f95f941c4 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -139,6 +139,27 @@ out: return rc; } +static void ecryptfs_vma_close(struct vm_area_struct *vma) +{ + filemap_write_and_wait(vma->vm_file->f_mapping); +} + +static const struct vm_operations_struct ecryptfs_file_vm_ops = { + .close = ecryptfs_vma_close, + .fault = filemap_fault, +}; + +static int ecryptfs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + int rc; + + rc = generic_file_mmap(file, vma); + if (!rc) + vma->vm_ops = &ecryptfs_file_vm_ops; + + return rc; +} + struct kmem_cache *ecryptfs_file_info_cache; /** @@ -349,7 +370,7 @@ const struct file_operations ecryptfs_main_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, #endif - .mmap = generic_file_mmap, + .mmap = ecryptfs_file_mmap, .open = ecryptfs_open, .flush = ecryptfs_flush, .release = ecryptfs_release, diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 11f8582d721..32f90a3ae63 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -172,22 +172,23 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode, * it. It will also update the eCryptfs directory inode to mimic the * stat of the lower directory inode. * - * Returns zero on success; non-zero on error condition + * Returns the new eCryptfs inode on success; an ERR_PTR on error condition */ -static int +static struct inode * ecryptfs_do_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, int mode) { int rc; struct dentry *lower_dentry; struct dentry *lower_dir_dentry; + struct inode *inode; lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); lower_dir_dentry = lock_parent(lower_dentry); if (IS_ERR(lower_dir_dentry)) { ecryptfs_printk(KERN_ERR, "Error locking directory of " "dentry\n"); - rc = PTR_ERR(lower_dir_dentry); + inode = ERR_CAST(lower_dir_dentry); goto out; } rc = ecryptfs_create_underlying_file(lower_dir_dentry->d_inode, @@ -195,20 +196,19 @@ ecryptfs_do_create(struct inode *directory_inode, if (rc) { printk(KERN_ERR "%s: Failure to create dentry in lower fs; " "rc = [%d]\n", __func__, rc); + inode = ERR_PTR(rc); goto out_lock; } - rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry, - directory_inode->i_sb); - if (rc) { - ecryptfs_printk(KERN_ERR, "Failure in ecryptfs_interpose\n"); + inode = __ecryptfs_get_inode(lower_dentry->d_inode, + directory_inode->i_sb); + if (IS_ERR(inode)) goto out_lock; - } fsstack_copy_attr_times(directory_inode, lower_dir_dentry->d_inode); fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode); out_lock: unlock_dir(lower_dir_dentry); out: - return rc; + return inode; } /** @@ -219,26 +219,26 @@ out: * * Returns zero on success */ -static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry) +static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry, + struct inode *ecryptfs_inode) { struct ecryptfs_crypt_stat *crypt_stat = - &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; + &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; int rc = 0; - if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { + if (S_ISDIR(ecryptfs_inode->i_mode)) { ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); goto out; } ecryptfs_printk(KERN_DEBUG, "Initializing crypto context\n"); - rc = ecryptfs_new_file_context(ecryptfs_dentry); + rc = ecryptfs_new_file_context(ecryptfs_inode); if (rc) { ecryptfs_printk(KERN_ERR, "Error creating new file " "context; rc = [%d]\n", rc); goto out; } - rc = ecryptfs_get_lower_file(ecryptfs_dentry, - ecryptfs_dentry->d_inode); + rc = ecryptfs_get_lower_file(ecryptfs_dentry, ecryptfs_inode); if (rc) { printk(KERN_ERR "%s: Error attempting to initialize " "the lower file for the dentry with name " @@ -246,10 +246,10 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry) ecryptfs_dentry->d_name.name, rc); goto out; } - rc = ecryptfs_write_metadata(ecryptfs_dentry); + rc = ecryptfs_write_metadata(ecryptfs_dentry, ecryptfs_inode); if (rc) printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc); - ecryptfs_put_lower_file(ecryptfs_dentry->d_inode); + ecryptfs_put_lower_file(ecryptfs_inode); out: return rc; } @@ -269,18 +269,28 @@ static int ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, int mode, struct nameidata *nd) { + struct inode *ecryptfs_inode; int rc; - /* ecryptfs_do_create() calls ecryptfs_interpose() */ - rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode); - if (unlikely(rc)) { + ecryptfs_inode = ecryptfs_do_create(directory_inode, ecryptfs_dentry, + mode); + if (unlikely(IS_ERR(ecryptfs_inode))) { ecryptfs_printk(KERN_WARNING, "Failed to create file in" "lower filesystem\n"); + rc = PTR_ERR(ecryptfs_inode); goto out; } /* At this point, a file exists on "disk"; we need to make sure * that this on disk file is prepared to be an ecryptfs file */ - rc = ecryptfs_initialize_file(ecryptfs_dentry); + rc = ecryptfs_initialize_file(ecryptfs_dentry, ecryptfs_inode); + if (rc) { + drop_nlink(ecryptfs_inode); + unlock_new_inode(ecryptfs_inode); + iput(ecryptfs_inode); + goto out; + } + d_instantiate(ecryptfs_dentry, ecryptfs_inode); + unlock_new_inode(ecryptfs_inode); out: return rc; } @@ -474,8 +484,8 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir, goto out_lock; fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); - old_dentry->d_inode->i_nlink = - ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink; + set_nlink(old_dentry->d_inode, + ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink); i_size_write(new_dentry->d_inode, file_size_save); out_lock: unlock_dir(lower_dir_dentry); @@ -499,8 +509,8 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry) goto out_unlock; } fsstack_copy_attr_times(dir, lower_dir_inode); - dentry->d_inode->i_nlink = - ecryptfs_inode_to_lower(dentry->d_inode)->i_nlink; + set_nlink(dentry->d_inode, + ecryptfs_inode_to_lower(dentry->d_inode)->i_nlink); dentry->d_inode->i_ctime = dir->i_ctime; d_drop(dentry); out_unlock: @@ -565,7 +575,7 @@ static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) goto out; fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); - dir->i_nlink = lower_dir_dentry->d_inode->i_nlink; + set_nlink(dir, lower_dir_dentry->d_inode->i_nlink); out: unlock_dir(lower_dir_dentry); if (!dentry->d_inode) @@ -588,7 +598,7 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) if (!rc && dentry->d_inode) clear_nlink(dentry->d_inode); fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); - dir->i_nlink = lower_dir_dentry->d_inode->i_nlink; + set_nlink(dir, lower_dir_dentry->d_inode->i_nlink); unlock_dir(lower_dir_dentry); if (!rc) d_drop(dentry); diff --git a/fs/efs/inode.c b/fs/efs/inode.c index 9c13412e6c9..bc84f365d75 100644 --- a/fs/efs/inode.c +++ b/fs/efs/inode.c @@ -96,7 +96,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) efs_inode = (struct efs_dinode *) (bh->b_data + offset); inode->i_mode = be16_to_cpu(efs_inode->di_mode); - inode->i_nlink = be16_to_cpu(efs_inode->di_nlink); + set_nlink(inode, be16_to_cpu(efs_inode->di_nlink)); inode->i_uid = (uid_t)be16_to_cpu(efs_inode->di_uid); inode->i_gid = (gid_t)be16_to_cpu(efs_inode->di_gid); inode->i_size = be32_to_cpu(efs_inode->di_size); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index fe047d966dc..828e750af23 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -70,6 +70,15 @@ * simultaneous inserts (A into B and B into A) from racing and * constructing a cycle without either insert observing that it is * going to. + * It is necessary to acquire multiple "ep->mtx"es at once in the + * case when one epoll fd is added to another. In this case, we + * always acquire the locks in the order of nesting (i.e. after + * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired + * before e2->mtx). Since we disallow cycles of epoll file + * descriptors, this ensures that the mutexes are well-ordered. In + * order to communicate this nesting to lockdep, when walking a tree + * of epoll file descriptors, we use the current recursion depth as + * the lockdep subkey. * It is possible to drop the "ep->mtx" and to use the global * mutex "epmutex" (together with "ep->lock") to have it working, * but having "ep->mtx" will make the interface more scalable. @@ -464,13 +473,15 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) * @ep: Pointer to the epoll private data structure. * @sproc: Pointer to the scan callback. * @priv: Private opaque data passed to the @sproc callback. + * @depth: The current depth of recursive f_op->poll calls. * * Returns: The same integer error code returned by the @sproc callback. */ static int ep_scan_ready_list(struct eventpoll *ep, int (*sproc)(struct eventpoll *, struct list_head *, void *), - void *priv) + void *priv, + int depth) { int error, pwake = 0; unsigned long flags; @@ -481,7 +492,7 @@ static int ep_scan_ready_list(struct eventpoll *ep, * We need to lock this because we could be hit by * eventpoll_release_file() and epoll_ctl(). */ - mutex_lock(&ep->mtx); + mutex_lock_nested(&ep->mtx, depth); /* * Steal the ready list, and re-init the original one to the @@ -670,7 +681,7 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests) { - return ep_scan_ready_list(priv, ep_read_events_proc, NULL); + return ep_scan_ready_list(priv, ep_read_events_proc, NULL, call_nests + 1); } static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) @@ -700,7 +711,7 @@ static const struct file_operations eventpoll_fops = { .llseek = noop_llseek, }; -/* Fast test to see if the file is an evenpoll file */ +/* Fast test to see if the file is an eventpoll file */ static inline int is_file_epoll(struct file *f) { return f->f_op == &eventpoll_fops; @@ -737,7 +748,7 @@ void eventpoll_release_file(struct file *file) ep = epi->ep; list_del_init(&epi->fllink); - mutex_lock(&ep->mtx); + mutex_lock_nested(&ep->mtx, 0); ep_remove(ep, epi); mutex_unlock(&ep->mtx); } @@ -1134,7 +1145,7 @@ static int ep_send_events(struct eventpoll *ep, esed.maxevents = maxevents; esed.events = events; - return ep_scan_ready_list(ep, ep_send_events_proc, &esed); + return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0); } static inline struct timespec ep_set_mstimeout(long ms) @@ -1267,7 +1278,7 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) struct rb_node *rbp; struct epitem *epi; - mutex_lock(&ep->mtx); + mutex_lock_nested(&ep->mtx, call_nests + 1); for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (unlikely(is_file_epoll(epi->ffd.file))) { @@ -1409,7 +1420,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, } - mutex_lock(&ep->mtx); + mutex_lock_nested(&ep->mtx, 0); /* * Try to lookup the file inside our RB tree, Since we grabbed "mtx" diff --git a/fs/exec.c b/fs/exec.c index 25dcbe5fc35..36254645b7c 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -841,10 +841,6 @@ static int exec_mmap(struct mm_struct *mm) tsk->mm = mm; tsk->active_mm = mm; activate_mm(active_mm, mm); - if (old_mm && tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { - atomic_dec(&old_mm->oom_disable_count); - atomic_inc(&tsk->mm->oom_disable_count); - } task_unlock(tsk); arch_pick_mmap_layout(mm); if (old_mm) { diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild index c5a5855a6c4..352ba149d23 100644 --- a/fs/exofs/Kbuild +++ b/fs/exofs/Kbuild @@ -13,7 +13,8 @@ # # ore module library -obj-$(CONFIG_ORE) += ore.o +libore-y := ore.o ore_raid.o +obj-$(CONFIG_ORE) += libore.o exofs-y := inode.o file.o symlink.o namei.o dir.o super.o obj-$(CONFIG_EXOFS_FS) += exofs.o diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig index 70bae414929..da42f32c49b 100644 --- a/fs/exofs/Kconfig +++ b/fs/exofs/Kconfig @@ -1,10 +1,17 @@ +# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects +# for every ORE user we do it like this. Any user should add itself here +# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are +# selected here, and we default to "ON". So in effect it is like been +# selected by any of the users. config ORE tristate + depends on EXOFS_FS || PNFS_OBJLAYOUT + select ASYNC_XOR + default SCSI_OSD_ULD config EXOFS_FS tristate "exofs: OSD based file system support" depends on SCSI_OSD_ULD - select ORE help EXOFS is a file system that uses an OSD storage device, as its backing storage. diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index f4e442ec744..51f4b4c40f0 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -53,6 +53,10 @@ /* u64 has problems with printk this will cast it to unsigned long long */ #define _LLU(x) (unsigned long long)(x) +struct exofs_dev { + struct ore_dev ored; + unsigned did; +}; /* * our extension to the in-memory superblock */ @@ -66,13 +70,9 @@ struct exofs_sb_info { u32 s_next_generation; /* next gen # to use */ atomic_t s_curr_pending; /* number of pending commands */ - struct pnfs_osd_data_map data_map; /* Default raid to use - * FIXME: Needed ? - */ struct ore_layout layout; /* Default files layout */ struct ore_comp one_comp; /* id & cred of partition id=0*/ - struct ore_components comps; /* comps for the partition */ - struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */ + struct ore_components oc; /* comps for the partition */ }; /* @@ -86,7 +86,7 @@ struct exofs_i_info { uint32_t i_dir_start_lookup; /* which page to start lookup */ uint64_t i_commit_size; /* the object's written length */ struct ore_comp one_comp; /* same component for all devices */ - struct ore_components comps; /* inode view of the device table */ + struct ore_components oc; /* inode view of the device table */ }; static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) @@ -207,7 +207,7 @@ extern const struct inode_operations exofs_fast_symlink_inode_operations; * bigger and that the device table repeats twice. * See: exofs_read_lookup_dev_table() */ -static inline void exofs_init_comps(struct ore_components *comps, +static inline void exofs_init_comps(struct ore_components *oc, struct ore_comp *one_comp, struct exofs_sb_info *sbi, osd_id oid) { @@ -217,13 +217,15 @@ static inline void exofs_init_comps(struct ore_components *comps, one_comp->obj.id = oid; exofs_make_credential(one_comp->cred, &one_comp->obj); - comps->numdevs = sbi->comps.numdevs; - comps->single_comp = EC_SINGLE_COMP; - comps->comps = one_comp; + oc->first_dev = 0; + oc->numdevs = sbi->layout.group_width * sbi->layout.mirrors_p1 * + sbi->layout.group_count; + oc->single_comp = EC_SINGLE_COMP; + oc->comps = one_comp; /* Round robin device view of the table */ - first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->comps.numdevs; - comps->ods = sbi->comps.ods + first_dev; + first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->oc.numdevs; + oc->ods = &sbi->oc.ods[first_dev]; } #endif diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index f39a38fc234..f6dbf7768ce 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -37,11 +37,7 @@ #define EXOFS_DBGMSG2(M...) do {} while (0) -enum { BIO_MAX_PAGES_KMALLOC = - (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), - MAX_PAGES_KMALLOC = - PAGE_SIZE / sizeof(struct page *), -}; +enum {MAX_PAGES_KMALLOC = PAGE_SIZE / sizeof(struct page *), }; unsigned exofs_max_io_pages(struct ore_layout *layout, unsigned expected_pages) @@ -49,8 +45,7 @@ unsigned exofs_max_io_pages(struct ore_layout *layout, unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC); /* TODO: easily support bio chaining */ - pages = min_t(unsigned, pages, - layout->group_width * BIO_MAX_PAGES_KMALLOC); + pages = min_t(unsigned, pages, layout->max_io_length / PAGE_SIZE); return pages; } @@ -68,6 +63,7 @@ struct page_collect { bool read_4_write; /* This means two things: that the read is sync * And the pages should not be unlocked. */ + struct page *that_locked_page; }; static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, @@ -86,6 +82,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, pcol->length = 0; pcol->pg_first = -1; pcol->read_4_write = false; + pcol->that_locked_page = NULL; } static void _pcol_reset(struct page_collect *pcol) @@ -98,6 +95,7 @@ static void _pcol_reset(struct page_collect *pcol) pcol->length = 0; pcol->pg_first = -1; pcol->ios = NULL; + pcol->that_locked_page = NULL; /* this is probably the end of the loop but in writes * it might not end here. don't be left with nothing @@ -149,14 +147,17 @@ static int pcol_add_page(struct page_collect *pcol, struct page *page, return 0; } +enum {PAGE_WAS_NOT_IN_IO = 17}; static int update_read_page(struct page *page, int ret) { - if (ret == 0) { + switch (ret) { + case 0: /* Everything is OK */ SetPageUptodate(page); if (PageError(page)) ClearPageError(page); - } else if (ret == -EFAULT) { + break; + case -EFAULT: /* In this case we were trying to read something that wasn't on * disk yet - return a page full of zeroes. This should be OK, * because the object should be empty (if there was a write @@ -167,16 +168,22 @@ static int update_read_page(struct page *page, int ret) SetPageUptodate(page); if (PageError(page)) ClearPageError(page); - ret = 0; /* recovered error */ EXOFS_DBGMSG("recovered read error\n"); - } else /* Error */ + /* fall through */ + case PAGE_WAS_NOT_IN_IO: + ret = 0; /* recovered error */ + break; + default: SetPageError(page); - + } return ret; } static void update_write_page(struct page *page, int ret) { + if (unlikely(ret == PAGE_WAS_NOT_IN_IO)) + return; /* don't pass start don't collect $200 */ + if (ret) { mapping_set_error(page->mapping, ret); SetPageError(page); @@ -190,15 +197,16 @@ static void update_write_page(struct page *page, int ret) static int __readpages_done(struct page_collect *pcol) { int i; - u64 resid; u64 good_bytes; u64 length = 0; - int ret = ore_check_io(pcol->ios, &resid); + int ret = ore_check_io(pcol->ios, NULL); - if (likely(!ret)) + if (likely(!ret)) { good_bytes = pcol->length; - else - good_bytes = pcol->length - resid; + ret = PAGE_WAS_NOT_IN_IO; + } else { + good_bytes = 0; + } EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx" " length=0x%lx nr_pages=%u\n", @@ -259,6 +267,46 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) } } +static int _maybe_not_all_in_one_io(struct ore_io_state *ios, + struct page_collect *pcol_src, struct page_collect *pcol) +{ + /* length was wrong or offset was not page aligned */ + BUG_ON(pcol_src->nr_pages < ios->nr_pages); + + if (pcol_src->nr_pages > ios->nr_pages) { + struct page **src_page; + unsigned pages_less = pcol_src->nr_pages - ios->nr_pages; + unsigned long len_less = pcol_src->length - ios->length; + unsigned i; + int ret; + + /* This IO was trimmed */ + pcol_src->nr_pages = ios->nr_pages; + pcol_src->length = ios->length; + + /* Left over pages are passed to the next io */ + pcol->expected_pages += pages_less; + pcol->nr_pages = pages_less; + pcol->length = len_less; + src_page = pcol_src->pages + pcol_src->nr_pages; + pcol->pg_first = (*src_page)->index; + + ret = pcol_try_alloc(pcol); + if (unlikely(ret)) + return ret; + + for (i = 0; i < pages_less; ++i) + pcol->pages[i] = *src_page++; + + EXOFS_DBGMSG("Length was adjusted nr_pages=0x%x " + "pages_less=0x%x expected_pages=0x%x " + "next_offset=0x%llx next_len=0x%lx\n", + pcol_src->nr_pages, pages_less, pcol->expected_pages, + pcol->pg_first * PAGE_SIZE, pcol->length); + } + return 0; +} + static int read_exec(struct page_collect *pcol) { struct exofs_i_info *oi = exofs_i(pcol->inode); @@ -270,7 +318,7 @@ static int read_exec(struct page_collect *pcol) return 0; if (!pcol->ios) { - int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, true, + int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, true, pcol->pg_first << PAGE_CACHE_SHIFT, pcol->length, &pcol->ios); @@ -280,7 +328,6 @@ static int read_exec(struct page_collect *pcol) ios = pcol->ios; ios->pages = pcol->pages; - ios->nr_pages = pcol->nr_pages; if (pcol->read_4_write) { ore_read(pcol->ios); @@ -296,17 +343,23 @@ static int read_exec(struct page_collect *pcol) *pcol_copy = *pcol; ios->done = readpages_done; ios->private = pcol_copy; + + /* pages ownership was passed to pcol_copy */ + _pcol_reset(pcol); + + ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol); + if (unlikely(ret)) + goto err; + + EXOFS_DBGMSG2("read_exec(0x%lx) offset=0x%llx length=0x%llx\n", + pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length)); + ret = ore_read(ios); if (unlikely(ret)) goto err; atomic_inc(&pcol->sbi->s_curr_pending); - EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", - oi->one_comp.obj.id, _LLU(ios->offset), pcol->length); - - /* pages ownership was passed to pcol_copy */ - _pcol_reset(pcol); return 0; err: @@ -341,6 +394,8 @@ static int readpage_strip(void *data, struct page *page) EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino, page->index); + pcol->that_locked_page = page; + if (page->index < end_index) len = PAGE_CACHE_SIZE; else if (page->index == end_index) @@ -429,6 +484,10 @@ static int exofs_readpages(struct file *file, struct address_space *mapping, return ret; } + ret = read_exec(&pcol); + if (unlikely(ret)) + return ret; + return read_exec(&pcol); } @@ -462,17 +521,18 @@ static void writepages_done(struct ore_io_state *ios, void *p) { struct page_collect *pcol = p; int i; - u64 resid; u64 good_bytes; u64 length = 0; - int ret = ore_check_io(ios, &resid); + int ret = ore_check_io(ios, NULL); atomic_dec(&pcol->sbi->s_curr_pending); - if (likely(!ret)) + if (likely(!ret)) { good_bytes = pcol->length; - else - good_bytes = pcol->length - resid; + ret = PAGE_WAS_NOT_IN_IO; + } else { + good_bytes = 0; + } EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx" " length=0x%lx nr_pages=%u\n", @@ -505,6 +565,56 @@ static void writepages_done(struct ore_io_state *ios, void *p) EXOFS_DBGMSG2("writepages_done END\n"); } +static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) +{ + struct page_collect *pcol = priv; + pgoff_t index = offset / PAGE_SIZE; + + if (!pcol->that_locked_page || + (pcol->that_locked_page->index != index)) { + struct page *page = find_get_page(pcol->inode->i_mapping, index); + + if (!page) { + page = find_or_create_page(pcol->inode->i_mapping, + index, GFP_NOFS); + if (unlikely(!page)) { + EXOFS_DBGMSG("grab_cache_page Failed " + "index=0x%llx\n", _LLU(index)); + return NULL; + } + unlock_page(page); + } + if (PageDirty(page) || PageWriteback(page)) + *uptodate = true; + else + *uptodate = PageUptodate(page); + EXOFS_DBGMSG("index=0x%lx uptodate=%d\n", index, *uptodate); + return page; + } else { + EXOFS_DBGMSG("YES that_locked_page index=0x%lx\n", + pcol->that_locked_page->index); + *uptodate = true; + return pcol->that_locked_page; + } +} + +static void __r4w_put_page(void *priv, struct page *page) +{ + struct page_collect *pcol = priv; + + if (pcol->that_locked_page != page) { + EXOFS_DBGMSG("index=0x%lx\n", page->index); + page_cache_release(page); + return; + } + EXOFS_DBGMSG("that_locked_page index=0x%lx\n", page->index); +} + +static const struct _ore_r4w_op _r4w_op = { + .get_page = &__r4w_get_page, + .put_page = &__r4w_put_page, +}; + static int write_exec(struct page_collect *pcol) { struct exofs_i_info *oi = exofs_i(pcol->inode); @@ -516,10 +626,9 @@ static int write_exec(struct page_collect *pcol) return 0; BUG_ON(pcol->ios); - ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, false, + ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false, pcol->pg_first << PAGE_CACHE_SHIFT, pcol->length, &pcol->ios); - if (unlikely(ret)) goto err; @@ -534,10 +643,20 @@ static int write_exec(struct page_collect *pcol) ios = pcol->ios; ios->pages = pcol_copy->pages; - ios->nr_pages = pcol_copy->nr_pages; ios->done = writepages_done; + ios->r4w = &_r4w_op; ios->private = pcol_copy; + /* pages ownership was passed to pcol_copy */ + _pcol_reset(pcol); + + ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol); + if (unlikely(ret)) + goto err; + + EXOFS_DBGMSG2("write_exec(0x%lx) offset=0x%llx length=0x%llx\n", + pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length)); + ret = ore_write(ios); if (unlikely(ret)) { EXOFS_ERR("write_exec: ore_write() Failed\n"); @@ -545,11 +664,6 @@ static int write_exec(struct page_collect *pcol) } atomic_inc(&pcol->sbi->s_curr_pending); - EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n", - pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset), - pcol->length); - /* pages ownership was passed to pcol_copy */ - _pcol_reset(pcol); return 0; err: @@ -689,14 +803,33 @@ static int exofs_writepages(struct address_space *mapping, _pcol_init(&pcol, expected_pages, mapping->host); ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol); - if (ret) { + if (unlikely(ret)) { EXOFS_ERR("write_cache_pages => %d\n", ret); return ret; } - return write_exec(&pcol); + ret = write_exec(&pcol); + if (unlikely(ret)) + return ret; + + if (wbc->sync_mode == WB_SYNC_ALL) { + return write_exec(&pcol); /* pump the last reminder */ + } else if (pcol.nr_pages) { + /* not SYNC let the reminder join the next writeout */ + unsigned i; + + for (i = 0; i < pcol.nr_pages; i++) { + struct page *page = pcol.pages[i]; + + end_page_writeback(page); + set_page_dirty(page); + unlock_page(page); + } + } + return 0; } +/* static int exofs_writepage(struct page *page, struct writeback_control *wbc) { struct page_collect pcol; @@ -712,7 +845,7 @@ static int exofs_writepage(struct page *page, struct writeback_control *wbc) return write_exec(&pcol); } - +*/ /* i_mutex held using inode->i_size directly */ static void _write_failed(struct inode *inode, loff_t to) { @@ -818,7 +951,7 @@ static void exofs_invalidatepage(struct page *page, unsigned long offset) const struct address_space_operations exofs_aops = { .readpage = exofs_readpage, .readpages = exofs_readpages, - .writepage = exofs_writepage, + .writepage = NULL, .writepages = exofs_writepages, .write_begin = exofs_write_begin_export, .write_end = exofs_write_end, @@ -860,7 +993,7 @@ static int _do_truncate(struct inode *inode, loff_t newsize) inode->i_mtime = inode->i_ctime = CURRENT_TIME; - ret = ore_truncate(&sbi->layout, &oi->comps, (u64)newsize); + ret = ore_truncate(&sbi->layout, &oi->oc, (u64)newsize); if (likely(!ret)) truncate_setsize(inode, newsize); @@ -927,14 +1060,14 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, struct exofs_on_disk_inode_layout *layout; int ret; - ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); return ret; } - attrs[1].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs); - attrs[2].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs); + attrs[1].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs); + attrs[2].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs); ios->in_attr = attrs; ios->in_attr_len = ARRAY_SIZE(attrs); @@ -1018,7 +1151,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) return inode; oi = exofs_i(inode); __oi_init(oi); - exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info, + exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info, exofs_oi_objno(oi)); /* read the inode from the osd */ @@ -1032,7 +1165,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) inode->i_mode = le16_to_cpu(fcb.i_mode); inode->i_uid = le32_to_cpu(fcb.i_uid); inode->i_gid = le32_to_cpu(fcb.i_gid); - inode->i_nlink = le16_to_cpu(fcb.i_links_count); + set_nlink(inode, le16_to_cpu(fcb.i_links_count)); inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime); inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime); inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime); @@ -1172,13 +1305,13 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) spin_unlock(&sbi->s_next_gen_lock); insert_inode_hash(inode); - exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info, + exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info, exofs_oi_objno(oi)); exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */ mark_inode_dirty(inode); - ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n"); return ERR_PTR(ret); @@ -1267,7 +1400,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync) } else memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); - ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); goto free_args; @@ -1350,7 +1483,7 @@ void exofs_evict_inode(struct inode *inode) /* ignore the error, attempt a remove anyway */ /* Now Remove the OSD objects */ - ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: ore_get_io_state failed\n", __func__); return; diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 25305af8819..d271ad83720 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -23,77 +23,289 @@ */ #include <linux/slab.h> +#include <linux/module.h> #include <asm/div64.h> +#include <linux/lcm.h> -#include <scsi/osd_ore.h> +#include "ore_raid.h" -#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a) +MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>"); +MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); +MODULE_LICENSE("GPL"); + +/* ore_verify_layout does a couple of things: + * 1. Given a minimum number of needed parameters fixes up the rest of the + * members to be operatonals for the ore. The needed parameters are those + * that are defined by the pnfs-objects layout STD. + * 2. Check to see if the current ore code actually supports these parameters + * for example stripe_unit must be a multple of the system PAGE_SIZE, + * and etc... + * 3. Cache some havily used calculations that will be needed by users. + */ + +enum { BIO_MAX_PAGES_KMALLOC = + (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),}; -#ifdef CONFIG_EXOFS_DEBUG -#define ORE_DBGMSG(fmt, a...) \ - printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a) -#else -#define ORE_DBGMSG(fmt, a...) \ - do { if (0) printk(fmt, ##a); } while (0) -#endif +int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) +{ + u64 stripe_length; + + switch (layout->raid_algorithm) { + case PNFS_OSD_RAID_0: + layout->parity = 0; + break; + case PNFS_OSD_RAID_5: + layout->parity = 1; + break; + case PNFS_OSD_RAID_PQ: + case PNFS_OSD_RAID_4: + default: + ORE_ERR("Only RAID_0/5 for now\n"); + return -EINVAL; + } + if (0 != (layout->stripe_unit & ~PAGE_MASK)) { + ORE_ERR("Stripe Unit(0x%llx)" + " must be Multples of PAGE_SIZE(0x%lx)\n", + _LLU(layout->stripe_unit), PAGE_SIZE); + return -EINVAL; + } + if (layout->group_width) { + if (!layout->group_depth) { + ORE_ERR("group_depth == 0 && group_width != 0\n"); + return -EINVAL; + } + if (total_comps < (layout->group_width * layout->mirrors_p1)) { + ORE_ERR("Data Map wrong, " + "numdevs=%d < group_width=%d * mirrors=%d\n", + total_comps, layout->group_width, + layout->mirrors_p1); + return -EINVAL; + } + layout->group_count = total_comps / layout->mirrors_p1 / + layout->group_width; + } else { + if (layout->group_depth) { + printk(KERN_NOTICE "Warning: group_depth ignored " + "group_width == 0 && group_depth == %lld\n", + _LLU(layout->group_depth)); + } + layout->group_width = total_comps / layout->mirrors_p1; + layout->group_depth = -1; + layout->group_count = 1; + } -/* u64 has problems with printk this will cast it to unsigned long long */ -#define _LLU(x) (unsigned long long)(x) + stripe_length = (u64)layout->group_width * layout->stripe_unit; + if (stripe_length >= (1ULL << 32)) { + ORE_ERR("Stripe_length(0x%llx) >= 32bit is not supported\n", + _LLU(stripe_length)); + return -EINVAL; + } -#define ORE_DBGMSG2(M...) do {} while (0) -/* #define ORE_DBGMSG2 ORE_DBGMSG */ + layout->max_io_length = + (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) * + layout->group_width; + if (layout->parity) { + unsigned stripe_length = + (layout->group_width - layout->parity) * + layout->stripe_unit; -MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>"); -MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); -MODULE_LICENSE("GPL"); + layout->max_io_length /= stripe_length; + layout->max_io_length *= stripe_length; + } + return 0; +} +EXPORT_SYMBOL(ore_verify_layout); static u8 *_ios_cred(struct ore_io_state *ios, unsigned index) { - return ios->comps->comps[index & ios->comps->single_comp].cred; + return ios->oc->comps[index & ios->oc->single_comp].cred; } static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index) { - return &ios->comps->comps[index & ios->comps->single_comp].obj; + return &ios->oc->comps[index & ios->oc->single_comp].obj; } static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) { - return ios->comps->ods[index]; + ORE_DBGMSG2("oc->first_dev=%d oc->numdevs=%d i=%d oc->ods=%p\n", + ios->oc->first_dev, ios->oc->numdevs, index, + ios->oc->ods); + + return ore_comp_dev(ios->oc, index); } -int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps, +int _ore_get_io_state(struct ore_layout *layout, + struct ore_components *oc, unsigned numdevs, + unsigned sgs_per_dev, unsigned num_par_pages, + struct ore_io_state **pios) +{ + struct ore_io_state *ios; + struct page **pages; + struct osd_sg_entry *sgilist; + struct __alloc_all_io_state { + struct ore_io_state ios; + struct ore_per_dev_state per_dev[numdevs]; + union { + struct osd_sg_entry sglist[sgs_per_dev * numdevs]; + struct page *pages[num_par_pages]; + }; + } *_aios; + + if (likely(sizeof(*_aios) <= PAGE_SIZE)) { + _aios = kzalloc(sizeof(*_aios), GFP_KERNEL); + if (unlikely(!_aios)) { + ORE_DBGMSG("Failed kzalloc bytes=%zd\n", + sizeof(*_aios)); + *pios = NULL; + return -ENOMEM; + } + pages = num_par_pages ? _aios->pages : NULL; + sgilist = sgs_per_dev ? _aios->sglist : NULL; + ios = &_aios->ios; + } else { + struct __alloc_small_io_state { + struct ore_io_state ios; + struct ore_per_dev_state per_dev[numdevs]; + } *_aio_small; + union __extra_part { + struct osd_sg_entry sglist[sgs_per_dev * numdevs]; + struct page *pages[num_par_pages]; + } *extra_part; + + _aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL); + if (unlikely(!_aio_small)) { + ORE_DBGMSG("Failed alloc first part bytes=%zd\n", + sizeof(*_aio_small)); + *pios = NULL; + return -ENOMEM; + } + extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL); + if (unlikely(!extra_part)) { + ORE_DBGMSG("Failed alloc second part bytes=%zd\n", + sizeof(*extra_part)); + kfree(_aio_small); + *pios = NULL; + return -ENOMEM; + } + + pages = num_par_pages ? extra_part->pages : NULL; + sgilist = sgs_per_dev ? extra_part->sglist : NULL; + /* In this case the per_dev[0].sgilist holds the pointer to + * be freed + */ + ios = &_aio_small->ios; + ios->extra_part_alloc = true; + } + + if (pages) { + ios->parity_pages = pages; + ios->max_par_pages = num_par_pages; + } + if (sgilist) { + unsigned d; + + for (d = 0; d < numdevs; ++d) { + ios->per_dev[d].sglist = sgilist; + sgilist += sgs_per_dev; + } + ios->sgs_per_dev = sgs_per_dev; + } + + ios->layout = layout; + ios->oc = oc; + *pios = ios; + return 0; +} + +/* Allocate an io_state for only a single group of devices + * + * If a user needs to call ore_read/write() this version must be used becase it + * allocates extra stuff for striping and raid. + * The ore might decide to only IO less then @length bytes do to alignmets + * and constrains as follows: + * - The IO cannot cross group boundary. + * - In raid5/6 The end of the IO must align at end of a stripe eg. + * (@offset + @length) % strip_size == 0. Or the complete range is within a + * single stripe. + * - Memory condition only permitted a shorter IO. (A user can use @length=~0 + * And check the returned ios->length for max_io_size.) + * + * The caller must check returned ios->length (and/or ios->nr_pages) and + * re-issue these pages that fall outside of ios->length + */ +int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, bool is_reading, u64 offset, u64 length, struct ore_io_state **pios) { struct ore_io_state *ios; + unsigned numdevs = layout->group_width * layout->mirrors_p1; + unsigned sgs_per_dev = 0, max_par_pages = 0; + int ret; - /*TODO: Maybe use kmem_cach per sbi of size - * exofs_io_state_size(layout->s_numdevs) - */ - ios = kzalloc(ore_io_state_size(comps->numdevs), GFP_KERNEL); - if (unlikely(!ios)) { - ORE_DBGMSG("Failed kzalloc bytes=%d\n", - ore_io_state_size(comps->numdevs)); - *pios = NULL; - return -ENOMEM; + if (layout->parity && length) { + unsigned data_devs = layout->group_width - layout->parity; + unsigned stripe_size = layout->stripe_unit * data_devs; + unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; + u32 remainder; + u64 num_stripes; + u64 num_raid_units; + + num_stripes = div_u64_rem(length, stripe_size, &remainder); + if (remainder) + ++num_stripes; + + num_raid_units = num_stripes * layout->parity; + + if (is_reading) { + /* For reads add per_dev sglist array */ + /* TODO: Raid 6 we need twice more. Actually: + * num_stripes / LCMdP(W,P); + * if (W%P != 0) num_stripes *= parity; + */ + + /* first/last seg is split */ + num_raid_units += layout->group_width; + sgs_per_dev = div_u64(num_raid_units, data_devs); + } else { + /* For Writes add parity pages array. */ + max_par_pages = num_raid_units * pages_in_unit * + sizeof(struct page *); + } } - ios->layout = layout; - ios->comps = comps; - ios->offset = offset; - ios->length = length; + ret = _ore_get_io_state(layout, oc, numdevs, sgs_per_dev, max_par_pages, + pios); + if (unlikely(ret)) + return ret; + + ios = *pios; ios->reading = is_reading; + ios->offset = offset; + + if (length) { + ore_calc_stripe_info(layout, offset, length, &ios->si); + ios->length = ios->si.length; + ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE; + if (layout->parity) + _ore_post_alloc_raid_stuff(ios); + } - *pios = ios; return 0; } EXPORT_SYMBOL(ore_get_rw_state); -int ore_get_io_state(struct ore_layout *layout, struct ore_components *comps, - struct ore_io_state **ios) +/* Allocate an io_state for all the devices in the comps array + * + * This version of io_state allocation is used mostly by create/remove + * and trunc where we currently need all the devices. The only wastful + * bit is the read/write_attributes with no IO. Those sites should + * be converted to use ore_get_rw_state() with length=0 + */ +int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc, + struct ore_io_state **pios) { - return ore_get_rw_state(layout, comps, true, 0, 0, ios); + return _ore_get_io_state(layout, oc, oc->numdevs, 0, 0, pios); } EXPORT_SYMBOL(ore_get_io_state); @@ -111,6 +323,7 @@ void ore_put_io_state(struct ore_io_state *ios) bio_put(per_dev->bio); } + _ore_free_raid_stuff(ios); kfree(ios); } } @@ -138,7 +351,7 @@ static void _done_io(struct osd_request *or, void *p) kref_put(&ios->kref, _last_io); } -static int ore_io_execute(struct ore_io_state *ios) +int ore_io_execute(struct ore_io_state *ios) { DECLARE_COMPLETION_ONSTACK(wait); bool sync = (ios->done == NULL); @@ -198,7 +411,7 @@ static void _clear_bio(struct bio *bio) } } -int ore_check_io(struct ore_io_state *ios, u64 *resid) +int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error) { enum osd_err_priority acumulated_osd_err = 0; int acumulated_lin_err = 0; @@ -206,7 +419,8 @@ int ore_check_io(struct ore_io_state *ios, u64 *resid) for (i = 0; i < ios->numdevs; i++) { struct osd_sense_info osi; - struct osd_request *or = ios->per_dev[i].or; + struct ore_per_dev_state *per_dev = &ios->per_dev[i]; + struct osd_request *or = per_dev->or; int ret; if (unlikely(!or)) @@ -218,29 +432,31 @@ int ore_check_io(struct ore_io_state *ios, u64 *resid) if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { /* start read offset passed endof file */ - _clear_bio(ios->per_dev[i].bio); + _clear_bio(per_dev->bio); ORE_DBGMSG("start read offset passed end of file " "offset=0x%llx, length=0x%llx\n", - _LLU(ios->per_dev[i].offset), - _LLU(ios->per_dev[i].length)); + _LLU(per_dev->offset), + _LLU(per_dev->length)); continue; /* we recovered */ } + if (on_dev_error) { + u64 residual = ios->reading ? + or->in.residual : or->out.residual; + u64 offset = (ios->offset + ios->length) - residual; + struct ore_dev *od = ios->oc->ods[ + per_dev->dev - ios->oc->first_dev]; + + on_dev_error(ios, od, per_dev->dev, osi.osd_err_pri, + offset, residual); + } if (osi.osd_err_pri >= acumulated_osd_err) { acumulated_osd_err = osi.osd_err_pri; acumulated_lin_err = ret; } } - /* TODO: raid specific residual calculations */ - if (resid) { - if (likely(!acumulated_lin_err)) - *resid = 0; - else - *resid = ios->length; - } - return acumulated_lin_err; } EXPORT_SYMBOL(ore_check_io); @@ -248,61 +464,65 @@ EXPORT_SYMBOL(ore_check_io); /* * L - logical offset into the file * - * U - The number of bytes in a stripe within a group + * D - number of Data devices + * D = group_width - parity * - * U = stripe_unit * group_width + * U - The number of bytes in a stripe within a group + * U = stripe_unit * D * * T - The number of bytes striped within a group of component objects * (before advancing to the next group) - * - * T = stripe_unit * group_width * group_depth + * T = U * group_depth * * S - The number of bytes striped across all component objects * before the pattern repeats + * S = T * group_count * - * S = stripe_unit * group_width * group_depth * group_count - * - * M - The "major" (i.e., across all components) stripe number - * + * M - The "major" (i.e., across all components) cycle number * M = L / S * - * G - Counts the groups from the beginning of the major stripe - * + * G - Counts the groups from the beginning of the major cycle * G = (L - (M * S)) / T [or (L % S) / T] * * H - The byte offset within the group - * * H = (L - (M * S)) % T [or (L % S) % T] * * N - The "minor" (i.e., across the group) stripe number - * * N = H / U * * C - The component index coresponding to L * - * C = (H - (N * U)) / stripe_unit + G * group_width - * [or (L % U) / stripe_unit + G * group_width] + * C = (H - (N * U)) / stripe_unit + G * D + * [or (L % U) / stripe_unit + G * D] * * O - The component offset coresponding to L - * * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit + * + * LCMdP – Parity cycle: Lowest Common Multiple of group_width, parity + * divide by parity + * LCMdP = lcm(group_width, parity) / parity + * + * R - The parity Rotation stripe + * (Note parity cycle always starts at a group's boundary) + * R = N % LCMdP + * + * I = the first parity device index + * I = (group_width + group_width - R*parity - parity) % group_width + * + * Craid - The component index Rotated + * Craid = (group_width + C - R*parity) % group_width + * (We add the group_width to avoid negative numbers modulo math) */ -struct _striping_info { - u64 obj_offset; - u64 group_length; - u64 M; /* for truncate */ - unsigned dev; - unsigned unit_off; -}; - -static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset, - struct _striping_info *si) +void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, + u64 length, struct ore_striping_info *si) { u32 stripe_unit = layout->stripe_unit; u32 group_width = layout->group_width; u64 group_depth = layout->group_depth; + u32 parity = layout->parity; - u32 U = stripe_unit * group_width; + u32 D = group_width - parity; + u32 U = D * stripe_unit; u64 T = U * group_depth; u64 S = T * layout->group_count; u64 M = div64_u64(file_offset, S); @@ -318,39 +538,65 @@ static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset, u32 N = div_u64(H, U); /* "H - (N * U)" is just "H % U" so it's bound to u32 */ - si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; - si->dev *= layout->mirrors_p1; + u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width; div_u64_rem(file_offset, stripe_unit, &si->unit_off); si->obj_offset = si->unit_off + (N * stripe_unit) + (M * group_depth * stripe_unit); - si->group_length = T - H; + if (parity) { + u32 LCMdP = lcm(group_width, parity) / parity; + /* R = N % LCMdP; */ + u32 RxP = (N % LCMdP) * parity; + u32 first_dev = C - C % group_width; + + si->par_dev = (group_width + group_width - parity - RxP) % + group_width + first_dev; + si->dev = (group_width + C - RxP) % group_width + first_dev; + si->bytes_in_stripe = U; + si->first_stripe_start = M * S + G * T + N * U; + } else { + /* Make the math correct see _prepare_one_group */ + si->par_dev = group_width; + si->dev = C; + } + + si->dev *= layout->mirrors_p1; + si->par_dev *= layout->mirrors_p1; + si->offset = file_offset; + si->length = T - H; + if (si->length > length) + si->length = length; si->M = M; } +EXPORT_SYMBOL(ore_calc_stripe_info); -static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, - unsigned pgbase, struct ore_per_dev_state *per_dev, - int cur_len) +int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, + unsigned pgbase, struct page **pages, + struct ore_per_dev_state *per_dev, int cur_len) { unsigned pg = *cur_pg; struct request_queue *q = osd_request_queue(_ios_od(ios, per_dev->dev)); - - per_dev->length += cur_len; + unsigned len = cur_len; + int ret; if (per_dev->bio == NULL) { unsigned pages_in_stripe = ios->layout->group_width * (ios->layout->stripe_unit / PAGE_SIZE); - unsigned bio_size = (ios->nr_pages + pages_in_stripe) / - ios->layout->group_width; + unsigned nr_pages = ios->nr_pages * ios->layout->group_width / + (ios->layout->group_width - + ios->layout->parity); + unsigned bio_size = (nr_pages + pages_in_stripe) / + ios->layout->group_width; per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); if (unlikely(!per_dev->bio)) { ORE_DBGMSG("Failed to allocate BIO size=%u\n", bio_size); - return -ENOMEM; + ret = -ENOMEM; + goto out; } } @@ -358,64 +604,90 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); unsigned added_len; - BUG_ON(ios->nr_pages <= pg); cur_len -= pglen; - added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg], + added_len = bio_add_pc_page(q, per_dev->bio, pages[pg], pglen, pgbase); - if (unlikely(pglen != added_len)) - return -ENOMEM; + if (unlikely(pglen != added_len)) { + ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n", + per_dev->bio->bi_vcnt); + ret = -ENOMEM; + goto out; + } + _add_stripe_page(ios->sp2d, &ios->si, pages[pg]); + pgbase = 0; ++pg; } BUG_ON(cur_len); + per_dev->length += len; *cur_pg = pg; - return 0; + ret = 0; +out: /* we fail the complete unit on an error eg don't advance + * per_dev->length and cur_pg. This means that we might have a bigger + * bio than the CDB requested length (per_dev->length). That's fine + * only the oposite is fatal. + */ + return ret; } -static int _prepare_one_group(struct ore_io_state *ios, u64 length, - struct _striping_info *si) +static int _prepare_for_striping(struct ore_io_state *ios) { + struct ore_striping_info *si = &ios->si; unsigned stripe_unit = ios->layout->stripe_unit; unsigned mirrors_p1 = ios->layout->mirrors_p1; - unsigned devs_in_group = ios->layout->group_width * mirrors_p1; + unsigned group_width = ios->layout->group_width; + unsigned devs_in_group = group_width * mirrors_p1; unsigned dev = si->dev; unsigned first_dev = dev - (dev % devs_in_group); - unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0; + unsigned dev_order; unsigned cur_pg = ios->pages_consumed; + u64 length = ios->length; int ret = 0; + if (!ios->pages) { + ios->numdevs = ios->layout->mirrors_p1; + return 0; + } + + BUG_ON(length > si->length); + + dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev); + si->cur_comp = dev_order; + si->cur_pg = si->unit_off / PAGE_SIZE; + while (length) { - struct ore_per_dev_state *per_dev = &ios->per_dev[dev]; + unsigned comp = dev - first_dev; + struct ore_per_dev_state *per_dev = &ios->per_dev[comp]; unsigned cur_len, page_off = 0; if (!per_dev->length) { per_dev->dev = dev; - if (dev < si->dev) { - per_dev->offset = si->obj_offset + stripe_unit - - si->unit_off; - cur_len = stripe_unit; - } else if (dev == si->dev) { + if (dev == si->dev) { + WARN_ON(dev == si->par_dev); per_dev->offset = si->obj_offset; cur_len = stripe_unit - si->unit_off; page_off = si->unit_off & ~PAGE_MASK; BUG_ON(page_off && (page_off != ios->pgbase)); - } else { /* dev > si->dev */ - per_dev->offset = si->obj_offset - si->unit_off; + } else { + if (si->cur_comp > dev_order) + per_dev->offset = + si->obj_offset - si->unit_off; + else /* si->cur_comp < dev_order */ + per_dev->offset = + si->obj_offset + stripe_unit - + si->unit_off; cur_len = stripe_unit; } - - if (max_comp < dev) - max_comp = dev; } else { cur_len = stripe_unit; } if (cur_len >= length) cur_len = length; - ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, - cur_len); + ret = _ore_add_stripe_unit(ios, &cur_pg, page_off, ios->pages, + per_dev, cur_len); if (unlikely(ret)) goto out; @@ -423,60 +695,60 @@ static int _prepare_one_group(struct ore_io_state *ios, u64 length, dev = (dev % devs_in_group) + first_dev; length -= cur_len; - } -out: - ios->numdevs = max_comp + mirrors_p1; - ios->pages_consumed = cur_pg; - return ret; -} - -static int _prepare_for_striping(struct ore_io_state *ios) -{ - u64 length = ios->length; - u64 offset = ios->offset; - struct _striping_info si; - int ret = 0; - if (!ios->pages) { - if (ios->kern_buff) { - struct ore_per_dev_state *per_dev = &ios->per_dev[0]; + si->cur_comp = (si->cur_comp + 1) % group_width; + if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) { + if (!length && ios->sp2d) { + /* If we are writing and this is the very last + * stripe. then operate on parity dev. + */ + dev = si->par_dev; + } + if (ios->sp2d) + /* In writes cur_len just means if it's the + * last one. See _ore_add_parity_unit. + */ + cur_len = length; + per_dev = &ios->per_dev[dev - first_dev]; + if (!per_dev->length) { + /* Only/always the parity unit of the first + * stripe will be empty. So this is a chance to + * initialize the per_dev info. + */ + per_dev->dev = dev; + per_dev->offset = si->obj_offset - si->unit_off; + } - _calc_stripe_info(ios->layout, ios->offset, &si); - per_dev->offset = si.obj_offset; - per_dev->dev = si.dev; + ret = _ore_add_parity_unit(ios, si, per_dev, cur_len); + if (unlikely(ret)) + goto out; - /* no cross device without page array */ - BUG_ON((ios->layout->group_width > 1) && - (si.unit_off + ios->length > - ios->layout->stripe_unit)); + /* Rotate next par_dev backwards with wraping */ + si->par_dev = (devs_in_group + si->par_dev - + ios->layout->parity * mirrors_p1) % + devs_in_group + first_dev; + /* Next stripe, start fresh */ + si->cur_comp = 0; + si->cur_pg = 0; } - ios->numdevs = ios->layout->mirrors_p1; - return 0; - } - - while (length) { - _calc_stripe_info(ios->layout, offset, &si); - - if (length < si.group_length) - si.group_length = length; - - ret = _prepare_one_group(ios, si.group_length, &si); - if (unlikely(ret)) - goto out; - - offset += si.group_length; - length -= si.group_length; } - out: - return ret; + ios->numdevs = devs_in_group; + ios->pages_consumed = cur_pg; + if (unlikely(ret)) { + if (length == ios->length) + return ret; + else + ios->length -= length; + } + return 0; } int ore_create(struct ore_io_state *ios) { int i, ret; - for (i = 0; i < ios->comps->numdevs; i++) { + for (i = 0; i < ios->oc->numdevs; i++) { struct osd_request *or; or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); @@ -501,7 +773,7 @@ int ore_remove(struct ore_io_state *ios) { int i, ret; - for (i = 0; i < ios->comps->numdevs; i++) { + for (i = 0; i < ios->oc->numdevs; i++) { struct osd_request *or; or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); @@ -543,7 +815,6 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) goto out; } per_dev->or = or; - per_dev->offset = master_dev->offset; if (ios->pages) { struct bio *bio; @@ -562,6 +833,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) __bio_clone(bio, master_dev->bio); bio->bi_bdev = NULL; bio->bi_next = NULL; + per_dev->offset = master_dev->offset; per_dev->length = master_dev->length; per_dev->bio = bio; per_dev->dev = dev; @@ -579,7 +851,15 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) _LLU(per_dev->offset), _LLU(per_dev->length), dev); } else if (ios->kern_buff) { - ret = osd_req_write_kern(or, _ios_obj(ios, dev), + per_dev->offset = ios->si.obj_offset; + per_dev->dev = ios->si.dev + dev; + + /* no cross device without page array */ + BUG_ON((ios->layout->group_width > 1) && + (ios->si.unit_off + ios->length > + ios->layout->stripe_unit)); + + ret = osd_req_write_kern(or, _ios_obj(ios, per_dev->dev), per_dev->offset, ios->kern_buff, ios->length); if (unlikely(ret)) @@ -588,7 +868,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) "length=0x%llx dev=%d\n", _LLU(_ios_obj(ios, dev)->id), _LLU(per_dev->offset), - _LLU(ios->length), dev); + _LLU(ios->length), per_dev->dev); } else { osd_req_set_attributes(or, _ios_obj(ios, dev)); ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", @@ -614,6 +894,14 @@ int ore_write(struct ore_io_state *ios) int i; int ret; + if (unlikely(ios->sp2d && !ios->r4w)) { + /* A library is attempting a RAID-write without providing + * a pages lock interface. + */ + WARN_ON_ONCE(1); + return -ENOTSUPP; + } + ret = _prepare_for_striping(ios); if (unlikely(ret)) return ret; @@ -629,7 +917,7 @@ int ore_write(struct ore_io_state *ios) } EXPORT_SYMBOL(ore_write); -static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp) +int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp) { struct osd_request *or; struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; @@ -648,22 +936,27 @@ static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp) per_dev->or = or; if (ios->pages) { - osd_req_read(or, obj, per_dev->offset, - per_dev->bio, per_dev->length); + if (per_dev->cur_sg) { + /* finalize the last sg_entry */ + _ore_add_sg_seg(per_dev, 0, false); + if (unlikely(!per_dev->cur_sg)) + return 0; /* Skip parity only device */ + + osd_req_read_sg(or, obj, per_dev->bio, + per_dev->sglist, per_dev->cur_sg); + } else { + /* The no raid case */ + osd_req_read(or, obj, per_dev->offset, + per_dev->bio, per_dev->length); + } + ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" - " dev=%d\n", _LLU(obj->id), + " dev=%d sg_len=%d\n", _LLU(obj->id), _LLU(per_dev->offset), _LLU(per_dev->length), - first_dev); - } else if (ios->kern_buff) { - int ret = osd_req_read_kern(or, obj, per_dev->offset, - ios->kern_buff, ios->length); - ORE_DBGMSG2("read_kern(0x%llx) offset=0x%llx " - "length=0x%llx dev=%d ret=>%d\n", - _LLU(obj->id), _LLU(per_dev->offset), - _LLU(ios->length), first_dev, ret); - if (unlikely(ret)) - return ret; + first_dev, per_dev->cur_sg); } else { + BUG_ON(ios->kern_buff); + osd_req_get_attributes(or, obj); ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", _LLU(obj->id), @@ -688,7 +981,7 @@ int ore_read(struct ore_io_state *ios) return ret; for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { - ret = _read_mirror(ios, i); + ret = _ore_read_mirror(ios, i); if (unlikely(ret)) return ret; } @@ -744,31 +1037,29 @@ static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp, } struct _trunc_info { - struct _striping_info si; + struct ore_striping_info si; u64 prev_group_obj_off; u64 next_group_obj_off; unsigned first_group_dev; unsigned nex_group_dev; - unsigned max_devs; }; -void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, - struct _trunc_info *ti) +static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, + struct _trunc_info *ti) { unsigned stripe_unit = layout->stripe_unit; - _calc_stripe_info(layout, file_offset, &ti->si); + ore_calc_stripe_info(layout, file_offset, 0, &ti->si); ti->prev_group_obj_off = ti->si.M * stripe_unit; ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0; ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width); ti->nex_group_dev = ti->first_group_dev + layout->group_width; - ti->max_devs = layout->group_width * layout->group_count; } -int ore_truncate(struct ore_layout *layout, struct ore_components *comps, +int ore_truncate(struct ore_layout *layout, struct ore_components *oc, u64 size) { struct ore_io_state *ios; @@ -779,22 +1070,22 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps, struct _trunc_info ti; int i, ret; - ret = ore_get_io_state(layout, comps, &ios); + ret = ore_get_io_state(layout, oc, &ios); if (unlikely(ret)) return ret; _calc_trunk_info(ios->layout, size, &ti); - size_attrs = kcalloc(ti.max_devs, sizeof(*size_attrs), + size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs), GFP_KERNEL); if (unlikely(!size_attrs)) { ret = -ENOMEM; goto out; } - ios->numdevs = ios->comps->numdevs; + ios->numdevs = ios->oc->numdevs; - for (i = 0; i < ti.max_devs; ++i) { + for (i = 0; i < ios->numdevs; ++i) { struct exofs_trunc_attr *size_attr = &size_attrs[i]; u64 obj_size; @@ -815,7 +1106,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps, size_attr->attr.val_ptr = &size_attr->newsize; ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", - _LLU(comps->comps->obj.id), _LLU(obj_size), i); + _LLU(oc->comps->obj.id), _LLU(obj_size), i); ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, &size_attr->attr); if (unlikely(ret)) diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c new file mode 100644 index 00000000000..29c47e5c4a8 --- /dev/null +++ b/fs/exofs/ore_raid.c @@ -0,0 +1,660 @@ +/* + * Copyright (C) 2011 + * Boaz Harrosh <bharrosh@panasas.com> + * + * This file is part of the objects raid engine (ore). + * + * It is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License + * along with "ore". If not, write to the Free Software Foundation, Inc: + * "Free Software Foundation <info@fsf.org>" + */ + +#include <linux/gfp.h> +#include <linux/async_tx.h> + +#include "ore_raid.h" + +#undef ORE_DBGMSG2 +#define ORE_DBGMSG2 ORE_DBGMSG + +struct page *_raid_page_alloc(void) +{ + return alloc_page(GFP_KERNEL); +} + +void _raid_page_free(struct page *p) +{ + __free_page(p); +} + +/* This struct is forward declare in ore_io_state, but is private to here. + * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit. + * + * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn. + * Ascending page index access is sp2d(p-minor, c-major). But storage is + * sp2d[p-minor][c-major], so it can be properlly presented to the async-xor + * API. + */ +struct __stripe_pages_2d { + /* Cache some hot path repeated calculations */ + unsigned parity; + unsigned data_devs; + unsigned pages_in_unit; + + bool needed ; + + /* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */ + struct __1_page_stripe { + bool alloc; + unsigned write_count; + struct async_submit_ctl submit; + struct dma_async_tx_descriptor *tx; + + /* The size of this array is data_devs + parity */ + struct page **pages; + struct page **scribble; + /* bool array, size of this array is data_devs */ + char *page_is_read; + } _1p_stripes[]; +}; + +/* This can get bigger then a page. So support multiple page allocations + * _sp2d_free should be called even if _sp2d_alloc fails (by returning + * none-zero). + */ +static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width, + unsigned parity, struct __stripe_pages_2d **psp2d) +{ + struct __stripe_pages_2d *sp2d; + unsigned data_devs = group_width - parity; + struct _alloc_all_bytes { + struct __alloc_stripe_pages_2d { + struct __stripe_pages_2d sp2d; + struct __1_page_stripe _1p_stripes[pages_in_unit]; + } __asp2d; + struct __alloc_1p_arrays { + struct page *pages[group_width]; + struct page *scribble[group_width]; + char page_is_read[data_devs]; + } __a1pa[pages_in_unit]; + } *_aab; + struct __alloc_1p_arrays *__a1pa; + struct __alloc_1p_arrays *__a1pa_end; + const unsigned sizeof__a1pa = sizeof(_aab->__a1pa[0]); + unsigned num_a1pa, alloc_size, i; + + /* FIXME: check these numbers in ore_verify_layout */ + BUG_ON(sizeof(_aab->__asp2d) > PAGE_SIZE); + BUG_ON(sizeof__a1pa > PAGE_SIZE); + + if (sizeof(*_aab) > PAGE_SIZE) { + num_a1pa = (PAGE_SIZE - sizeof(_aab->__asp2d)) / sizeof__a1pa; + alloc_size = sizeof(_aab->__asp2d) + sizeof__a1pa * num_a1pa; + } else { + num_a1pa = pages_in_unit; + alloc_size = sizeof(*_aab); + } + + _aab = kzalloc(alloc_size, GFP_KERNEL); + if (unlikely(!_aab)) { + ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size); + return -ENOMEM; + } + + sp2d = &_aab->__asp2d.sp2d; + *psp2d = sp2d; /* From here Just call _sp2d_free */ + + __a1pa = _aab->__a1pa; + __a1pa_end = __a1pa + num_a1pa; + + for (i = 0; i < pages_in_unit; ++i) { + if (unlikely(__a1pa >= __a1pa_end)) { + num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa, + pages_in_unit - i); + + __a1pa = kzalloc(num_a1pa * sizeof__a1pa, GFP_KERNEL); + if (unlikely(!__a1pa)) { + ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n", + num_a1pa); + return -ENOMEM; + } + __a1pa_end = __a1pa + num_a1pa; + /* First *pages is marked for kfree of the buffer */ + sp2d->_1p_stripes[i].alloc = true; + } + + sp2d->_1p_stripes[i].pages = __a1pa->pages; + sp2d->_1p_stripes[i].scribble = __a1pa->scribble ; + sp2d->_1p_stripes[i].page_is_read = __a1pa->page_is_read; + ++__a1pa; + } + + sp2d->parity = parity; + sp2d->data_devs = data_devs; + sp2d->pages_in_unit = pages_in_unit; + return 0; +} + +static void _sp2d_reset(struct __stripe_pages_2d *sp2d, + const struct _ore_r4w_op *r4w, void *priv) +{ + unsigned data_devs = sp2d->data_devs; + unsigned group_width = data_devs + sp2d->parity; + unsigned p; + + if (!sp2d->needed) + return; + + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (_1ps->write_count < group_width) { + unsigned c; + + for (c = 0; c < data_devs; c++) + if (_1ps->page_is_read[c]) { + struct page *page = _1ps->pages[c]; + + r4w->put_page(priv, page); + _1ps->page_is_read[c] = false; + } + } + + memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages)); + _1ps->write_count = 0; + _1ps->tx = NULL; + } + + sp2d->needed = false; +} + +static void _sp2d_free(struct __stripe_pages_2d *sp2d) +{ + unsigned i; + + if (!sp2d) + return; + + for (i = 0; i < sp2d->pages_in_unit; ++i) { + if (sp2d->_1p_stripes[i].alloc) + kfree(sp2d->_1p_stripes[i].pages); + } + + kfree(sp2d); +} + +static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d) +{ + unsigned p; + + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (_1ps->write_count) + return p; + } + + return ~0; +} + +static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d) +{ + unsigned p; + + for (p = sp2d->pages_in_unit - 1; p >= 0; --p) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (_1ps->write_count) + return p; + } + + return ~0; +} + +static void _gen_xor_unit(struct __stripe_pages_2d *sp2d) +{ + unsigned p; + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (!_1ps->write_count) + continue; + + init_async_submit(&_1ps->submit, + ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK, + NULL, + NULL, NULL, + (addr_conv_t *)_1ps->scribble); + + /* TODO: raid6 */ + _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages, + 0, sp2d->data_devs, PAGE_SIZE, + &_1ps->submit); + } + + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + /* NOTE: We wait for HW synchronously (I don't have such HW + * to test with.) Is parallelism needed with today's multi + * cores? + */ + async_tx_issue_pending(_1ps->tx); + } +} + +void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, + struct ore_striping_info *si, struct page *page) +{ + struct __1_page_stripe *_1ps; + + sp2d->needed = true; + + _1ps = &sp2d->_1p_stripes[si->cur_pg]; + _1ps->pages[si->cur_comp] = page; + ++_1ps->write_count; + + si->cur_pg = (si->cur_pg + 1) % sp2d->pages_in_unit; + /* si->cur_comp is advanced outside at main loop */ +} + +void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, + bool not_last) +{ + struct osd_sg_entry *sge; + + ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d " + "offset=0x%llx length=0x%x last_sgs_total=0x%x\n", + per_dev->dev, cur_len, not_last, per_dev->cur_sg, + _LLU(per_dev->offset), per_dev->length, + per_dev->last_sgs_total); + + if (!per_dev->cur_sg) { + sge = per_dev->sglist; + + /* First time we prepare two entries */ + if (per_dev->length) { + ++per_dev->cur_sg; + sge->offset = per_dev->offset; + sge->len = per_dev->length; + } else { + /* Here the parity is the first unit of this object. + * This happens every time we reach a parity device on + * the same stripe as the per_dev->offset. We need to + * just skip this unit. + */ + per_dev->offset += cur_len; + return; + } + } else { + /* finalize the last one */ + sge = &per_dev->sglist[per_dev->cur_sg - 1]; + sge->len = per_dev->length - per_dev->last_sgs_total; + } + + if (not_last) { + /* Partly prepare the next one */ + struct osd_sg_entry *next_sge = sge + 1; + + ++per_dev->cur_sg; + next_sge->offset = sge->offset + sge->len + cur_len; + /* Save cur len so we know how mutch was added next time */ + per_dev->last_sgs_total = per_dev->length; + next_sge->len = 0; + } else if (!sge->len) { + /* Optimize for when the last unit is a parity */ + --per_dev->cur_sg; + } +} + +static int _alloc_read_4_write(struct ore_io_state *ios) +{ + struct ore_layout *layout = ios->layout; + int ret; + /* We want to only read those pages not in cache so worst case + * is a stripe populated with every other page + */ + unsigned sgs_per_dev = ios->sp2d->pages_in_unit + 2; + + ret = _ore_get_io_state(layout, ios->oc, + layout->group_width * layout->mirrors_p1, + sgs_per_dev, 0, &ios->ios_read_4_write); + return ret; +} + +/* @si contains info of the to-be-inserted page. Update of @si should be + * maintained by caller. Specificaly si->dev, si->obj_offset, ... + */ +static int _add_to_read_4_write(struct ore_io_state *ios, + struct ore_striping_info *si, struct page *page) +{ + struct request_queue *q; + struct ore_per_dev_state *per_dev; + struct ore_io_state *read_ios; + unsigned first_dev = si->dev - (si->dev % + (ios->layout->group_width * ios->layout->mirrors_p1)); + unsigned comp = si->dev - first_dev; + unsigned added_len; + + if (!ios->ios_read_4_write) { + int ret = _alloc_read_4_write(ios); + + if (unlikely(ret)) + return ret; + } + + read_ios = ios->ios_read_4_write; + read_ios->numdevs = ios->layout->group_width * ios->layout->mirrors_p1; + + per_dev = &read_ios->per_dev[comp]; + if (!per_dev->length) { + per_dev->bio = bio_kmalloc(GFP_KERNEL, + ios->sp2d->pages_in_unit); + if (unlikely(!per_dev->bio)) { + ORE_DBGMSG("Failed to allocate BIO size=%u\n", + ios->sp2d->pages_in_unit); + return -ENOMEM; + } + per_dev->offset = si->obj_offset; + per_dev->dev = si->dev; + } else if (si->obj_offset != (per_dev->offset + per_dev->length)) { + u64 gap = si->obj_offset - (per_dev->offset + per_dev->length); + + _ore_add_sg_seg(per_dev, gap, true); + } + q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev)); + added_len = bio_add_pc_page(q, per_dev->bio, page, PAGE_SIZE, 0); + if (unlikely(added_len != PAGE_SIZE)) { + ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n", + per_dev->bio->bi_vcnt); + return -ENOMEM; + } + + per_dev->length += PAGE_SIZE; + return 0; +} + +static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) +{ + struct bio_vec *bv; + unsigned i, d; + + /* loop on all devices all pages */ + for (d = 0; d < ios->numdevs; d++) { + struct bio *bio = ios->per_dev[d].bio; + + if (!bio) + continue; + + __bio_for_each_segment(bv, bio, i, 0) { + struct page *page = bv->bv_page; + + SetPageUptodate(page); + if (PageError(page)) + ClearPageError(page); + } + } +} + +/* read_4_write is hacked to read the start of the first stripe and/or + * the end of the last stripe. If needed, with an sg-gap at each device/page. + * It is assumed to be called after the to_be_written pages of the first stripe + * are populating ios->sp2d[][] + * + * NOTE: We call ios->r4w->lock_fn for all pages needed for parity calculations + * These pages are held at sp2d[p].pages[c] but with + * sp2d[p].page_is_read[c] = true. At _sp2d_reset these pages are + * ios->r4w->lock_fn(). The ios->r4w->lock_fn might signal that the page is + * @uptodate=true, so we don't need to read it, only unlock, after IO. + * + * TODO: The read_4_write should calc a need_to_read_pages_count, if bigger then + * to-be-written count, we should consider the xor-in-place mode. + * need_to_read_pages_count is the actual number of pages not present in cache. + * maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough + * approximation? In this mode the read pages are put in the empty places of + * ios->sp2d[p][*], xor is calculated the same way. These pages are + * allocated/freed and don't go through cache + */ +static int _read_4_write(struct ore_io_state *ios) +{ + struct ore_io_state *ios_read; + struct ore_striping_info read_si; + struct __stripe_pages_2d *sp2d = ios->sp2d; + u64 offset = ios->si.first_stripe_start; + u64 last_stripe_end; + unsigned bytes_in_stripe = ios->si.bytes_in_stripe; + unsigned i, c, p, min_p = sp2d->pages_in_unit, max_p = -1; + int ret; + + if (offset == ios->offset) /* Go to start collect $200 */ + goto read_last_stripe; + + min_p = _sp2d_min_pg(sp2d); + max_p = _sp2d_max_pg(sp2d); + + for (c = 0; ; c++) { + ore_calc_stripe_info(ios->layout, offset, 0, &read_si); + read_si.obj_offset += min_p * PAGE_SIZE; + offset += min_p * PAGE_SIZE; + for (p = min_p; p <= max_p; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + struct page **pp = &_1ps->pages[c]; + bool uptodate; + + if (*pp) + /* to-be-written pages start here */ + goto read_last_stripe; + + *pp = ios->r4w->get_page(ios->private, offset, + &uptodate); + if (unlikely(!*pp)) + return -ENOMEM; + + if (!uptodate) + _add_to_read_4_write(ios, &read_si, *pp); + + /* Mark read-pages to be cache_released */ + _1ps->page_is_read[c] = true; + read_si.obj_offset += PAGE_SIZE; + offset += PAGE_SIZE; + } + offset += (sp2d->pages_in_unit - p) * PAGE_SIZE; + } + +read_last_stripe: + offset = ios->offset + (ios->length + PAGE_SIZE - 1) / + PAGE_SIZE * PAGE_SIZE; + last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe) + * bytes_in_stripe; + if (offset == last_stripe_end) /* Optimize for the aligned case */ + goto read_it; + + ore_calc_stripe_info(ios->layout, offset, 0, &read_si); + p = read_si.unit_off / PAGE_SIZE; + c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, + ios->layout->mirrors_p1, read_si.par_dev, read_si.dev); + + BUG_ON(ios->si.first_stripe_start + bytes_in_stripe != last_stripe_end); + /* unaligned IO must be within a single stripe */ + + if (min_p == sp2d->pages_in_unit) { + /* Didn't do it yet */ + min_p = _sp2d_min_pg(sp2d); + max_p = _sp2d_max_pg(sp2d); + } + + while (offset < last_stripe_end) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if ((min_p <= p) && (p <= max_p)) { + struct page *page; + bool uptodate; + + BUG_ON(_1ps->pages[c]); + page = ios->r4w->get_page(ios->private, offset, + &uptodate); + if (unlikely(!page)) + return -ENOMEM; + + _1ps->pages[c] = page; + /* Mark read-pages to be cache_released */ + _1ps->page_is_read[c] = true; + if (!uptodate) + _add_to_read_4_write(ios, &read_si, page); + } + + offset += PAGE_SIZE; + if (p == (sp2d->pages_in_unit - 1)) { + ++c; + p = 0; + ore_calc_stripe_info(ios->layout, offset, 0, &read_si); + } else { + read_si.obj_offset += PAGE_SIZE; + ++p; + } + } + +read_it: + ios_read = ios->ios_read_4_write; + if (!ios_read) + return 0; + + /* FIXME: Ugly to signal _sbi_read_mirror that we have bio(s). Change + * to check for per_dev->bio + */ + ios_read->pages = ios->pages; + + /* Now read these devices */ + for (i = 0; i < ios_read->numdevs; i += ios_read->layout->mirrors_p1) { + ret = _ore_read_mirror(ios_read, i); + if (unlikely(ret)) + return ret; + } + + ret = ore_io_execute(ios_read); /* Synchronus execution */ + if (unlikely(ret)) { + ORE_DBGMSG("!! ore_io_execute => %d\n", ret); + return ret; + } + + _mark_read4write_pages_uptodate(ios_read, ret); + return 0; +} + +/* In writes @cur_len means length left. .i.e cur_len==0 is the last parity U */ +int _ore_add_parity_unit(struct ore_io_state *ios, + struct ore_striping_info *si, + struct ore_per_dev_state *per_dev, + unsigned cur_len) +{ + if (ios->reading) { + BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev); + _ore_add_sg_seg(per_dev, cur_len, true); + } else { + struct __stripe_pages_2d *sp2d = ios->sp2d; + struct page **pages = ios->parity_pages + ios->cur_par_page; + unsigned num_pages; + unsigned array_start = 0; + unsigned i; + int ret; + + si->cur_pg = _sp2d_min_pg(sp2d); + num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg; + + if (!cur_len) /* If last stripe operate on parity comp */ + si->cur_comp = sp2d->data_devs; + + if (!per_dev->length) { + per_dev->offset += si->cur_pg * PAGE_SIZE; + /* If first stripe, Read in all read4write pages + * (if needed) before we calculate the first parity. + */ + _read_4_write(ios); + } + + for (i = 0; i < num_pages; i++) { + pages[i] = _raid_page_alloc(); + if (unlikely(!pages[i])) + return -ENOMEM; + + ++(ios->cur_par_page); + } + + BUG_ON(si->cur_comp != sp2d->data_devs); + BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit); + + ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, + per_dev, num_pages * PAGE_SIZE); + if (unlikely(ret)) + return ret; + + /* TODO: raid6 if (last_parity_dev) */ + _gen_xor_unit(sp2d); + _sp2d_reset(sp2d, ios->r4w, ios->private); + } + return 0; +} + +int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) +{ + struct ore_layout *layout = ios->layout; + + if (ios->parity_pages) { + unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; + unsigned stripe_size = ios->si.bytes_in_stripe; + u64 last_stripe, first_stripe; + + if (_sp2d_alloc(pages_in_unit, layout->group_width, + layout->parity, &ios->sp2d)) { + return -ENOMEM; + } + + BUG_ON(ios->offset % PAGE_SIZE); + + /* Round io down to last full strip */ + first_stripe = div_u64(ios->offset, stripe_size); + last_stripe = div_u64(ios->offset + ios->length, stripe_size); + + /* If an IO spans more then a single stripe it must end at + * a stripe boundary. The reminder at the end is pushed into the + * next IO. + */ + if (last_stripe != first_stripe) { + ios->length = last_stripe * stripe_size - ios->offset; + + BUG_ON(!ios->length); + ios->nr_pages = (ios->length + PAGE_SIZE - 1) / + PAGE_SIZE; + ios->si.length = ios->length; /*make it consistent */ + } + } + return 0; +} + +void _ore_free_raid_stuff(struct ore_io_state *ios) +{ + if (ios->sp2d) { /* writing and raid */ + unsigned i; + + for (i = 0; i < ios->cur_par_page; i++) { + struct page *page = ios->parity_pages[i]; + + if (page) + _raid_page_free(page); + } + if (ios->extra_part_alloc) + kfree(ios->parity_pages); + /* If IO returned an error pages might need unlocking */ + _sp2d_reset(ios->sp2d, ios->r4w, ios->private); + _sp2d_free(ios->sp2d); + } else { + /* Will only be set if raid reading && sglist is big */ + if (ios->extra_part_alloc) + kfree(ios->per_dev[0].sglist); + } + if (ios->ios_read_4_write) + ore_put_io_state(ios->ios_read_4_write); +} diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h new file mode 100644 index 00000000000..2ffd2c3c6e4 --- /dev/null +++ b/fs/exofs/ore_raid.h @@ -0,0 +1,79 @@ +/* + * Copyright (C) from 2011 + * Boaz Harrosh <bharrosh@panasas.com> + * + * This file is part of the objects raid engine (ore). + * + * It is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License + * along with "ore". If not, write to the Free Software Foundation, Inc: + * "Free Software Foundation <info@fsf.org>" + */ + +#include <scsi/osd_ore.h> + +#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a) + +#ifdef CONFIG_EXOFS_DEBUG +#define ORE_DBGMSG(fmt, a...) \ + printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a) +#else +#define ORE_DBGMSG(fmt, a...) \ + do { if (0) printk(fmt, ##a); } while (0) +#endif + +/* u64 has problems with printk this will cast it to unsigned long long */ +#define _LLU(x) (unsigned long long)(x) + +#define ORE_DBGMSG2(M...) do {} while (0) +/* #define ORE_DBGMSG2 ORE_DBGMSG */ + +/* Calculate the component order in a stripe. eg the logical data unit + * address within the stripe of @dev given the @par_dev of this stripe. + */ +static inline unsigned _dev_order(unsigned devs_in_group, unsigned mirrors_p1, + unsigned par_dev, unsigned dev) +{ + unsigned first_dev = dev - dev % devs_in_group; + + dev -= first_dev; + par_dev -= first_dev; + + if (devs_in_group == par_dev) /* The raid 0 case */ + return dev / mirrors_p1; + /* raid4/5/6 case */ + return ((devs_in_group + dev - par_dev - mirrors_p1) % devs_in_group) / + mirrors_p1; +} + +/* ios_raid.c stuff needed by ios.c */ +int _ore_post_alloc_raid_stuff(struct ore_io_state *ios); +void _ore_free_raid_stuff(struct ore_io_state *ios); + +void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, + bool not_last); +int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, + struct ore_per_dev_state *per_dev, unsigned cur_len); +void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, + struct ore_striping_info *si, struct page *page); +static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d, + struct ore_striping_info *si, struct page *page) +{ + if (!sp2d) /* Inline the fast path */ + return; /* Hay no raid stuff */ + _ore_add_stripe_page(sp2d, si, page); +} + +/* ios.c stuff needed by ios_raid.c */ +int _ore_get_io_state(struct ore_layout *layout, + struct ore_components *oc, unsigned numdevs, + unsigned sgs_per_dev, unsigned num_par_pages, + struct ore_io_state **pios); +int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, + unsigned pgbase, struct page **pages, + struct ore_per_dev_state *per_dev, int cur_len); +int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp); +int ore_io_execute(struct ore_io_state *ios); diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 274894053b0..e6085ec192d 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -35,6 +35,7 @@ #include <linux/parser.h> #include <linux/vfs.h> #include <linux/random.h> +#include <linux/module.h> #include <linux/exportfs.h> #include <linux/slab.h> @@ -266,7 +267,7 @@ static int __sbi_read_stats(struct exofs_sb_info *sbi) struct ore_io_state *ios; int ret; - ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); return ret; @@ -321,7 +322,7 @@ int exofs_sbi_write_stats(struct exofs_sb_info *sbi) struct ore_io_state *ios; int ret; - ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); return ret; @@ -355,12 +356,12 @@ static const struct export_operations exofs_export_ops; /* * Write the superblock to the OSD */ -int exofs_sync_fs(struct super_block *sb, int wait) +static int exofs_sync_fs(struct super_block *sb, int wait) { struct exofs_sb_info *sbi; struct exofs_fscb *fscb; struct ore_comp one_comp; - struct ore_components comps; + struct ore_components oc; struct ore_io_state *ios; int ret = -ENOMEM; @@ -378,9 +379,9 @@ int exofs_sync_fs(struct super_block *sb, int wait) * the writeable info is set in exofs_sbi_write_stats() above. */ - exofs_init_comps(&comps, &one_comp, sbi, EXOFS_SUPER_ID); + exofs_init_comps(&oc, &one_comp, sbi, EXOFS_SUPER_ID); - ret = ore_get_io_state(&sbi->layout, &comps, &ios); + ret = ore_get_io_state(&sbi->layout, &oc, &ios); if (unlikely(ret)) goto out; @@ -429,19 +430,20 @@ static void _exofs_print_device(const char *msg, const char *dev_path, msg, dev_path ?: "", odi->osdname, _LLU(pid)); } -void exofs_free_sbi(struct exofs_sb_info *sbi) +static void exofs_free_sbi(struct exofs_sb_info *sbi) { - while (sbi->comps.numdevs) { - int i = --sbi->comps.numdevs; - struct osd_dev *od = sbi->comps.ods[i]; + unsigned numdevs = sbi->oc.numdevs; + + while (numdevs) { + unsigned i = --numdevs; + struct osd_dev *od = ore_comp_dev(&sbi->oc, i); if (od) { - sbi->comps.ods[i] = NULL; + ore_comp_set_dev(&sbi->oc, i, NULL); osduld_put_device(od); } } - if (sbi->comps.ods != sbi->_min_one_dev) - kfree(sbi->comps.ods); + kfree(sbi->oc.ods); kfree(sbi); } @@ -468,7 +470,7 @@ static void exofs_put_super(struct super_block *sb) msecs_to_jiffies(100)); } - _exofs_print_device("Unmounting", NULL, sbi->comps.ods[0], + _exofs_print_device("Unmounting", NULL, ore_comp_dev(&sbi->oc, 0), sbi->one_comp.obj.partition); bdi_destroy(&sbi->bdi); @@ -479,76 +481,20 @@ static void exofs_put_super(struct super_block *sb) static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, struct exofs_device_table *dt) { - u64 stripe_length; + int ret; - sbi->data_map.odm_num_comps = - le32_to_cpu(dt->dt_data_map.cb_num_comps); - sbi->data_map.odm_stripe_unit = + sbi->layout.stripe_unit = le64_to_cpu(dt->dt_data_map.cb_stripe_unit); - sbi->data_map.odm_group_width = + sbi->layout.group_width = le32_to_cpu(dt->dt_data_map.cb_group_width); - sbi->data_map.odm_group_depth = + sbi->layout.group_depth = le32_to_cpu(dt->dt_data_map.cb_group_depth); - sbi->data_map.odm_mirror_cnt = - le32_to_cpu(dt->dt_data_map.cb_mirror_cnt); - sbi->data_map.odm_raid_algorithm = + sbi->layout.mirrors_p1 = + le32_to_cpu(dt->dt_data_map.cb_mirror_cnt) + 1; + sbi->layout.raid_algorithm = le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); -/* FIXME: Only raid0 for now. if not so, do not mount */ - if (sbi->data_map.odm_num_comps != numdevs) { - EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n", - sbi->data_map.odm_num_comps, numdevs); - return -EINVAL; - } - if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) { - EXOFS_ERR("Only RAID_0 for now\n"); - return -EINVAL; - } - if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) { - EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n", - numdevs, sbi->data_map.odm_mirror_cnt); - return -EINVAL; - } - - if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) { - EXOFS_ERR("Stripe Unit(0x%llx)" - " must be Multples of PAGE_SIZE(0x%lx)\n", - _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE); - return -EINVAL; - } - - sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit; - sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1; - - if (sbi->data_map.odm_group_width) { - sbi->layout.group_width = sbi->data_map.odm_group_width; - sbi->layout.group_depth = sbi->data_map.odm_group_depth; - if (!sbi->layout.group_depth) { - EXOFS_ERR("group_depth == 0 && group_width != 0\n"); - return -EINVAL; - } - sbi->layout.group_count = sbi->data_map.odm_num_comps / - sbi->layout.mirrors_p1 / - sbi->data_map.odm_group_width; - } else { - if (sbi->data_map.odm_group_depth) { - printk(KERN_NOTICE "Warning: group_depth ignored " - "group_width == 0 && group_depth == %d\n", - sbi->data_map.odm_group_depth); - sbi->data_map.odm_group_depth = 0; - } - sbi->layout.group_width = sbi->data_map.odm_num_comps / - sbi->layout.mirrors_p1; - sbi->layout.group_depth = -1; - sbi->layout.group_count = 1; - } - - stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit; - if (stripe_length >= (1ULL << 32)) { - EXOFS_ERR("Total Stripe length(0x%llx)" - " >= 32bit is not supported\n", _LLU(stripe_length)); - return -EINVAL; - } + ret = ore_verify_layout(numdevs, &sbi->layout); EXOFS_DBGMSG("exofs: layout: " "num_comps=%u stripe_unit=0x%x group_width=%u " @@ -558,8 +504,8 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, sbi->layout.group_width, _LLU(sbi->layout.group_depth), sbi->layout.mirrors_p1, - sbi->data_map.odm_raid_algorithm); - return 0; + sbi->layout.raid_algorithm); + return ret; } static unsigned __ra_pages(struct ore_layout *layout) @@ -605,12 +551,40 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, return !(odi->systemid_len || odi->osdname_len); } +int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs, + struct exofs_dev **peds) +{ + struct __alloc_ore_devs_and_exofs_devs { + /* Twice bigger table: See exofs_init_comps() and comment at + * exofs_read_lookup_dev_table() + */ + struct ore_dev *oreds[numdevs * 2 - 1]; + struct exofs_dev eds[numdevs]; + } *aoded; + struct exofs_dev *eds; + unsigned i; + + aoded = kzalloc(sizeof(*aoded), GFP_KERNEL); + if (unlikely(!aoded)) { + EXOFS_ERR("ERROR: faild allocating Device array[%d]\n", + numdevs); + return -ENOMEM; + } + + sbi->oc.ods = aoded->oreds; + *peds = eds = aoded->eds; + for (i = 0; i < numdevs; ++i) + aoded->oreds[i] = &eds[i].ored; + return 0; +} + static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, struct osd_dev *fscb_od, unsigned table_count) { struct ore_comp comp; struct exofs_device_table *dt; + struct exofs_dev *eds; unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + sizeof(*dt); unsigned numdevs, i; @@ -623,7 +597,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, return -ENOMEM; } - sbi->comps.numdevs = 0; + sbi->oc.numdevs = 0; comp.obj.partition = sbi->one_comp.obj.partition; comp.obj.id = EXOFS_DEVTABLE_ID; @@ -647,20 +621,16 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, if (unlikely(ret)) goto out; - if (likely(numdevs > 1)) { - unsigned size = numdevs * sizeof(sbi->comps.ods[0]); - - /* Twice bigger table: See exofs_init_comps() and below - * comment - */ - sbi->comps.ods = kzalloc(size + size - 1, GFP_KERNEL); - if (unlikely(!sbi->comps.ods)) { - EXOFS_ERR("ERROR: faild allocating Device array[%d]\n", - numdevs); - ret = -ENOMEM; - goto out; - } - } + ret = __alloc_dev_table(sbi, numdevs, &eds); + if (unlikely(ret)) + goto out; + /* exofs round-robins the device table view according to inode + * number. We hold a: twice bigger table hence inodes can point + * to any device and have a sequential view of the table + * starting at this device. See exofs_init_comps() + */ + memcpy(&sbi->oc.ods[numdevs], &sbi->oc.ods[0], + (numdevs - 1) * sizeof(sbi->oc.ods[0])); for (i = 0; i < numdevs; i++) { struct exofs_fscb fscb; @@ -676,13 +646,16 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n", i, odi.osdname); + /* the exofs id is currently the table index */ + eds[i].did = i; + /* On all devices the device table is identical. The user can * specify any one of the participating devices on the command * line. We always keep them in device-table order. */ if (fscb_od && osduld_device_same(fscb_od, &odi)) { - sbi->comps.ods[i] = fscb_od; - ++sbi->comps.numdevs; + eds[i].ored.od = fscb_od; + ++sbi->oc.numdevs; fscb_od = NULL; continue; } @@ -695,8 +668,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, goto out; } - sbi->comps.ods[i] = od; - ++sbi->comps.numdevs; + eds[i].ored.od = od; + ++sbi->oc.numdevs; /* Read the fscb of the other devices to make sure the FS * partition is there. @@ -718,21 +691,10 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, out: kfree(dt); - if (likely(!ret)) { - unsigned numdevs = sbi->comps.numdevs; - - if (unlikely(fscb_od)) { + if (unlikely(fscb_od && !ret)) { EXOFS_ERR("ERROR: Bad device-table container device not present\n"); osduld_put_device(fscb_od); return -EINVAL; - } - /* exofs round-robins the device table view according to inode - * number. We hold a: twice bigger table hence inodes can point - * to any device and have a sequential view of the table - * starting at this device. See exofs_init_comps() - */ - for (i = 0; i < numdevs - 1; ++i) - sbi->comps.ods[i + numdevs] = sbi->comps.ods[i]; } return ret; } @@ -783,10 +745,9 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) sbi->one_comp.obj.partition = opts->pid; sbi->one_comp.obj.id = 0; exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj); - sbi->comps.numdevs = 1; - sbi->comps.single_comp = EC_SINGLE_COMP; - sbi->comps.comps = &sbi->one_comp; - sbi->comps.ods = sbi->_min_one_dev; + sbi->oc.numdevs = 1; + sbi->oc.single_comp = EC_SINGLE_COMP; + sbi->oc.comps = &sbi->one_comp; /* fill in some other data by hand */ memset(sb->s_id, 0, sizeof(sb->s_id)); @@ -835,7 +796,13 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) if (unlikely(ret)) goto free_sbi; } else { - sbi->comps.ods[0] = od; + struct exofs_dev *eds; + + ret = __alloc_dev_table(sbi, 1, &eds); + if (unlikely(ret)) + goto free_sbi; + + ore_comp_set_dev(&sbi->oc, 0, od); } __sbi_read_stats(sbi); @@ -875,7 +842,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } - _exofs_print_device("Mounting", opts->dev_name, sbi->comps.ods[0], + _exofs_print_device("Mounting", opts->dev_name, + ore_comp_dev(&sbi->oc, 0), sbi->one_comp.obj.partition); return 0; @@ -924,7 +892,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) uint64_t used = ULLONG_MAX; int ret; - ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); if (ret) { EXOFS_DBGMSG("ore_get_io_state failed.\n"); return ret; @@ -981,7 +949,7 @@ static const struct super_operations exofs_sops = { * EXPORT OPERATIONS *****************************************************************************/ -struct dentry *exofs_get_parent(struct dentry *child) +static struct dentry *exofs_get_parent(struct dentry *child) { unsigned long ino = exofs_parent_ino(child); diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index 8f44cef1b3e..a8cbe1bc6ad 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -421,7 +421,7 @@ static inline int rsv_is_empty(struct ext2_reserve_window *rsv) void ext2_init_block_alloc_info(struct inode *inode) { struct ext2_inode_info *ei = EXT2_I(inode); - struct ext2_block_alloc_info *block_i = ei->i_block_alloc_info; + struct ext2_block_alloc_info *block_i; struct super_block *sb = inode->i_sb; block_i = kmalloc(sizeof(*block_i), GFP_NOFS); diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index af9fc89b1b2..9a4e5e206d0 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -135,10 +135,10 @@ extern long ext2_compat_ioctl(struct file *, unsigned int, unsigned long); struct dentry *ext2_get_parent(struct dentry *child); /* super.c */ -extern void ext2_error (struct super_block *, const char *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); -extern void ext2_msg(struct super_block *, const char *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +void ext2_error(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) +void ext2_msg(struct super_block *, const char *, const char *, ...); extern void ext2_update_dynamic_rev (struct super_block *sb); extern void ext2_write_super (struct super_block *); diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index ee9ed31948e..c4e81dfb74b 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -601,7 +601,7 @@ fail_free_drop: fail_drop: dquot_drop(inode); inode->i_flags |= S_NOQUOTA; - inode->i_nlink = 0; + clear_nlink(inode); unlock_new_inode(inode); iput(inode); return ERR_PTR(err); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index a8a58f63f07..91a6945af6d 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1321,7 +1321,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } - inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); inode->i_size = le32_to_cpu(raw_inode->i_size); inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 1dd62ed35b8..bd8ac164a3b 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -327,10 +327,10 @@ static struct inode *ext2_nfs_get_inode(struct super_block *sb, if (ino > le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count)) return ERR_PTR(-ESTALE); - /* iget isn't really right if the inode is currently unallocated!! - * ext2_read_inode currently does appropriate checks, but - * it might be "neater" to call ext2_get_inode first and check - * if the inode is valid..... + /* + * ext2_iget isn't quite right if the inode is currently unallocated! + * However ext2_iget currently does appropriate checks to handle stale + * inodes so everything is OK. */ inode = ext2_iget(sb, ino); if (IS_ERR(inode)) diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index 5d979b4347b..c922adc8ef4 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -46,28 +46,30 @@ ext2_xattr_security_set(struct dentry *dentry, const char *name, value, size, flags); } -int -ext2_init_security(struct inode *inode, struct inode *dir, - const struct qstr *qstr) +int ext2_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { - int err; - size_t len; - void *value; - char *name; + const struct xattr *xattr; + int err = 0; - err = security_inode_init_security(inode, dir, qstr, &name, &value, &len); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, 0); + if (err < 0) + break; } - err = ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY, - name, value, len, 0); - kfree(name); - kfree(value); return err; } +int +ext2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &ext2_initxattrs, NULL); +} + const struct xattr_handler ext2_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = ext2_xattr_security_list, diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index 6386d76f44a..a2038928f9a 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -427,7 +427,7 @@ static inline int rsv_is_empty(struct ext3_reserve_window *rsv) void ext3_init_block_alloc_info(struct inode *inode) { struct ext3_inode_info *ei = EXT3_I(inode); - struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info; + struct ext3_block_alloc_info *block_i; struct super_block *sb = inode->i_sb; block_i = kmalloc(sizeof(*block_i), GFP_NOFS); @@ -1440,14 +1440,14 @@ out: * * Check if filesystem has at least 1 free block available for allocation. */ -static int ext3_has_free_blocks(struct ext3_sb_info *sbi) +static int ext3_has_free_blocks(struct ext3_sb_info *sbi, int use_reservation) { ext3_fsblk_t free_blocks, root_blocks; free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count); if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && - sbi->s_resuid != current_fsuid() && + !use_reservation && sbi->s_resuid != current_fsuid() && (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { return 0; } @@ -1468,7 +1468,7 @@ static int ext3_has_free_blocks(struct ext3_sb_info *sbi) */ int ext3_should_retry_alloc(struct super_block *sb, int *retries) { - if (!ext3_has_free_blocks(EXT3_SB(sb)) || (*retries)++ > 3) + if (!ext3_has_free_blocks(EXT3_SB(sb), 0) || (*retries)++ > 3) return 0; jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); @@ -1546,7 +1546,7 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0)) my_rsv = &block_i->rsv_window_node; - if (!ext3_has_free_blocks(sbi)) { + if (!ext3_has_free_blocks(sbi, IS_NOQUOTA(inode))) { *errp = -ENOSPC; goto out; } @@ -1924,9 +1924,10 @@ unsigned long ext3_bg_num_gdb(struct super_block *sb, int group) * reaches any used block. Then issue a TRIM command on this extent and free * the extent in the block bitmap. This is done until whole group is scanned. */ -ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group, - ext3_grpblk_t start, ext3_grpblk_t max, - ext3_grpblk_t minblocks) +static ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, + unsigned int group, + ext3_grpblk_t start, ext3_grpblk_t max, + ext3_grpblk_t minblocks) { handle_t *handle; ext3_grpblk_t next, free_blocks, bit, freed, count = 0; diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index d494c554c6e..1860ed35632 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c @@ -61,13 +61,6 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret) goto out; - /* - * Taking the mutex here just to keep consistent with how fsync was - * called previously, however it looks like we don't need to take - * i_mutex at all. - */ - mutex_lock(&inode->i_mutex); - J_ASSERT(ext3_journal_current_handle() == NULL); /* @@ -85,7 +78,6 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * safe in-journal, which is all fsync() needs to ensure. */ if (ext3_should_journal_data(inode)) { - mutex_unlock(&inode->i_mutex); ret = ext3_force_commit(inode->i_sb); goto out; } @@ -108,8 +100,6 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ if (needs_barrier) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); - - mutex_unlock(&inode->i_mutex); out: trace_ext3_sync_file_exit(inode, ret); return ret; diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index bf09cbf938c..5c866e06e7a 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -178,42 +178,6 @@ error_return: } /* - * There are two policies for allocating an inode. If the new inode is - * a directory, then a forward search is made for a block group with both - * free space and a low directory-to-inode ratio; if that fails, then of - * the groups with above-average free space, that group with the fewest - * directories already is chosen. - * - * For other inodes, search forward from the parent directory\'s block - * group to find a free inode. - */ -static int find_group_dir(struct super_block *sb, struct inode *parent) -{ - int ngroups = EXT3_SB(sb)->s_groups_count; - unsigned int freei, avefreei; - struct ext3_group_desc *desc, *best_desc = NULL; - int group, best_group = -1; - - freei = percpu_counter_read_positive(&EXT3_SB(sb)->s_freeinodes_counter); - avefreei = freei / ngroups; - - for (group = 0; group < ngroups; group++) { - desc = ext3_get_group_desc (sb, group, NULL); - if (!desc || !desc->bg_free_inodes_count) - continue; - if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) - continue; - if (!best_desc || - (le16_to_cpu(desc->bg_free_blocks_count) > - le16_to_cpu(best_desc->bg_free_blocks_count))) { - best_group = group; - best_desc = desc; - } - } - return best_group; -} - -/* * Orlov's allocator for directories. * * We always try to spread first-level directories. @@ -436,12 +400,9 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, sbi = EXT3_SB(sb); es = sbi->s_es; - if (S_ISDIR(mode)) { - if (test_opt (sb, OLDALLOC)) - group = find_group_dir(sb, dir); - else - group = find_group_orlov(sb, dir); - } else + if (S_ISDIR(mode)) + group = find_group_orlov(sb, dir); + else group = find_group_other(sb, dir); err = -ENOSPC; @@ -621,7 +582,7 @@ fail_free_drop: fail_drop: dquot_drop(inode); inode->i_flags |= S_NOQUOTA; - inode->i_nlink = 0; + clear_nlink(inode); unlock_new_inode(inode); iput(inode); brelse(bitmap_bh); diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 12661e1deed..85fe655fe3e 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -2899,7 +2899,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } - inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); inode->i_size = le32_to_cpu(raw_inode->i_size); inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index c7f43944f16..ba1b54e23ca 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -150,30 +150,6 @@ setversion_out: mnt_drop_write(filp->f_path.mnt); return err; } -#ifdef CONFIG_JBD_DEBUG - case EXT3_IOC_WAIT_FOR_READONLY: - /* - * This is racy - by the time we're woken up and running, - * the superblock could be released. And the module could - * have been unloaded. So sue me. - * - * Returns 1 if it slept, else zero. - */ - { - struct super_block *sb = inode->i_sb; - DECLARE_WAITQUEUE(wait, current); - int ret = 0; - - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait); - if (timer_pending(&EXT3_SB(sb)->turn_ro_timer)) { - schedule(); - ret = 1; - } - remove_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait); - return ret; - } -#endif case EXT3_IOC_GETRSVSZ: if (test_opt(inode->i_sb, RESERVATION) && S_ISREG(inode->i_mode) diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 0629e09f651..642dc6d66df 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1821,7 +1821,7 @@ retry: de->name_len = 2; strcpy (de->name, ".."); ext3_set_de_type(dir->i_sb, de, S_IFDIR); - inode->i_nlink = 2; + set_nlink(inode, 2); BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); err = ext3_journal_dirty_metadata(handle, dir_block); if (err) @@ -1833,7 +1833,7 @@ retry: if (err) { out_clear_inode: - inode->i_nlink = 0; + clear_nlink(inode); unlock_new_inode(inode); ext3_mark_inode_dirty(handle, inode); iput (inode); @@ -2170,7 +2170,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) ext3_warning (inode->i_sb, "ext3_unlink", "Deleting nonexistent file (%lu), %d", inode->i_ino, inode->i_nlink); - inode->i_nlink = 1; + set_nlink(inode, 1); } retval = ext3_delete_entry(handle, dir, de, bh); if (retval) diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 7beb69ae001..922d289aeeb 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -652,8 +652,6 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",nouid32"); if (test_opt(sb, DEBUG)) seq_puts(seq, ",debug"); - if (test_opt(sb, OLDALLOC)) - seq_puts(seq, ",oldalloc"); #ifdef CONFIG_EXT3_FS_XATTR if (test_opt(sb, XATTR_USER)) seq_puts(seq, ",user_xattr"); @@ -1049,10 +1047,12 @@ static int parse_options (char *options, struct super_block *sb, set_opt (sbi->s_mount_opt, DEBUG); break; case Opt_oldalloc: - set_opt (sbi->s_mount_opt, OLDALLOC); + ext3_msg(sb, KERN_WARNING, + "Ignoring deprecated oldalloc option"); break; case Opt_orlov: - clear_opt (sbi->s_mount_opt, OLDALLOC); + ext3_msg(sb, KERN_WARNING, + "Ignoring deprecated orlov option"); break; #ifdef CONFIG_EXT3_FS_XATTR case Opt_user_xattr: @@ -2669,13 +2669,13 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) /* * If we have an unprocessed orphan list hanging * around from a previously readonly bdev mount, - * require a full umount/remount for now. + * require a full umount & mount for now. */ if (es->s_last_orphan) { ext3_msg(sb, KERN_WARNING, "warning: couldn't " "remount RDWR because of unprocessed " "orphan inode list. Please " - "umount/remount instead."); + "umount & mount instead."); err = -EINVAL; goto restore_opts; } diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c index b8d9f83aa5c..3c218b8a51d 100644 --- a/fs/ext3/xattr_security.c +++ b/fs/ext3/xattr_security.c @@ -48,28 +48,32 @@ ext3_xattr_security_set(struct dentry *dentry, const char *name, name, value, size, flags); } -int -ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir, - const struct qstr *qstr) +int ext3_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { - int err; - size_t len; - void *value; - char *name; + const struct xattr *xattr; + handle_t *handle = fs_info; + int err = 0; - err = security_inode_init_security(inode, dir, qstr, &name, &value, &len); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = ext3_xattr_set_handle(handle, inode, + EXT3_XATTR_INDEX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, 0); + if (err < 0) + break; } - err = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_SECURITY, - name, value, len, 0); - kfree(name); - kfree(value); return err; } +int +ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &ext3_initxattrs, handle); +} + const struct xattr_handler ext3_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = ext3_xattr_security_list, diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index f8224adf496..12ccacda44e 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -28,7 +28,8 @@ */ /* - * Calculate the block group number and offset, given a block number + * Calculate the block group number and offset into the block/cluster + * allocation bitmap, given a block number */ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp) @@ -37,7 +38,8 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, ext4_grpblk_t offset; blocknr = blocknr - le32_to_cpu(es->s_first_data_block); - offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)); + offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >> + EXT4_SB(sb)->s_cluster_bits; if (offsetp) *offsetp = offset; if (blockgrpp) @@ -55,130 +57,169 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, return 0; } -static int ext4_group_used_meta_blocks(struct super_block *sb, - ext4_group_t block_group, - struct ext4_group_desc *gdp) +/* Return the number of clusters used for file system metadata; this + * represents the overhead needed by the file system. + */ +unsigned ext4_num_overhead_clusters(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp) { - ext4_fsblk_t tmp; + unsigned num_clusters; + int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c; + ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group); + ext4_fsblk_t itbl_blk; struct ext4_sb_info *sbi = EXT4_SB(sb); - /* block bitmap, inode bitmap, and inode table blocks */ - int used_blocks = sbi->s_itb_per_group + 2; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { - if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), - block_group)) - used_blocks--; - - if (!ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), - block_group)) - used_blocks--; - - tmp = ext4_inode_table(sb, gdp); - for (; tmp < ext4_inode_table(sb, gdp) + - sbi->s_itb_per_group; tmp++) { - if (!ext4_block_in_group(sb, tmp, block_group)) - used_blocks -= 1; + /* This is the number of clusters used by the superblock, + * block group descriptors, and reserved block group + * descriptor blocks */ + num_clusters = ext4_num_base_meta_clusters(sb, block_group); + + /* + * For the allocation bitmaps and inode table, we first need + * to check to see if the block is in the block group. If it + * is, then check to see if the cluster is already accounted + * for in the clusters used for the base metadata cluster, or + * if we can increment the base metadata cluster to include + * that block. Otherwise, we will have to track the cluster + * used for the allocation bitmap or inode table explicitly. + * Normally all of these blocks are contiguous, so the special + * case handling shouldn't be necessary except for *very* + * unusual file system layouts. + */ + if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) { + block_cluster = EXT4_B2C(sbi, (start - + ext4_block_bitmap(sb, gdp))); + if (block_cluster < num_clusters) + block_cluster = -1; + else if (block_cluster == num_clusters) { + num_clusters++; + block_cluster = -1; } } - return used_blocks; -} -/* Initializes an uninitialized block bitmap if given, and returns the - * number of blocks free in the group. */ -unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, - ext4_group_t block_group, struct ext4_group_desc *gdp) -{ - int bit, bit_max; - ext4_group_t ngroups = ext4_get_groups_count(sb); - unsigned free_blocks, group_blocks; - struct ext4_sb_info *sbi = EXT4_SB(sb); - - if (bh) { - J_ASSERT_BH(bh, buffer_locked(bh)); - - /* If checksum is bad mark all blocks used to prevent allocation - * essentially implementing a per-group read-only flag. */ - if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { - ext4_error(sb, "Checksum bad for group %u", - block_group); - ext4_free_blks_set(sb, gdp, 0); - ext4_free_inodes_set(sb, gdp, 0); - ext4_itable_unused_set(sb, gdp, 0); - memset(bh->b_data, 0xff, sb->s_blocksize); - return 0; + if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) { + inode_cluster = EXT4_B2C(sbi, + start - ext4_inode_bitmap(sb, gdp)); + if (inode_cluster < num_clusters) + inode_cluster = -1; + else if (inode_cluster == num_clusters) { + num_clusters++; + inode_cluster = -1; } - memset(bh->b_data, 0, sb->s_blocksize); } - /* Check for superblock and gdt backups in this group */ - bit_max = ext4_bg_has_super(sb, block_group); - - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) || - block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) * - sbi->s_desc_per_block) { - if (bit_max) { - bit_max += ext4_bg_num_gdb(sb, block_group); - bit_max += - le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); + itbl_blk = ext4_inode_table(sb, gdp); + for (i = 0; i < sbi->s_itb_per_group; i++) { + if (ext4_block_in_group(sb, itbl_blk + i, block_group)) { + c = EXT4_B2C(sbi, start - itbl_blk + i); + if ((c < num_clusters) || (c == inode_cluster) || + (c == block_cluster) || (c == itbl_cluster)) + continue; + if (c == num_clusters) { + num_clusters++; + continue; + } + num_clusters++; + itbl_cluster = c; } - } else { /* For META_BG_BLOCK_GROUPS */ - bit_max += ext4_bg_num_gdb(sb, block_group); } - if (block_group == ngroups - 1) { + if (block_cluster != -1) + num_clusters++; + if (inode_cluster != -1) + num_clusters++; + + return num_clusters; +} + +static unsigned int num_clusters_in_group(struct super_block *sb, + ext4_group_t block_group) +{ + unsigned int blocks; + + if (block_group == ext4_get_groups_count(sb) - 1) { /* - * Even though mke2fs always initialize first and last group - * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need - * to make sure we calculate the right free blocks + * Even though mke2fs always initializes the first and + * last group, just in case some other tool was used, + * we need to make sure we calculate the right free + * blocks. */ - group_blocks = ext4_blocks_count(sbi->s_es) - - ext4_group_first_block_no(sb, ngroups - 1); - } else { - group_blocks = EXT4_BLOCKS_PER_GROUP(sb); - } + blocks = ext4_blocks_count(EXT4_SB(sb)->s_es) - + ext4_group_first_block_no(sb, block_group); + } else + blocks = EXT4_BLOCKS_PER_GROUP(sb); + return EXT4_NUM_B2C(EXT4_SB(sb), blocks); +} - free_blocks = group_blocks - bit_max; +/* Initializes an uninitialized block bitmap */ +void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, + ext4_group_t block_group, + struct ext4_group_desc *gdp) +{ + unsigned int bit, bit_max; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t start, tmp; + int flex_bg = 0; + + J_ASSERT_BH( |