From a0dff78dab0ff8d78bd5c9e33c105cf1292f2282 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 22 Jul 2010 13:47:21 -0700 Subject: ceph: avoid dcache readdir for snapdir We should always go to the MDS for readdir on the hidden snapdir. The set of snapshots can change at any time; the client can't trust its cache for that. Signed-off-by: Sage Weil --- fs/ceph/dir.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index f85719310db2..ea36ba9960d3 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -266,6 +266,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) spin_lock(&inode->i_lock); if ((filp->f_pos == 2 || fi->dentry) && !ceph_test_opt(client, NOASYNCREADDIR) && + ceph_snap(inode) != CEPH_SNAPDIR && (ci->i_ceph_flags & CEPH_I_COMPLETE) && __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { err = __dcache_readdir(filp, dirent, filldir); -- cgit v1.2.3 From 252af5214682191e34e57204e1a31924fb82c207 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 22 Jul 2010 13:49:08 -0700 Subject: ceph: fix d_release dop for snapdir, snapped dentries We need to set the d_release dop for snapdir and snapped dentries so that the ceph_dentry_info struct gets released. We also use the dcache to cache readdir results when possible, which only works if we know when dentries are dropped from the cache. Since we don't use the dcache for readdir in the hidden snapdir, avoid that case in ceph_dentry_release. Signed-off-by: Sage Weil --- fs/ceph/dir.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index ea36ba9960d3..f94ed3c7f6a5 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1014,18 +1014,22 @@ out_touch: /* * When a dentry is released, clear the dir I_COMPLETE if it was part - * of the current dir gen. + * of the current dir gen or if this is in the snapshot namespace. */ static void ceph_dentry_release(struct dentry *dentry) { struct ceph_dentry_info *di = ceph_dentry(dentry); struct inode *parent_inode = dentry->d_parent->d_inode; + u64 snapid = ceph_snap(parent_inode); - if (parent_inode) { + dout("dentry_release %p parent %p\n", dentry, parent_inode); + + if (parent_inode && snapid != CEPH_SNAPDIR) { struct ceph_inode_info *ci = ceph_inode(parent_inode); spin_lock(&parent_inode->i_lock); - if (ci->i_shared_gen == di->lease_shared_gen) { + if (ci->i_shared_gen == di->lease_shared_gen || + snapid <= CEPH_MAXSNAP) { dout(" clearing %p complete (d_release)\n", parent_inode); ci->i_ceph_flags &= ~CEPH_I_COMPLETE; @@ -1242,7 +1246,9 @@ struct dentry_operations ceph_dentry_ops = { struct dentry_operations ceph_snapdir_dentry_ops = { .d_revalidate = ceph_snapdir_d_revalidate, + .d_release = ceph_dentry_release, }; struct dentry_operations ceph_snap_dentry_ops = { + .d_release = ceph_dentry_release, }; -- cgit v1.2.3 From bc4fdca85734d12cd2c7a25c52323ef6e6e5adef Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 20 Jul 2010 16:19:56 -0700 Subject: ceph: fix pg_mapping leak on pg_temp updates Free the ceph_pg_mapping structs when they are removed from the pg_temp rbtree. Also fix a leak in the __insert_pg_mapping() error path. Signed-off-by: Sage Weil --- fs/ceph/osdmap.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index 277f8b339577..416d46adbf87 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -831,12 +831,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, /* remove any? */ while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping, node)->pgid, pgid) <= 0) { - struct rb_node *cur = rbp; + struct ceph_pg_mapping *cur = + rb_entry(rbp, struct ceph_pg_mapping, node); + rbp = rb_next(rbp); - dout(" removed pg_temp %llx\n", - *(u64 *)&rb_entry(cur, struct ceph_pg_mapping, - node)->pgid); - rb_erase(cur, &map->pg_temp); + dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid); + rb_erase(&cur->node, &map->pg_temp); + kfree(cur); } if (pglen) { @@ -852,19 +853,22 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, for (j = 0; j < pglen; j++) pg->osds[j] = ceph_decode_32(p); err = __insert_pg_mapping(pg, &map->pg_temp); - if (err) + if (err) { + kfree(pg); goto bad; + } dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, pglen); } } while (rbp) { - struct rb_node *cur = rbp; + struct ceph_pg_mapping *cur = + rb_entry(rbp, struct ceph_pg_mapping, node); + rbp = rb_next(rbp); - dout(" removed pg_temp %llx\n", - *(u64 *)&rb_entry(cur, struct ceph_pg_mapping, - node)->pgid); - rb_erase(cur, &map->pg_temp); + dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid); + rb_erase(&cur->node, &map->pg_temp); + kfree(cur); } /* ignore the rest */ -- cgit v1.2.3 From 8c696737aa61316a252c4514d09dd163f1464d33 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 22 Jul 2010 14:11:56 -0700 Subject: ceph: fix leak of dentry in ceph_init_dentry() error path If we fail to allocate a ceph_dentry_info, don't leak the dn reference. Signed-off-by: Sage Weil --- fs/ceph/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 8f9b9fe8ef9f..3582e79f46e0 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1199,8 +1199,10 @@ retry_lookup: goto out; } err = ceph_init_dentry(dn); - if (err < 0) + if (err < 0) { + dput(dn); goto out; + } } else if (dn->d_inode && (ceph_ino(dn->d_inode) != vino.ino || ceph_snap(dn->d_inode) != vino.snap)) { -- cgit v1.2.3 From 1dadcce358a4c4078e1ea0bc4365c3f67b8e373e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 23 Jul 2010 13:54:21 -0700 Subject: ceph: fix dentry lease release When we embed a dentry lease release notification in a request, invalidate our lease so we don't think we still have it. Otherwise we can get all sorts of incorrect client behavior when multiple clients are interacting with the same part of the namespace. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 74144d6389f0..6afc1affb50a 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2984,6 +2984,7 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, memcpy(*p, dentry->d_name.name, dentry->d_name.len); *p += dentry->d_name.len; rel->dname_seq = cpu_to_le32(di->lease_seq); + __ceph_mdsc_drop_dentry_lease(dentry); } spin_unlock(&dentry->d_lock); return ret; -- cgit v1.2.3 From 25848b3ec681c7018e3746dd850c1e8ed0a3dd6b Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Sat, 24 Jul 2010 06:41:18 -0400 Subject: ceph: Correct obvious typo of Kconfig variable "CRYPTO_AES" Signed-off-by: Robert P. J. Day Signed-off-by: Sage Weil --- fs/ceph/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index 04b8280582a9..bc87b9c1d27e 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -2,7 +2,7 @@ config CEPH_FS tristate "Ceph distributed file system (EXPERIMENTAL)" depends on INET && EXPERIMENTAL select LIBCRC32C - select CONFIG_CRYPTO_AES + select CRYPTO_AES help Choose Y or M here to include support for mounting the experimental Ceph distributed file system. Ceph is an extremely -- cgit v1.2.3 From 96d6523adffbab64f099561a021892125e0c672c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 8 Jul 2010 09:31:24 -0700 Subject: sysfs: Don't allow the creation of symlinks we can't remove Recently my tagged sysfs support revealed a flaw in the device core that a few rare drivers are running into such that we don't always put network devices in a class subdirectory named net/. Since we are not creating the class directory the network devices wind up in a non-tagged directory, but the symlinks to the network devices from /sys/class/net are in a tagged directory. All of which works until we go to remove or rename the symlink. When we remove or rename a symlink we look in the namespace of the target of the symlink. Since the target of the symlink is in a non-tagged sysfs directory we don't have a namespace to look in, and we fail to remove the symlink. Detect this problem up front and simply don't create symlinks we won't be able to remove later. This prevents symlink leakage and fails in a much clearer and more understandable way. Signed-off-by: Eric W. Biederman Cc: Andrew Morton Cc: Rafael J. Wysocki Cc: Maciej W. Rozycki Cc: Kay Sievers Cc: Johannes Berg Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/symlink.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index f71246bebfe4..44bca5f49cd5 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -28,6 +28,7 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, struct sysfs_dirent *target_sd = NULL; struct sysfs_dirent *sd = NULL; struct sysfs_addrm_cxt acxt; + enum kobj_ns_type ns_type; int error; BUG_ON(!name); @@ -58,16 +59,28 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, if (!sd) goto out_put; - if (sysfs_ns_type(parent_sd)) + ns_type = sysfs_ns_type(parent_sd); + if (ns_type) sd->s_ns = target->ktype->namespace(target); sd->s_symlink.target_sd = target_sd; target_sd = NULL; /* reference is now owned by the symlink */ sysfs_addrm_start(&acxt, parent_sd); - if (warn) - error = sysfs_add_one(&acxt, sd); - else - error = __sysfs_add_one(&acxt, sd); + /* Symlinks must be between directories with the same ns_type */ + if (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent)) { + if (warn) + error = sysfs_add_one(&acxt, sd); + else + error = __sysfs_add_one(&acxt, sd); + } else { + error = -EINVAL; + WARN(1, KERN_WARNING + "sysfs: symlink across ns_types %s/%s -> %s/%s\n", + parent_sd->s_name, + sd->s_name, + sd->s_symlink.target_sd->s_parent->s_name, + sd->s_symlink.target_sd->s_name); + } sysfs_addrm_finish(&acxt); if (error) -- cgit v1.2.3 From 521d0453547d6195d200176328aaec6c98a7a290 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 20 Jul 2010 22:10:58 -0700 Subject: sysfs: sysfs_delete_link handle symlinks from untagged to tagged directories. This happens for network devices when SYSFS_DEPRECATED is enabled. Signed-off-by: Eric W. Biederman Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/symlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 44bca5f49cd5..660383321347 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -135,7 +135,7 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ, { const void *ns = NULL; spin_lock(&sysfs_assoc_lock); - if (targ->sd) + if (targ->sd && sysfs_ns_type(kobj->sd)) ns = targ->sd->s_ns; spin_unlock(&sysfs_assoc_lock); sysfs_hash_and_remove(kobj->sd, ns, name); -- cgit v1.2.3 From d33002129eee4717a92e320b0b764a784bbcad3a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 20 Jul 2010 22:12:01 -0700 Subject: sysfs: allow creating symlinks from untagged to tagged directories Supporting symlinks from untagged to tagged directories is reasonable, and needed to support CONFIG_SYSFS_DEPRECATED. So don't fail a prior allowing that case to work. Signed-off-by: Eric W. Biederman Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/symlink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 660383321347..a7ac78f8e67a 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -67,7 +67,8 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, sysfs_addrm_start(&acxt, parent_sd); /* Symlinks must be between directories with the same ns_type */ - if (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent)) { + if (!ns_type || + (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent))) { if (warn) error = sysfs_add_one(&acxt, sd); else -- cgit v1.2.3 From da7ddd3296505b4cb46685e1bbf7d0075b3cd4f1 Mon Sep 17 00:00:00 2001 From: Latchesar Ionkov Date: Mon, 19 Jul 2010 15:40:03 -0500 Subject: 9p: Pass the correct end of buffer to p9stat_read Pass the correct end of the buffer to p9stat_read. Signed-off-by: Latchesar Ionkov Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index d61e3b28ce37..36d961f342af 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -146,7 +146,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) while (rdir->head < rdir->tail) { p9stat_init(&st); err = p9stat_read(rdir->buf + rdir->head, - buflen - rdir->head, &st, + rdir->tail - rdir->head, &st, fid->clnt->proto_version); if (err) { P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); -- cgit v1.2.3 From 03066f23452ff088ad8e2c8acdf4443043f35b51 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Tue, 27 Jul 2010 13:11:08 -0700 Subject: ceph: use complete_all and wake_up_all This fixes an issue triggered by running concurrent syncs. One of the syncs would go through while the other would just hang indefinitely. In any case, we never actually want to wake a single waiter, so the *_all functions should be used. Signed-off-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/caps.c | 14 +++++++------- fs/ceph/file.c | 2 +- fs/ceph/inode.c | 2 +- fs/ceph/mds_client.c | 10 +++++----- fs/ceph/mon_client.c | 6 +++--- fs/ceph/osd_client.c | 6 +++--- 6 files changed, 20 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 6afc1affb50a..b81be9a56487 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -627,7 +627,7 @@ retry: if (fmode >= 0) __ceph_get_fmode(ci, fmode); spin_unlock(&inode->i_lock); - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); return 0; } @@ -1181,7 +1181,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, } if (wake) - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); return delayed; } @@ -2153,7 +2153,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) else if (flushsnaps) ceph_flush_snaps(ci); if (wake) - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); if (put) iput(inode); } @@ -2229,7 +2229,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, iput(inode); } else if (complete_capsnap) { ceph_flush_snaps(ci); - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); } if (drop_capsnap) iput(inode); @@ -2405,7 +2405,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, if (queue_invalidate) ceph_queue_invalidate(inode); if (wake) - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); if (check_caps == 1) ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY, @@ -2460,7 +2460,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, struct ceph_inode_info, i_flushing_item)->vfs_inode); mdsc->num_cap_flushing--; - wake_up(&mdsc->cap_flushing_wq); + wake_up_all(&mdsc->cap_flushing_wq); dout(" inode %p now !flushing\n", inode); if (ci->i_dirty_caps == 0) { @@ -2472,7 +2472,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, } } spin_unlock(&mdsc->cap_dirty_lock); - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); out: spin_unlock(&inode->i_lock); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 6251a1574b94..7c08698fad3e 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -265,7 +265,7 @@ int ceph_release(struct inode *inode, struct file *file) kmem_cache_free(ceph_file_cachep, cf); /* wake up anyone waiting for caps on this inode */ - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); return 0; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 3582e79f46e0..389f9dbd9949 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1501,7 +1501,7 @@ retry: if (wrbuffer_refs == 0) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); if (wake) - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 416c08d315db..dd440bd438a9 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -868,7 +868,7 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, { struct ceph_inode_info *ci = ceph_inode(inode); - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); if (arg) { spin_lock(&inode->i_lock); ci->i_wanted_max_size = 0; @@ -1564,7 +1564,7 @@ static void complete_request(struct ceph_mds_client *mdsc, if (req->r_callback) req->r_callback(mdsc, req); else - complete(&req->r_completion); + complete_all(&req->r_completion); } /* @@ -1932,7 +1932,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) if (head->safe) { req->r_got_safe = true; __unregister_request(mdsc, req); - complete(&req->r_safe_completion); + complete_all(&req->r_safe_completion); if (req->r_got_unsafe) { /* @@ -1947,7 +1947,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) /* last unsafe request during umount? */ if (mdsc->stopping && !__get_oldest_req(mdsc)) - complete(&mdsc->safe_umount_waiters); + complete_all(&mdsc->safe_umount_waiters); mutex_unlock(&mdsc->mutex); goto out; } @@ -2126,7 +2126,7 @@ static void handle_session(struct ceph_mds_session *session, pr_info("mds%d reconnect denied\n", session->s_mds); remove_session_caps(session); wake = 1; /* for good measure */ - complete(&mdsc->session_close_waiters); + complete_all(&mdsc->session_close_waiters); kick_requests(mdsc, mds); break; diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c index cc115eafae11..54fe01c50706 100644 --- a/fs/ceph/mon_client.c +++ b/fs/ceph/mon_client.c @@ -345,7 +345,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, out: mutex_unlock(&monc->mutex); - wake_up(&client->auth_wq); + wake_up_all(&client->auth_wq); } /* @@ -462,7 +462,7 @@ static void handle_statfs_reply(struct ceph_mon_client *monc, } mutex_unlock(&monc->mutex); if (req) { - complete(&req->completion); + complete_all(&req->completion); put_generic_request(req); } return; @@ -718,7 +718,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc, monc->m_auth->front_max); if (ret < 0) { monc->client->auth_err = ret; - wake_up(&monc->client->auth_wq); + wake_up_all(&monc->client->auth_wq); } else if (ret > 0) { __send_prepared_auth_request(monc, ret); } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) { diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index 92b7251a53f1..e38522347898 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -862,12 +862,12 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, if (req->r_callback) req->r_callback(req, msg); else - complete(&req->r_completion); + complete_all(&req->r_completion); if (flags & CEPH_OSD_FLAG_ONDISK) { if (req->r_safe_callback) req->r_safe_callback(req, msg); - complete(&req->r_safe_completion); /* fsync waiter */ + complete_all(&req->r_safe_completion); /* fsync waiter */ } done: @@ -1083,7 +1083,7 @@ done: if (newmap) kick_requests(osdc, NULL); up_read(&osdc->map_sem); - wake_up(&osdc->client->auth_wq); + wake_up_all(&osdc->client->auth_wq); return; bad: -- cgit v1.2.3 From d2a97a4e99ff0ffdccd1fc46f22fb34270ef1e56 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 28 Jul 2010 17:56:23 +0100 Subject: GFS2: Use kmalloc when possible for ->readdir() If we don't need a huge amount of memory in ->readdir() then we can use kmalloc rather than vmalloc to allocate it. This should cut down on the greater overheads associated with vmalloc for smaller directories. We may be able to eliminate vmalloc entirely at some stage, but this is easy to do right away. Also using GFP_NOFS to avoid any issues wrt to deleting inodes while under a glock, and suggestion from Linus to factor out the alloc/dealloc. I've given this a test with a variety of different sized directories and it seems to work ok. Cc: Andrew Morton Cc: Nick Piggin Cc: Prarit Bhargava Signed-off-by: Steven Whitehouse Signed-off-by: Linus Torvalds --- fs/gfs2/dir.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 26ca3361a8bc..6b48d7c268b2 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1231,6 +1231,25 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, return 0; } +static void *gfs2_alloc_sort_buffer(unsigned size) +{ + void *ptr = NULL; + + if (size < KMALLOC_MAX_SIZE) + ptr = kmalloc(size, GFP_NOFS | __GFP_NOWARN); + if (!ptr) + ptr = __vmalloc(size, GFP_NOFS, PAGE_KERNEL); + return ptr; +} + +static void gfs2_free_sort_buffer(void *ptr) +{ + if (is_vmalloc_addr(ptr)) + vfree(ptr); + else + kfree(ptr); +} + static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, filldir_t filldir, int *copied, unsigned *depth, u64 leaf_no) @@ -1271,7 +1290,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, * 99 is the maximum number of entries that can fit in a single * leaf block. */ - larr = vmalloc((leaves + entries + 99) * sizeof(void *)); + larr = gfs2_alloc_sort_buffer((leaves + entries + 99) * sizeof(void *)); if (!larr) goto out; darr = (const struct gfs2_dirent **)(larr + leaves); @@ -1282,7 +1301,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, do { error = get_leaf(ip, lfn, &bh); if (error) - goto out_kfree; + goto out_free; lf = (struct gfs2_leaf *)bh->b_data; lfn = be64_to_cpu(lf->lf_next); if (lf->lf_entries) { @@ -1291,7 +1310,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, gfs2_dirent_gather, NULL, &g); error = PTR_ERR(dent); if (IS_ERR(dent)) - goto out_kfree; + goto out_free; if (entries2 != g.offset) { fs_warn(sdp, "Number of entries corrupt in dir " "leaf %llu, entries2 (%u) != " @@ -1300,7 +1319,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, entries2, g.offset); error = -EIO; - goto out_kfree; + goto out_free; } error = 0; larr[leaf++] = bh; @@ -1312,10 +1331,10 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, BUG_ON(entries2 != entries); error = do_filldir_main(ip, offset, opaque, filldir, darr, entries, copied); -out_kfree: +out_free: for(i = 0; i < leaf; i++) brelse(larr[i]); - vfree(larr); + gfs2_free_sort_buffer(larr); out: return error; } -- cgit v1.2.3 From a6f80fb7b5986fda663d94079d3bba0937a6b6ff Mon Sep 17 00:00:00 2001 From: Andre Osterhues Date: Tue, 13 Jul 2010 15:59:17 -0500 Subject: ecryptfs: Bugfix for error related to ecryptfs_hash_buckets The function ecryptfs_uid_hash wrongly assumes that the second parameter to hash_long() is the number of hash buckets instead of the number of hash bits. This patch fixes that and renames the variable ecryptfs_hash_buckets to ecryptfs_hash_bits to make it clearer. Fixes: CVE-2010-2492 Signed-off-by: Andre Osterhues Signed-off-by: Tyler Hicks Signed-off-by: Linus Torvalds --- fs/ecryptfs/messaging.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 2d8dbce9d485..46c4dd8dfcc3 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -31,9 +31,9 @@ static struct mutex ecryptfs_msg_ctx_lists_mux; static struct hlist_head *ecryptfs_daemon_hash; struct mutex ecryptfs_daemon_hash_mux; -static int ecryptfs_hash_buckets; +static int ecryptfs_hash_bits; #define ecryptfs_uid_hash(uid) \ - hash_long((unsigned long)uid, ecryptfs_hash_buckets) + hash_long((unsigned long)uid, ecryptfs_hash_bits) static u32 ecryptfs_msg_counter; static struct ecryptfs_msg_ctx *ecryptfs_msg_ctx_arr; @@ -486,18 +486,19 @@ int ecryptfs_init_messaging(void) } mutex_init(&ecryptfs_daemon_hash_mux); mutex_lock(&ecryptfs_daemon_hash_mux); - ecryptfs_hash_buckets = 1; - while (ecryptfs_number_of_users >> ecryptfs_hash_buckets) - ecryptfs_hash_buckets++; + ecryptfs_hash_bits = 1; + while (ecryptfs_number_of_users >> ecryptfs_hash_bits) + ecryptfs_hash_bits++; ecryptfs_daemon_hash = kmalloc((sizeof(struct hlist_head) - * ecryptfs_hash_buckets), GFP_KERNEL); + * (1 << ecryptfs_hash_bits)), + GFP_KERNEL); if (!ecryptfs_daemon_hash) { rc = -ENOMEM; printk(KERN_ERR "%s: Failed to allocate memory\n", __func__); mutex_unlock(&ecryptfs_daemon_hash_mux); goto out; } - for (i = 0; i < ecryptfs_hash_buckets; i++) + for (i = 0; i < (1 << ecryptfs_hash_bits); i++) INIT_HLIST_HEAD(&ecryptfs_daemon_hash[i]); mutex_unlock(&ecryptfs_daemon_hash_mux); ecryptfs_msg_ctx_arr = kmalloc((sizeof(struct ecryptfs_msg_ctx) @@ -554,7 +555,7 @@ void ecryptfs_release_messaging(void) int i; mutex_lock(&ecryptfs_daemon_hash_mux); - for (i = 0; i < ecryptfs_hash_buckets; i++) { + for (i = 0; i < (1 << ecryptfs_hash_bits); i++) { int rc; hlist_for_each_entry(daemon, elem, -- cgit v1.2.3 From de09a9771a5346029f4d11e4ac886be7f9bfdd75 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 29 Jul 2010 12:45:49 +0100 Subject: CRED: Fix get_task_cred() and task_state() to not resurrect dead credentials It's possible for get_task_cred() as it currently stands to 'corrupt' a set of credentials by incrementing their usage count after their replacement by the task being accessed. What happens is that get_task_cred() can race with commit_creds(): TASK_1 TASK_2 RCU_CLEANER -->get_task_cred(TASK_2) rcu_read_lock() __cred = __task_cred(TASK_2) -->commit_creds() old_cred = TASK_2->real_cred TASK_2->real_cred = ... put_cred(old_cred) call_rcu(old_cred) [__cred->usage == 0] get_cred(__cred) [__cred->usage == 1] rcu_read_unlock() -->put_cred_rcu() [__cred->usage == 1] panic() However, since a tasks credentials are generally not changed very often, we can reasonably make use of a loop involving reading the creds pointer and using atomic_inc_not_zero() to attempt to increment it if it hasn't already hit zero. If successful, we can safely return the credentials in the knowledge that, even if the task we're accessing has released them, they haven't gone to the RCU cleanup code. We then change task_state() in procfs to use get_task_cred() rather than calling get_cred() on the result of __task_cred(), as that suffers from the same problem. Without this change, a BUG_ON in __put_cred() or in put_cred_rcu() can be tripped when it is noticed that the usage count is not zero as it ought to be, for example: kernel BUG at kernel/cred.c:168! invalid opcode: 0000 [#1] SMP last sysfs file: /sys/kernel/mm/ksm/run CPU 0 Pid: 2436, comm: master Not tainted 2.6.33.3-85.fc13.x86_64 #1 0HR330/OptiPlex 745 RIP: 0010:[] [] __put_cred+0xc/0x45 RSP: 0018:ffff88019e7e9eb8 EFLAGS: 00010202 RAX: 0000000000000001 RBX: ffff880161514480 RCX: 00000000ffffffff RDX: 00000000ffffffff RSI: ffff880140c690c0 RDI: ffff880140c690c0 RBP: ffff88019e7e9eb8 R08: 00000000000000d0 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000040 R12: ffff880140c690c0 R13: ffff88019e77aea0 R14: 00007fff336b0a5c R15: 0000000000000001 FS: 00007f12f50d97c0(0000) GS:ffff880007400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f8f461bc000 CR3: 00000001b26ce000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process master (pid: 2436, threadinfo ffff88019e7e8000, task ffff88019e77aea0) Stack: ffff88019e7e9ec8 ffffffff810698cd ffff88019e7e9ef8 ffffffff81069b45 <0> ffff880161514180 ffff880161514480 ffff880161514180 0000000000000000 <0> ffff88019e7e9f28 ffffffff8106aace 0000000000000001 0000000000000246 Call Trace: [] put_cred+0x13/0x15 [] commit_creds+0x16b/0x175 [] set_current_groups+0x47/0x4e [] sys_setgroups+0xf6/0x105 [] system_call_fastpath+0x16/0x1b Code: 48 8d 71 ff e8 7e 4e 15 00 85 c0 78 0b 8b 75 ec 48 89 df e8 ef 4a 15 00 48 83 c4 18 5b c9 c3 55 8b 07 8b 07 48 89 e5 85 c0 74 04 <0f> 0b eb fe 65 48 8b 04 25 00 cc 00 00 48 3b b8 58 04 00 00 75 RIP [] __put_cred+0xc/0x45 RSP ---[ end trace df391256a100ebdd ]--- Signed-off-by: David Howells Acked-by: Jiri Olsa Signed-off-by: Linus Torvalds --- fs/proc/array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index 9b58d38bc911..fff6572676ae 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -176,7 +176,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, if (tracer) tpid = task_pid_nr_ns(tracer, ns); } - cred = get_cred((struct cred *) __task_cred(p)); + cred = get_task_cred(p); seq_printf(m, "State:\t%s\n" "Tgid:\t%d\n" -- cgit v1.2.3 From 674b2222920012244ca59978b356b25412a8dcc7 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 13 Jul 2010 13:34:59 +0200 Subject: nfs: include space for the NUL in root path In root_nfs_name() it does the following: if (strlen(buf) + strlen(cp) > NFS_MAXPATHLEN) { printk(KERN_ERR "Root-NFS: Pathname for remote directory too long.\n"); return -1; } sprintf(nfs_export_path, buf, cp); In the original code if (strlen(buf) + strlen(cp) == NFS_MAXPATHLEN) then the sprintf() would lead to an overflow. Generally the rest of the code assumes that the path can have NFS_MAXPATHLEN (1024) characters and a NUL terminator so the fix is to add space to the nfs_export_path[] buffer. Signed-off-by: Dan Carpenter Signed-off-by: Trond Myklebust --- fs/nfs/nfsroot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 6bd19d843af7..df101d9f546a 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -105,7 +105,7 @@ static char nfs_root_name[256] __initdata = ""; static __be32 servaddr __initdata = 0; /* Name of directory to mount */ -static char nfs_export_path[NFS_MAXPATHLEN] __initdata = { 0, }; +static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = { 0, }; /* NFS-related data */ static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */ -- cgit v1.2.3 From b608b283a962caaa280756bc8563016a71712acf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 30 Jul 2010 15:31:54 -0400 Subject: NFS: kswapd must not block in nfs_release_page See https://bugzilla.kernel.org/show_bug.cgi?id=16056 If other processes are blocked waiting for kswapd to free up some memory so that they can make progress, then we cannot allow kswapd to block on those processes. Signed-off-by: Trond Myklebust Cc: stable@kernel.org --- fs/nfs/file.c | 13 +++++++++++-- fs/nfs/write.c | 4 ++-- 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 36a5e74f51b4..f036153d9f50 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -493,11 +494,19 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset) */ static int nfs_release_page(struct page *page, gfp_t gfp) { + struct address_space *mapping = page->mapping; + dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); /* Only do I/O if gfp is a superset of GFP_KERNEL */ - if ((gfp & GFP_KERNEL) == GFP_KERNEL) - nfs_wb_page(page->mapping->host, page); + if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL) { + int how = FLUSH_SYNC; + + /* Don't let kswapd deadlock waiting for OOM RPC calls */ + if (current_is_kswapd()) + how = 0; + nfs_commit_inode(mapping->host, how); + } /* If PagePrivate() is set, then the page is not freeable */ if (PagePrivate(page)) return 0; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 91679e2631ee..0a6c65a1f9d7 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1379,7 +1379,7 @@ static const struct rpc_call_ops nfs_commit_ops = { .rpc_release = nfs_commit_release, }; -static int nfs_commit_inode(struct inode *inode, int how) +int nfs_commit_inode(struct inode *inode, int how) { LIST_HEAD(head); int may_wait = how & FLUSH_SYNC; @@ -1443,7 +1443,7 @@ out_mark_dirty: return ret; } #else -static int nfs_commit_inode(struct inode *inode, int how) +int nfs_commit_inode(struct inode *inode, int how) { return 0; } -- cgit v1.2.3 From cfb506e1d330387dfaf334dd493b3773d388863d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 30 Jul 2010 15:31:57 -0400 Subject: NFS: Ensure that writepage respects the nonblock flag Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 0a6c65a1f9d7..bb72ad34d51d 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -222,7 +222,7 @@ static void nfs_end_page_writeback(struct page *page) clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); } -static struct nfs_page *nfs_find_and_lock_request(struct page *page) +static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock) { struct inode *inode = page->mapping->host; struct nfs_page *req; @@ -241,7 +241,10 @@ static struct nfs_page *nfs_find_and_lock_request(struct page *page) * request as dirty (in which case we don't care). */ spin_unlock(&inode->i_lock); - ret = nfs_wait_on_request(req); + if (!nonblock) + ret = nfs_wait_on_request(req); + else + ret = -EAGAIN; nfs_release_request(req); if (ret != 0) return ERR_PTR(ret); @@ -256,12 +259,12 @@ static struct nfs_page *nfs_find_and_lock_request(struct page *page) * May return an error if the user signalled nfs_wait_on_request(). */ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, - struct page *page) + struct page *page, bool nonblock) { struct nfs_page *req; int ret = 0; - req = nfs_find_and_lock_request(page); + req = nfs_find_and_lock_request(page, nonblock); if (!req) goto out; ret = PTR_ERR(req); @@ -283,12 +286,20 @@ out: static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) { struct inode *inode = page->mapping->host; + int ret; nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); nfs_pageio_cond_complete(pgio, page->index); - return nfs_page_async_flush(pgio, page); + ret = nfs_page_async_flush(pgio, page, + wbc->sync_mode == WB_SYNC_NONE || + wbc->nonblocking != 0); + if (ret == -EAGAIN) { + redirty_page_for_writepage(wbc, page); + ret = 0; + } + return ret; } /* @@ -1546,7 +1557,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage, nfs_fscache_release_page(page, GFP_KERNEL); - req = nfs_find_and_lock_request(page); + req = nfs_find_and_lock_request(page, false); ret = PTR_ERR(req); if (IS_ERR(req)) goto out; -- cgit v1.2.3 From 51c20fcced5badee0e2021c6c89f44aa3cbd72aa Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 30 Jul 2010 15:25:19 +0100 Subject: CIFS: Remove __exit mark from cifs_exit_dns_resolver() Remove the __exit mark from cifs_exit_dns_resolver() as it's called by the module init routine in case of error, and so may have been discarded during linkage. Signed-off-by: David Howells Acked-by: Jeff Layton Signed-off-by: Linus Torvalds --- fs/cifs/dns_resolve.c | 2 +- fs/cifs/dns_resolve.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c index 49315cbf742d..853a968e82d7 100644 --- a/fs/cifs/dns_resolve.c +++ b/fs/cifs/dns_resolve.c @@ -227,7 +227,7 @@ failed_put_cred: return ret; } -void __exit cifs_exit_dns_resolver(void) +void cifs_exit_dns_resolver(void) { key_revoke(dns_resolver_cache->thread_keyring); unregister_key_type(&key_type_dns_resolver); diff --git a/fs/cifs/dns_resolve.h b/fs/cifs/dns_resolve.h index 26b9eaa9f5ee..5d7f291df162 100644 --- a/fs/cifs/dns_resolve.h +++ b/fs/cifs/dns_resolve.h @@ -25,7 +25,7 @@ #ifdef __KERNEL__ extern int __init cifs_init_dns_resolver(void); -extern void __exit cifs_exit_dns_resolver(void); +extern void cifs_exit_dns_resolver(void); extern int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr); #endif /* KERNEL */ -- cgit v1.2.3 From 77a63f3d1e0a3e7ede8d10f569e8481b13ff47c5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 1 Aug 2010 13:40:40 -0400 Subject: NFS: Fix a typo in include/linux/nfs_fs.h nfs_commit_inode() needs to be defined irrespectively of whether or not we are supporting NFSv3 and NFSv4. Allow the compiler to optimise away code in the NFSv2-only case by converting it into an inlined stub function. Reported-and-tested-by: Ingo Molnar Signed-off-by: Trond Myklebust Signed-off-by: Linus Torvalds --- fs/nfs/write.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index bb72ad34d51d..9f81bdd91c55 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1454,11 +1454,6 @@ out_mark_dirty: return ret; } #else -int nfs_commit_inode(struct inode *inode, int how) -{ - return 0; -} - static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) { return 0; -- cgit v1.2.3