From 48fec5d0a504dfbb302cb1dd24ebb0b82a46cce9 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Wed, 1 Jul 2015 16:27:46 +0800
Subject: ceph: EIO all operations after forced umount

This patch makes try_get_cap_refs() and __do_request() check
if the file system was forced umount, and return -EIO if it was.
This patch also adds a helper function to drops dirty caps and
wakes up blocking operation.

Signed-off-by: Yan, Zheng <zyan@redhat.com>
---
 fs/ceph/addr.c       |  2 +-
 fs/ceph/caps.c       |  8 ++++++++
 fs/ceph/mds_client.c | 54 +++++++++++++++++++++++++++++++++++++++++-----------
 fs/ceph/mds_client.h |  1 +
 fs/ceph/super.c      |  1 +
 5 files changed, 54 insertions(+), 12 deletions(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 890c50971a69..1594f2c590bd 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -717,7 +717,7 @@ static int ceph_writepages_start(struct address_space *mapping,
 	     wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
 	     (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
 
-	if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
+	if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
 		pr_warn("writepage_start %p on forced umount\n", inode);
 		return -EIO; /* we're in a forced umount, don't write! */
 	}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index ddd5e9471290..27b566874bc1 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2413,6 +2413,14 @@ again:
 			goto out_unlock;
 		}
 
+		if (!__ceph_is_any_caps(ci) &&
+		    ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+			dout("get_cap_refs %p forced umount\n", inode);
+			*err = -EIO;
+			ret = 1;
+			goto out_unlock;
+		}
+
 		dout("get_cap_refs %p have %s needed %s\n", inode,
 		     ceph_cap_string(have), ceph_cap_string(need));
 	}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 6aa07af67603..ddc471c32230 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2107,7 +2107,6 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
 	msg = create_request_message(mdsc, req, mds, drop_cap_releases);
 	if (IS_ERR(msg)) {
 		req->r_err = PTR_ERR(msg);
-		complete_request(mdsc, req);
 		return PTR_ERR(msg);
 	}
 	req->r_request = msg;
@@ -2135,7 +2134,7 @@ static int __do_request(struct ceph_mds_client *mdsc,
 {
 	struct ceph_mds_session *session = NULL;
 	int mds = -1;
-	int err = -EAGAIN;
+	int err = 0;
 
 	if (req->r_err || req->r_got_result) {
 		if (req->r_aborted)
@@ -2149,6 +2148,11 @@ static int __do_request(struct ceph_mds_client *mdsc,
 		err = -EIO;
 		goto finish;
 	}
+	if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+		dout("do_request forced umount\n");
+		err = -EIO;
+		goto finish;
+	}
 
 	put_request_session(req);
 
@@ -2196,13 +2200,15 @@ static int __do_request(struct ceph_mds_client *mdsc,
 
 out_session:
 	ceph_put_mds_session(session);
+finish:
+	if (err) {
+		dout("__do_request early error %d\n", err);
+		req->r_err = err;
+		complete_request(mdsc, req);
+		__unregister_request(mdsc, req);
+	}
 out:
 	return err;
-
-finish:
-	req->r_err = err;
-	complete_request(mdsc, req);
-	goto out;
 }
 
 /*
@@ -2289,8 +2295,6 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
 
 	if (req->r_err) {
 		err = req->r_err;
-		__unregister_request(mdsc, req);
-		dout("do_request early error %d\n", err);
 		goto out;
 	}
 
@@ -3555,7 +3559,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 {
 	u64 want_tid, want_flush, want_snap;
 
-	if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
+	if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
 		return;
 
 	dout("sync\n");
@@ -3584,7 +3588,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
  */
 static bool done_closing_sessions(struct ceph_mds_client *mdsc)
 {
-	if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
+	if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
 		return true;
 	return atomic_read(&mdsc->num_sessions) == 0;
 }
@@ -3643,6 +3647,34 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 	dout("stopped\n");
 }
 
+void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
+{
+	struct ceph_mds_session *session;
+	int mds;
+
+	dout("force umount\n");
+
+	mutex_lock(&mdsc->mutex);
+	for (mds = 0; mds < mdsc->max_sessions; mds++) {
+		session = __ceph_lookup_mds_session(mdsc, mds);
+		if (!session)
+			continue;
+		mutex_unlock(&mdsc->mutex);
+		mutex_lock(&session->s_mutex);
+		__close_session(mdsc, session);
+		if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
+			cleanup_session_requests(mdsc, session);
+			remove_session_caps(session);
+		}
+		mutex_unlock(&session->s_mutex);
+		ceph_put_mds_session(session);
+		mutex_lock(&mdsc->mutex);
+		kick_requests(mdsc, mds);
+	}
+	__wake_requests(mdsc, &mdsc->waiting_for_map);
+	mutex_unlock(&mdsc->mutex);
+}
+
 static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
 {
 	dout("stop\n");
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 762757e6cebf..f575eafe2261 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -366,6 +366,7 @@ extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
 
 extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
 extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
+extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc);
 extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
 
 extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index d1c833c321b9..69e6e19bcfc1 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -708,6 +708,7 @@ static void ceph_umount_begin(struct super_block *sb)
 	if (!fsc)
 		return;
 	fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
+	ceph_mdsc_force_umount(fsc->mdsc);
 	return;
 }
 
-- 
cgit v1.2.3


From a341d4df87487ae68189e0be869c39a2b0cb9aaa Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Wed, 1 Jul 2015 17:03:23 +0800
Subject: ceph: invalidate dirty pages after forced umount

After forced umount, ceph_writepages_start() skips flushing dirty
pages. To make sure inode's reference count get dropped to zero,
we need to invalidate dirty pages.

Signed-off-by: Yan, Zheng <zyan@redhat.com>
---
 fs/ceph/addr.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs/ceph')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 1594f2c590bd..98933350331c 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -719,6 +719,8 @@ static int ceph_writepages_start(struct address_space *mapping,
 
 	if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
 		pr_warn("writepage_start %p on forced umount\n", inode);
+		truncate_pagecache(inode, 0);
+		mapping_set_error(mapping, -EIO);
 		return -EIO; /* we're in a forced umount, don't write! */
 	}
 	if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
-- 
cgit v1.2.3


From 23078637e05460428f803be7d0f46908df8a970a Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Mon, 20 Jul 2015 10:14:06 +0800
Subject: ceph: fix queuing inode to mdsdir's snaprealm

During MDS failovers, MClientSnap message may cause kclient to move
some inodes from root directory's snaprealm to mdsdir's snaprealm
and queue snapshots for these inodes. For a FS has never created any
snapshot, both root directory's snaprealm and mdsdir's snaprealm
share the same snapshot contexts (both are ceph_empty_snapc). This
confuses ceph_put_wrbuffer_cap_refs(), make it unable to distinguish
snapshot buffers from head buffers.

The fix is do not use ceph_empty_snapc as snaprealm's cached context.

Signed-off-by: Yan, Zheng <zyan@redhat.com>
---
 fs/ceph/snap.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 233d906aec02..4aa7122a8d38 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -338,12 +338,6 @@ static int build_snap_context(struct ceph_snap_realm *realm)
 		return 0;
 	}
 
-	if (num == 0 && realm->seq == ceph_empty_snapc->seq) {
-		ceph_get_snap_context(ceph_empty_snapc);
-		snapc = ceph_empty_snapc;
-		goto done;
-	}
-
 	/* alloc new snap context */
 	err = -ENOMEM;
 	if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64))
@@ -381,7 +375,6 @@ static int build_snap_context(struct ceph_snap_realm *realm)
 	     realm->ino, realm, snapc, snapc->seq,
 	     (unsigned int) snapc->num_snaps);
 
-done:
 	ceph_put_snap_context(realm->cached_context);
 	realm->cached_context = snapc;
 	return 0;
-- 
cgit v1.2.3


From 1550d34e5626a20a2e12c73bdc1e6e217a0ba897 Mon Sep 17 00:00:00 2001
From: Brad Hubbard <bhubbard@redhat.com>
Date: Tue, 18 Aug 2015 10:18:53 +0800
Subject: ceph: remove redundant test of head->safe and silence static analysis
 warnings

Signed-off-by: Brad Hubbard <bhubbard@redhat.com>
Signed-off-by: Yan, Zheng <zyan@redhat.com>
---
 fs/ceph/mds_client.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index ddc471c32230..d4eaa849a820 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2415,7 +2415,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 		mutex_unlock(&mdsc->mutex);
 		goto out;
 	}
-	if (req->r_got_safe && !head->safe) {
+	if (req->r_got_safe) {
 		pr_warn("got unsafe after safe on %llu from mds%d\n",
 			   tid, mds);
 		mutex_unlock(&mdsc->mutex);
-- 
cgit v1.2.3


From a43137f7b0f1467cf3005b6ff6574d978642d247 Mon Sep 17 00:00:00 2001
From: Jianpeng Ma <jianpeng.ma@intel.com>
Date: Tue, 18 Aug 2015 10:23:50 +0800
Subject: ceph: remove the useless judgement

err != 0 is already handled. So skip this.

Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com>
Signed-off-by: Yan, Zheng <zyan@redhat.com>
---
 fs/ceph/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 8b79d87eaf46..2ebcbd4c163b 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -279,7 +279,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 	if (err)
 		goto out_req;
 
-	if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
+	if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
 		err = ceph_handle_notrace_create(dir, dentry);
 
 	if (d_unhashed(dentry)) {
-- 
cgit v1.2.3


From e36d571d70c7f46b20c28d81025fd5fc044a8e22 Mon Sep 17 00:00:00 2001
From: Jianpeng Ma <jianpeng.ma@intel.com>
Date: Tue, 18 Aug 2015 10:25:35 +0800
Subject: ceph: no need to get parent inode in ceph_open

parent inode is needed in creating new inode case.  For ceph_open,
the target inode already exists.

Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com>
Signed-off-by: Yan, Zheng <zyan@redhat.com>
---
 fs/ceph/file.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 2ebcbd4c163b..90ec110b8400 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -136,7 +136,6 @@ int ceph_open(struct inode *inode, struct file *file)
 	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	struct ceph_file_info *cf = file->private_data;
-	struct inode *parent_inode = NULL;
 	int err;
 	int flags, fmode, wanted;
 
@@ -210,10 +209,7 @@ int ceph_open(struct inode *inode, struct file *file)
 	ihold(inode);
 
 	req->r_num_caps = 1;
-	if (flags & O_CREAT)
-		parent_inode = ceph_get_dentry_parent_inode(file->f_path.dentry);
-	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
-	iput(parent_inode);
+	err = ceph_mdsc_do_request(mdsc, NULL, req);
 	if (!err)
 		err = ceph_init_file(inode, file, req->r_fmode);
 	ceph_mdsc_put_request(req);
-- 
cgit v1.2.3


From 5fdb1389e1399d6801a8c5d10952ef4153039fb2 Mon Sep 17 00:00:00 2001
From: Jianpeng Ma <jianpeng.ma@intel.com>
Date: Tue, 18 Aug 2015 10:30:38 +0800
Subject: ceph: cleanup use of ceph_msg_get

Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com>
Signed-off-by: Yan, Zheng <zyan@redhat.com>
---
 fs/ceph/mds_client.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index d4eaa849a820..51cb02da75d9 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2524,8 +2524,7 @@ out_err:
 		if (err) {
 			req->r_err = err;
 		} else {
-			req->r_reply = msg;
-			ceph_msg_get(msg);
+			req->r_reply =  ceph_msg_get(msg);
 			req->r_got_result = true;
 		}
 	} else {
-- 
cgit v1.2.3


From 55b0b31cbc09f80db384671e22cdc94b2aa26b29 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Mon, 7 Sep 2015 11:35:01 +0800
Subject: ceph: get inode size for each append write

Signed-off-by: Yan, Zheng <zyan@redhat.com>
---
 fs/ceph/file.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs/ceph')

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 90ec110b8400..0c62868b5c56 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -952,6 +952,12 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	/* We can write back this queue in page reclaim */
 	current->backing_dev_info = inode_to_bdi(inode);
 
+	if (iocb->ki_flags & IOCB_APPEND) {
+		err = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
+		if (err < 0)
+			goto out;
+	}
+
 	err = generic_write_checks(iocb, from);
 	if (err <= 0)
 		goto out;
-- 
cgit v1.2.3


From 438386853d4c0c48fe73bf05a7d61c70ca5a3bfb Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Mon, 7 Sep 2015 15:46:24 +0800
Subject: ceph: improve readahead for file holes

When readahead encounters file holes, osd reply returns error -ENOENT,
finish_read() skips adding pages to the the page cache. So readahead
does not work for file holes. The fix is adding zero pages to the
page cache when -ENOENT is returned.

Signed-off-by: Yan, Zheng <zyan@redhat.com>
---
 fs/ceph/addr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 98933350331c..6471e28e0586 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -276,7 +276,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
 	for (i = 0; i < num_pages; i++) {
 		struct page *page = osd_data->pages[i];
 
-		if (rc < 0)
+		if (rc < 0 && rc != ENOENT)
 			goto unlock;
 		if (bytes < (int)PAGE_CACHE_SIZE) {
 			/* zero (remainder of) page */
-- 
cgit v1.2.3