From 684999149002dd046269666a390458e0acb38280 Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Fri, 6 Feb 2009 13:52:43 -0700
Subject: Rename struct file->f_ep_lock

This lock moves out of the CONFIG_EPOLL ifdef and becomes f_lock.  For now,
epoll remains the only user, but a future patch will use it to protect
f_flags as well.

Cc: Davide Libenzi <davidel@xmailserver.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 fs/eventpoll.c  | 12 +++++++-----
 fs/file_table.c |  1 +
 2 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 011b9b8c90c6..c5c424f23fd5 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -417,10 +417,10 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 	ep_unregister_pollwait(ep, epi);
 
 	/* Remove the current item from the list of epoll hooks */
-	spin_lock(&file->f_ep_lock);
+	spin_lock(&file->f_lock);
 	if (ep_is_linked(&epi->fllink))
 		list_del_init(&epi->fllink);
-	spin_unlock(&file->f_ep_lock);
+	spin_unlock(&file->f_lock);
 
 	rb_erase(&epi->rbn, &ep->rbr);
 
@@ -538,7 +538,7 @@ void eventpoll_release_file(struct file *file)
 	struct epitem *epi;
 
 	/*
-	 * We don't want to get "file->f_ep_lock" because it is not
+	 * We don't want to get "file->f_lock" because it is not
 	 * necessary. It is not necessary because we're in the "struct file"
 	 * cleanup path, and this means that noone is using this file anymore.
 	 * So, for example, epoll_ctl() cannot hit here sicne if we reach this
@@ -547,6 +547,8 @@ void eventpoll_release_file(struct file *file)
 	 * will correctly serialize the operation. We do need to acquire
 	 * "ep->mtx" after "epmutex" because ep_remove() requires it when called
 	 * from anywhere but ep_free().
+	 *
+	 * Besides, ep_remove() acquires the lock, so we can't hold it here.
 	 */
 	mutex_lock(&epmutex);
 
@@ -785,9 +787,9 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 		goto error_unregister;
 
 	/* Add the current item to the list of active epoll hook for this file */
-	spin_lock(&tfile->f_ep_lock);
+	spin_lock(&tfile->f_lock);
 	list_add_tail(&epi->fllink, &tfile->f_ep_links);
-	spin_unlock(&tfile->f_ep_lock);
+	spin_unlock(&tfile->f_lock);
 
 	/*
 	 * Add the current item to the RB tree. All RB tree operations are
diff --git a/fs/file_table.c b/fs/file_table.c
index bbeeac6efa1a..aa1e18050282 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -127,6 +127,7 @@ struct file *get_empty_filp(void)
 	atomic_long_set(&f->f_count, 1);
 	rwlock_init(&f->f_owner.lock);
 	f->f_cred = get_cred(cred);
+	spin_lock_init(&f->f_lock);
 	eventpoll_init_file(f);
 	/* f->f_version: 0 */
 	return f;
-- 
cgit v1.2.3


From db1dd4d376134eba0e08af523b61cc566a4ea1cd Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Fri, 6 Feb 2009 15:25:24 -0700
Subject: Use f_lock to protect f_flags

Traditionally, changes to struct file->f_flags have been done under BKL
protection, or with no protection at all.  This patch causes all f_flags
changes after file open/creation time to be done under protection of
f_lock.  This allows the removal of some BKL usage and fixes a number of
longstanding (if microscopic) races.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 fs/fcntl.c    | 2 ++
 fs/ioctl.c    | 7 ++++---
 fs/nfsd/vfs.c | 5 ++++-
 3 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/fcntl.c b/fs/fcntl.c
index bd215cc791da..04df8570a2d2 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -189,7 +189,9 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 		}
 	}
 
+	spin_lock(&filp->f_lock);
 	filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
+	spin_unlock(&filp->f_lock);
  out:
 	unlock_kernel();
 	return error;
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 240ec63984cb..421aab465dab 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -404,10 +404,12 @@ static int ioctl_fionbio(struct file *filp, int __user *argp)
 	if (O_NONBLOCK != O_NDELAY)
 		flag |= O_NDELAY;
 #endif
+	spin_lock(&filp->f_lock);
 	if (on)
 		filp->f_flags |= flag;
 	else
 		filp->f_flags &= ~flag;
+	spin_unlock(&filp->f_lock);
 	return error;
 }
 
@@ -432,10 +434,12 @@ static int ioctl_fioasync(unsigned int fd, struct file *filp,
 	if (error)
 		return error;
 
+	spin_lock(&filp->f_lock);
 	if (on)
 		filp->f_flags |= FASYNC;
 	else
 		filp->f_flags &= ~FASYNC;
+	spin_unlock(&filp->f_lock);
 	return error;
 }
 
@@ -499,10 +503,7 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
 		break;
 
 	case FIONBIO:
-		/* BKL needed to avoid races tweaking f_flags */
-		lock_kernel();
 		error = ioctl_fionbio(filp, argp);
-		unlock_kernel();
 		break;
 
 	case FIOASYNC:
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 6e50aaa56ca2..c165a6403df0 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -998,8 +998,11 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 
 	if (!EX_ISSYNC(exp))
 		stable = 0;
-	if (stable && !EX_WGATHER(exp))
+	if (stable && !EX_WGATHER(exp)) {
+		spin_lock(&file->f_lock);
 		file->f_flags |= O_SYNC;
+		spin_unlock(&file->f_lock);
+	}
 
 	/* Write the data. */
 	oldfs = get_fs(); set_fs(KERNEL_DS);
-- 
cgit v1.2.3


From 76398425bb06b07cc3a3b1ce169c67dc9d6874ed Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Sun, 1 Feb 2009 14:26:59 -0700
Subject: Move FASYNC bit handling to f_op->fasync()

Removing the BKL from FASYNC handling ran into the challenge of keeping the
setting of the FASYNC bit in filp->f_flags atomic with regard to calls to
the underlying fasync() function.  Andi Kleen suggested moving the handling
of that bit into fasync(); this patch does exactly that.  As a result, we
have a couple of internal API changes: fasync() must now manage the FASYNC
bit, and it will be called without the BKL held.

As it happens, every fasync() implementation in the kernel with one
exception calls fasync_helper().  So, if we make fasync_helper() set the
FASYNC bit, we can avoid making any changes to the other fasync()
functions - as long as those functions, themselves, have proper locking.
Most fasync() implementations do nothing but call fasync_helper() - which
has its own lock - so they are easily verified as correct.  The BKL had
already been pushed down into the rest.

The networking code has its own version of fasync_helper(), so that code
has been augmented with explicit FASYNC bit handling.

Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: David Miller <davem@davemloft.net>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 fs/fcntl.c | 29 ++++++++++++++++-------------
 fs/ioctl.c | 13 +------------
 2 files changed, 17 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/fcntl.c b/fs/fcntl.c
index 04df8570a2d2..431bb6459273 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -141,7 +141,7 @@ SYSCALL_DEFINE1(dup, unsigned int, fildes)
 	return ret;
 }
 
-#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | FASYNC | O_DIRECT | O_NOATIME)
+#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
 
 static int setfl(int fd, struct file * filp, unsigned long arg)
 {
@@ -177,23 +177,19 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 		return error;
 
 	/*
-	 * We still need a lock here for now to keep multiple FASYNC calls
-	 * from racing with each other.
+	 * ->fasync() is responsible for setting the FASYNC bit.
 	 */
-	lock_kernel();
-	if ((arg ^ filp->f_flags) & FASYNC) {
-		if (filp->f_op && filp->f_op->fasync) {
-			error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
-			if (error < 0)
-				goto out;
-		}
+	if (((arg ^ filp->f_flags) & FASYNC) && filp->f_op &&
+			filp->f_op->fasync) {
+		error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
+		if (error < 0)
+			goto out;
 	}
-
 	spin_lock(&filp->f_lock);
 	filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
 	spin_unlock(&filp->f_lock);
+
  out:
-	unlock_kernel();
 	return error;
 }
 
@@ -518,7 +514,7 @@ static DEFINE_RWLOCK(fasync_lock);
 static struct kmem_cache *fasync_cache __read_mostly;
 
 /*
- * fasync_helper() is used by some character device drivers (mainly mice)
+ * fasync_helper() is used by almost all character device drivers
  * to set up the fasync queue. It returns negative on error, 0 if it did
  * no changes and positive if it added/deleted the entry.
  */
@@ -557,6 +553,13 @@ int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fap
 		result = 1;
 	}
 out:
+	/* Fix up FASYNC bit while still holding fasync_lock */
+	spin_lock(&filp->f_lock);
+	if (on)
+		filp->f_flags |= FASYNC;
+	else
+		filp->f_flags &= ~FASYNC;
+	spin_unlock(&filp->f_lock);
 	write_unlock_irq(&fasync_lock);
 	return result;
 }
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 421aab465dab..e8e89edba576 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -427,19 +427,11 @@ static int ioctl_fioasync(unsigned int fd, struct file *filp,
 	/* Did FASYNC state change ? */
 	if ((flag ^ filp->f_flags) & FASYNC) {
 		if (filp->f_op && filp->f_op->fasync)
+			/* fasync() adjusts filp->f_flags */
 			error = filp->f_op->fasync(fd, filp, on);
 		else
 			error = -ENOTTY;
 	}
-	if (error)
-		return error;
-
-	spin_lock(&filp->f_lock);
-	if (on)
-		filp->f_flags |= FASYNC;
-	else
-		filp->f_flags &= ~FASYNC;
-	spin_unlock(&filp->f_lock);
 	return error;
 }
 
@@ -507,10 +499,7 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
 		break;
 
 	case FIOASYNC:
-		/* BKL needed to avoid races tweaking f_flags */
-		lock_kernel();
 		error = ioctl_fioasync(fd, filp, argp);
-		unlock_kernel();
 		break;
 
 	case FIOQSIZE:
-- 
cgit v1.2.3


From 60aa49243d09afc873f082567d2e3c16634ced84 Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Sun, 1 Feb 2009 14:52:56 -0700
Subject: Rationalize fasync return values

Most fasync implementations do something like:

     return fasync_helper(...);

But fasync_helper() will return a positive value at times - a feature used
in at least one place.  Thus, a number of other drivers do:

     err = fasync_helper(...);
     if (err < 0)
             return err;
     return 0;

In the interests of consistency and more concise code, it makes sense to
map positive return values onto zero where ->fasync() is called.

Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 fs/fcntl.c |  2 ++
 fs/ioctl.c |  2 +-
 fs/pipe.c  | 16 +++-------------
 3 files changed, 6 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/fcntl.c b/fs/fcntl.c
index 431bb6459273..d865ca66ccba 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -184,6 +184,8 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 		error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
 		if (error < 0)
 			goto out;
+		if (error > 0)
+			error = 0;
 	}
 	spin_lock(&filp->f_lock);
 	filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index e8e89edba576..ac2d47e43926 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -432,7 +432,7 @@ static int ioctl_fioasync(unsigned int fd, struct file *filp,
 		else
 			error = -ENOTTY;
 	}
-	return error;
+	return error < 0 ? error : 0;
 }
 
 static int ioctl_fsfreeze(struct file *filp)
diff --git a/fs/pipe.c b/fs/pipe.c
index 14f502b89cf5..94ad15967cf9 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -667,10 +667,7 @@ pipe_read_fasync(int fd, struct file *filp, int on)
 	retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers);
 	mutex_unlock(&inode->i_mutex);
 
-	if (retval < 0)
-		return retval;
-
-	return 0;
+	return retval;
 }
 
 
@@ -684,10 +681,7 @@ pipe_write_fasync(int fd, struct file *filp, int on)
 	retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers);
 	mutex_unlock(&inode->i_mutex);
 
-	if (retval < 0)
-		return retval;
-
-	return 0;
+	return retval;
 }
 
 
@@ -706,11 +700,7 @@ pipe_rdwr_fasync(int fd, struct file *filp, int on)
 			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
 	}
 	mutex_unlock(&inode->i_mutex);
-
-	if (retval < 0)
-		return retval;
-
-	return 0;
+	return retval;
 }
 
 
-- 
cgit v1.2.3