aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/percpu-rw-semaphore.txt27
-rw-r--r--fs/block_dev.c27
-rw-r--r--include/linux/fs.h3
-rw-r--r--include/linux/percpu-rwsem.h89
4 files changed, 135 insertions, 11 deletions
diff --git a/Documentation/percpu-rw-semaphore.txt b/Documentation/percpu-rw-semaphore.txt
new file mode 100644
index 00000000000..eddd7709472
--- /dev/null
+++ b/Documentation/percpu-rw-semaphore.txt
@@ -0,0 +1,27 @@
+Percpu rw semaphores
+--------------------
+
+Percpu rw semaphores is a new read-write semaphore design that is
+optimized for locking for reading.
+
+The problem with traditional read-write semaphores is that when multiple
+cores take the lock for reading, the cache line containing the semaphore
+is bouncing between L1 caches of the cores, causing performance
+degradation.
+
+Locking for reading it very fast, it uses RCU and it avoids any atomic
+instruction in the lock and unlock path. On the other hand, locking for
+writing is very expensive, it calls synchronize_rcu() that can take
+hundreds of microseconds.
+
+The lock is declared with "struct percpu_rw_semaphore" type.
+The lock is initialized percpu_init_rwsem, it returns 0 on success and
+-ENOMEM on allocation failure.
+The lock must be freed with percpu_free_rwsem to avoid memory leak.
+
+The lock is locked for read with percpu_down_read, percpu_up_read and
+for write with percpu_down_write, percpu_up_write.
+
+The idea of using RCU for optimized rw-lock was introduced by
+Eric Dumazet <eric.dumazet@gmail.com>.
+The code was written by Mikulas Patocka <mpatocka@redhat.com>
diff --git a/fs/block_dev.c b/fs/block_dev.c
index cdfb625824e..7eeb0635338 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -127,7 +127,7 @@ int set_blocksize(struct block_device *bdev, int size)
return -EINVAL;
/* Prevent starting I/O or mapping the device */
- down_write(&bdev->bd_block_size_semaphore);
+ percpu_down_write(&bdev->bd_block_size_semaphore);
/* Check that the block device is not memory mapped */
mapping = bdev->bd_inode->i_mapping;
@@ -135,7 +135,7 @@ int set_blocksize(struct block_device *bdev, int size)
if (!prio_tree_empty(&mapping->i_mmap) ||
!list_empty(&mapping->i_mmap_nonlinear)) {
mutex_unlock(&mapping->i_mmap_mutex);
- up_write(&bdev->bd_block_size_semaphore);
+ percpu_up_write(&bdev->bd_block_size_semaphore);
return -EBUSY;
}
mutex_unlock(&mapping->i_mmap_mutex);
@@ -148,7 +148,7 @@ int set_blocksize(struct block_device *bdev, int size)
kill_bdev(bdev);
}
- up_write(&bdev->bd_block_size_semaphore);
+ percpu_up_write(&bdev->bd_block_size_semaphore);
return 0;
}
@@ -460,6 +460,12 @@ static struct inode *bdev_alloc_inode(struct super_block *sb)
struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
if (!ei)
return NULL;
+
+ if (unlikely(percpu_init_rwsem(&ei->bdev.bd_block_size_semaphore))) {
+ kmem_cache_free(bdev_cachep, ei);
+ return NULL;
+ }
+
return &ei->vfs_inode;
}
@@ -468,6 +474,8 @@ static void bdev_i_callback(struct rcu_head *head)
struct inode *inode = container_of(head, struct inode, i_rcu);
struct bdev_inode *bdi = BDEV_I(inode);
+ percpu_free_rwsem(&bdi->bdev.bd_block_size_semaphore);
+
kmem_cache_free(bdev_cachep, bdi);
}
@@ -491,7 +499,6 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
/* Initialize mutex for freeze. */
mutex_init(&bdev->bd_fsfreeze_mutex);
- init_rwsem(&bdev->bd_block_size_semaphore);
}
static inline void __bd_forget(struct inode *inode)
@@ -1593,11 +1600,11 @@ ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
ssize_t ret;
struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
- down_read(&bdev->bd_block_size_semaphore);
+ percpu_down_read(&bdev->bd_block_size_semaphore);
ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
- up_read(&bdev->bd_block_size_semaphore);
+ percpu_up_read(&bdev->bd_block_size_semaphore);
return ret;
}
@@ -1622,7 +1629,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
blk_start_plug(&plug);
- down_read(&bdev->bd_block_size_semaphore);
+ percpu_down_read(&bdev->bd_block_size_semaphore);
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
if (ret > 0 || ret == -EIOCBQUEUED) {
@@ -1633,7 +1640,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
ret = err;
}
- up_read(&bdev->bd_block_size_semaphore);
+ percpu_up_read(&bdev->bd_block_size_semaphore);
blk_finish_plug(&plug);
@@ -1646,11 +1653,11 @@ int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
int ret;
struct block_device *bdev = I_BDEV(file->f_mapping->host);
- down_read(&bdev->bd_block_size_semaphore);
+ percpu_down_read(&bdev->bd_block_size_semaphore);
ret = generic_file_mmap(file, vma);
- up_read(&bdev->bd_block_size_semaphore);
+ percpu_up_read(&bdev->bd_block_size_semaphore);
return ret;
}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e60bbd0225d..24e1229cdfe 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -10,6 +10,7 @@
#include <linux/ioctl.h>
#include <linux/blk_types.h>
#include <linux/types.h>
+#include <linux/percpu-rwsem.h>
/*
* It's silly to have NR_OPEN bigger than NR_FILE, but you can change
@@ -726,7 +727,7 @@ struct block_device {
/* Mutex for freeze */
struct mutex bd_fsfreeze_mutex;
/* A semaphore that prevents I/O while block size is being changed */
- struct rw_semaphore bd_block_size_semaphore;
+ struct percpu_rw_semaphore bd_block_size_semaphore;
};
/*
diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
new file mode 100644
index 00000000000..cf80f7e5277
--- /dev/null
+++ b/include/linux/percpu-rwsem.h
@@ -0,0 +1,89 @@
+#ifndef _LINUX_PERCPU_RWSEM_H
+#define _LINUX_PERCPU_RWSEM_H
+
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/delay.h>
+
+struct percpu_rw_semaphore {
+ unsigned __percpu *counters;
+ bool locked;
+ struct mutex mtx;
+};
+
+static inline void percpu_down_read(struct percpu_rw_semaphore *p)
+{
+ rcu_read_lock();
+ if (unlikely(p->locked)) {
+ rcu_read_unlock();
+ mutex_lock(&p->mtx);
+ this_cpu_inc(*p->counters);
+ mutex_unlock(&p->mtx);
+ return;
+ }
+ this_cpu_inc(*p->counters);
+ rcu_read_unlock();
+}
+
+static inline void percpu_up_read(struct percpu_rw_semaphore *p)
+{
+ /*
+ * On X86, write operation in this_cpu_dec serves as a memory unlock
+ * barrier (i.e. memory accesses may be moved before the write, but
+ * no memory accesses are moved past the write).
+ * On other architectures this may not be the case, so we need smp_mb()
+ * there.
+ */
+#if defined(CONFIG_X86) && (!defined(CONFIG_X86_PPRO_FENCE) && !defined(CONFIG_X86_OOSTORE))
+ barrier();
+#else
+ smp_mb();
+#endif
+ this_cpu_dec(*p->counters);
+}
+
+static inline unsigned __percpu_count(unsigned __percpu *counters)
+{
+ unsigned total = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ total += ACCESS_ONCE(*per_cpu_ptr(counters, cpu));
+
+ return total;
+}
+
+static inline void percpu_down_write(struct percpu_rw_semaphore *p)
+{
+ mutex_lock(&p->mtx);
+ p->locked = true;
+ synchronize_rcu();
+ while (__percpu_count(p->counters))
+ msleep(1);
+ smp_rmb(); /* paired with smp_mb() in percpu_sem_up_read() */
+}
+
+static inline void percpu_up_write(struct percpu_rw_semaphore *p)
+{
+ p->locked = false;
+ mutex_unlock(&p->mtx);
+}
+
+static inline int percpu_init_rwsem(struct percpu_rw_semaphore *p)
+{
+ p->counters = alloc_percpu(unsigned);
+ if (unlikely(!p->counters))
+ return -ENOMEM;
+ p->locked = false;
+ mutex_init(&p->mtx);
+ return 0;
+}
+
+static inline void percpu_free_rwsem(struct percpu_rw_semaphore *p)
+{
+ free_percpu(p->counters);
+ p->counters = NULL; /* catch use after free bugs */
+}
+
+#endif