aboutsummaryrefslogtreecommitdiff
path: root/ipc
diff options
context:
space:
mode:
Diffstat (limited to 'ipc')
-rw-r--r--ipc/ipc_sysctl.c20
-rw-r--r--ipc/msg.c13
-rw-r--r--ipc/sem.c222
-rw-r--r--ipc/util.c27
4 files changed, 198 insertions, 84 deletions
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index 130dfece27a..b0e99deb6d0 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -62,7 +62,7 @@ static int proc_ipc_dointvec_minmax_orphans(ctl_table *table, int write,
return err;
}
-static int proc_ipc_callback_dointvec(ctl_table *table, int write,
+static int proc_ipc_callback_dointvec_minmax(ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table ipc_table;
@@ -72,7 +72,7 @@ static int proc_ipc_callback_dointvec(ctl_table *table, int write,
memcpy(&ipc_table, table, sizeof(ipc_table));
ipc_table.data = get_ipc(table);
- rc = proc_dointvec(&ipc_table, write, buffer, lenp, ppos);
+ rc = proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos);
if (write && !rc && lenp_bef == *lenp)
/*
@@ -152,15 +152,13 @@ static int proc_ipcauto_dointvec_minmax(ctl_table *table, int write,
#define proc_ipc_dointvec NULL
#define proc_ipc_dointvec_minmax NULL
#define proc_ipc_dointvec_minmax_orphans NULL
-#define proc_ipc_callback_dointvec NULL
+#define proc_ipc_callback_dointvec_minmax NULL
#define proc_ipcauto_dointvec_minmax NULL
#endif
static int zero;
static int one = 1;
-#ifdef CONFIG_CHECKPOINT_RESTORE
static int int_max = INT_MAX;
-#endif
static struct ctl_table ipc_kern_table[] = {
{
@@ -198,21 +196,27 @@ static struct ctl_table ipc_kern_table[] = {
.data = &init_ipc_ns.msg_ctlmax,
.maxlen = sizeof (init_ipc_ns.msg_ctlmax),
.mode = 0644,
- .proc_handler = proc_ipc_dointvec,
+ .proc_handler = proc_ipc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &int_max,
},
{
.procname = "msgmni",
.data = &init_ipc_ns.msg_ctlmni,
.maxlen = sizeof (init_ipc_ns.msg_ctlmni),
.mode = 0644,
- .proc_handler = proc_ipc_callback_dointvec,
+ .proc_handler = proc_ipc_callback_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &int_max,
},
{
.procname = "msgmnb",
.data = &init_ipc_ns.msg_ctlmnb,
.maxlen = sizeof (init_ipc_ns.msg_ctlmnb),
.mode = 0644,
- .proc_handler = proc_ipc_dointvec,
+ .proc_handler = proc_ipc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &int_max,
},
{
.procname = "sem",
diff --git a/ipc/msg.c b/ipc/msg.c
index 9e4310c546a..558aa91186b 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -695,6 +695,12 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
if (ipcperms(ns, &msq->q_perm, S_IWUGO))
goto out_unlock0;
+ /* raced with RMID? */
+ if (msq->q_perm.deleted) {
+ err = -EIDRM;
+ goto out_unlock0;
+ }
+
err = security_msg_queue_msgsnd(msq, msg, msgflg);
if (err)
goto out_unlock0;
@@ -901,6 +907,13 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl
goto out_unlock1;
ipc_lock_object(&msq->q_perm);
+
+ /* raced with RMID? */
+ if (msq->q_perm.deleted) {
+ msg = ERR_PTR(-EIDRM);
+ goto out_unlock0;
+ }
+
msg = find_msg(msq, &msgtyp, mode);
if (!IS_ERR(msg)) {
/*
diff --git a/ipc/sem.c b/ipc/sem.c
index 19c8b980d1f..db9d241af13 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -253,70 +253,112 @@ static void sem_rcu_free(struct rcu_head *head)
}
/*
+ * Wait until all currently ongoing simple ops have completed.
+ * Caller must own sem_perm.lock.
+ * New simple ops cannot start, because simple ops first check
+ * that sem_perm.lock is free.
+ * that a) sem_perm.lock is free and b) complex_count is 0.
+ */
+static void sem_wait_array(struct sem_array *sma)
+{
+ int i;
+ struct sem *sem;
+
+ if (sma->complex_count) {
+ /* The thread that increased sma->complex_count waited on
+ * all sem->lock locks. Thus we don't need to wait again.
+ */
+ return;
+ }
+
+ for (i = 0; i < sma->sem_nsems; i++) {
+ sem = sma->sem_base + i;
+ spin_unlock_wait(&sem->lock);
+ }
+}
+
+/*
* If the request contains only one semaphore operation, and there are
* no complex transactions pending, lock only the semaphore involved.
* Otherwise, lock the entire semaphore array, since we either have
* multiple semaphores in our own semops, or we need to look at
* semaphores from other pending complex operations.
- *
- * Carefully guard against sma->complex_count changing between zero
- * and non-zero while we are spinning for the lock. The value of
- * sma->complex_count cannot change while we are holding the lock,
- * so sem_unlock should be fine.
- *
- * The global lock path checks that all the local locks have been released,
- * checking each local lock once. This means that the local lock paths
- * cannot start their critical sections while the global lock is held.
*/
static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
int nsops)
{
- int locknum;
- again:
- if (nsops == 1 && !sma->complex_count) {
- struct sem *sem = sma->sem_base + sops->sem_num;
+ struct sem *sem;
- /* Lock just the semaphore we are interested in. */
- spin_lock(&sem->lock);
+ if (nsops != 1) {
+ /* Complex operation - acquire a full lock */
+ ipc_lock_object(&sma->sem_perm);
- /*
- * If sma->complex_count was set while we were spinning,
- * we may need to look at things we did not lock here.
+ /* And wait until all simple ops that are processed
+ * right now have dropped their locks.
*/
- if (unlikely(sma->complex_count)) {
- spin_unlock(&sem->lock);
- goto lock_array;
- }
+ sem_wait_array(sma);
+ return -1;
+ }
+
+ /*
+ * Only one semaphore affected - try to optimize locking.
+ * The rules are:
+ * - optimized locking is possible if no complex operation
+ * is either enqueued or processed right now.
+ * - The test for enqueued complex ops is simple:
+ * sma->complex_count != 0
+ * - Testing for complex ops that are processed right now is
+ * a bit more difficult. Complex ops acquire the full lock
+ * and first wait that the running simple ops have completed.
+ * (see above)
+ * Thus: If we own a simple lock and the global lock is free
+ * and complex_count is now 0, then it will stay 0 and
+ * thus just locking sem->lock is sufficient.
+ */
+ sem = sma->sem_base + sops->sem_num;
+ if (sma->complex_count == 0) {
/*
- * Another process is holding the global lock on the
- * sem_array; we cannot enter our critical section,
- * but have to wait for the global lock to be released.
+ * It appears that no complex operation is around.
+ * Acquire the per-semaphore lock.
*/
- if (unlikely(spin_is_locked(&sma->sem_perm.lock))) {
- spin_unlock(&sem->lock);
- spin_unlock_wait(&sma->sem_perm.lock);
- goto again;
+ spin_lock(&sem->lock);
+
+ /* Then check that the global lock is free */
+ if (!spin_is_locked(&sma->sem_perm.lock)) {
+ /* spin_is_locked() is not a memory barrier */
+ smp_mb();
+
+ /* Now repeat the test of complex_count:
+ * It can't change anymore until we drop sem->lock.
+ * Thus: if is now 0, then it will stay 0.
+ */
+ if (sma->complex_count == 0) {
+ /* fast path successful! */
+ return sops->sem_num;
+ }
}
+ spin_unlock(&sem->lock);
+ }
+
+ /* slow path: acquire the full lock */
+ ipc_lock_object(&sma->sem_perm);
- locknum = sops->sem_num;
+ if (sma->complex_count == 0) {
+ /* False alarm:
+ * There is no complex operation, thus we can switch
+ * back to the fast path.
+ */
+ spin_lock(&sem->lock);
+ ipc_unlock_object(&sma->sem_perm);
+ return sops->sem_num;
} else {
- int i;
- /*
- * Lock the semaphore array, and wait for all of the
- * individual semaphore locks to go away. The code
- * above ensures no new single-lock holders will enter
- * their critical section while the array lock is held.
+ /* Not a false alarm, thus complete the sequence for a
+ * full lock.
*/
- lock_array:
- ipc_lock_object(&sma->sem_perm);
- for (i = 0; i < sma->sem_nsems; i++) {
- struct sem *sem = sma->sem_base + i;
- spin_unlock_wait(&sem->lock);
- }
- locknum = -1;
+ sem_wait_array(sma);
+ return -1;
}
- return locknum;
}
static inline void sem_unlock(struct sem_array *sma, int locknum)
@@ -876,6 +918,24 @@ again:
}
/**
+ * set_semotime(sma, sops) - set sem_otime
+ * @sma: semaphore array
+ * @sops: operations that modified the array, may be NULL
+ *
+ * sem_otime is replicated to avoid cache line trashing.
+ * This function sets one instance to the current time.
+ */
+static void set_semotime(struct sem_array *sma, struct sembuf *sops)
+{
+ if (sops == NULL) {
+ sma->sem_base[0].sem_otime = get_seconds();
+ } else {
+ sma->sem_base[sops[0].sem_num].sem_otime =
+ get_seconds();
+ }
+}
+
+/**
* do_smart_update(sma, sops, nsops, otime, pt) - optimized update_queue
* @sma: semaphore array
* @sops: operations that were performed
@@ -925,17 +985,10 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop
}
}
}
- if (otime) {
- if (sops == NULL) {
- sma->sem_base[0].sem_otime = get_seconds();
- } else {
- sma->sem_base[sops[0].sem_num].sem_otime =
- get_seconds();
- }
- }
+ if (otime)
+ set_semotime(sma, sops);
}
-
/* The following counts are associated to each semaphore:
* semncnt number of tasks waiting on semval being nonzero
* semzcnt number of tasks waiting on semval being zero
@@ -1229,6 +1282,12 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
sem_lock(sma, NULL, -1);
+ if (sma->sem_perm.deleted) {
+ sem_unlock(sma, -1);
+ rcu_read_unlock();
+ return -EIDRM;
+ }
+
curr = &sma->sem_base[semnum];
ipc_assert_locked_object(&sma->sem_perm);
@@ -1283,12 +1342,14 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
int i;
sem_lock(sma, NULL, -1);
+ if (sma->sem_perm.deleted) {
+ err = -EIDRM;
+ goto out_unlock;
+ }
if(nsems > SEMMSL_FAST) {
if (!ipc_rcu_getref(sma)) {
- sem_unlock(sma, -1);
- rcu_read_unlock();
err = -EIDRM;
- goto out_free;
+ goto out_unlock;
}
sem_unlock(sma, -1);
rcu_read_unlock();
@@ -1301,10 +1362,8 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
rcu_read_lock();
sem_lock_and_putref(sma);
if (sma->sem_perm.deleted) {
- sem_unlock(sma, -1);
- rcu_read_unlock();
err = -EIDRM;
- goto out_free;
+ goto out_unlock;
}
}
for (i = 0; i < sma->sem_nsems; i++)
@@ -1322,8 +1381,8 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
struct sem_undo *un;
if (!ipc_rcu_getref(sma)) {
- rcu_read_unlock();
- return -EIDRM;
+ err = -EIDRM;
+ goto out_rcu_wakeup;
}
rcu_read_unlock();
@@ -1351,10 +1410,8 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
rcu_read_lock();
sem_lock_and_putref(sma);
if (sma->sem_perm.deleted) {
- sem_unlock(sma, -1);
- rcu_read_unlock();
err = -EIDRM;
- goto out_free;
+ goto out_unlock;
}
for (i = 0; i < nsems; i++)
@@ -1378,6 +1435,10 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
goto out_rcu_wakeup;
sem_lock(sma, NULL, -1);
+ if (sma->sem_perm.deleted) {
+ err = -EIDRM;
+ goto out_unlock;
+ }
curr = &sma->sem_base[semnum];
switch (cmd) {
@@ -1783,6 +1844,10 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
if (error)
goto out_rcu_wakeup;
+ error = -EIDRM;
+ locknum = sem_lock(sma, sops, nsops);
+ if (sma->sem_perm.deleted)
+ goto out_unlock_free;
/*
* semid identifiers are not unique - find_alloc_undo may have
* allocated an undo structure, it was invalidated by an RMID
@@ -1790,19 +1855,22 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
* This case can be detected checking un->semid. The existence of
* "un" itself is guaranteed by rcu.
*/
- error = -EIDRM;
- locknum = sem_lock(sma, sops, nsops);
if (un && un->semid == -1)
goto out_unlock_free;
error = perform_atomic_semop(sma, sops, nsops, un,
task_tgid_vnr(current));
- if (error <= 0) {
- if (alter && error == 0)
+ if (error == 0) {
+ /* If the operation was successful, then do
+ * the required updates.
+ */
+ if (alter)
do_smart_update(sma, sops, nsops, 1, &tasks);
-
- goto out_unlock_free;
+ else
+ set_semotime(sma, sops);
}
+ if (error <= 0)
+ goto out_unlock_free;
/* We need to sleep on this operation, so we put the current
* task into the pending queue and go to sleep.
@@ -1999,6 +2067,12 @@ void exit_sem(struct task_struct *tsk)
}
sem_lock(sma, NULL, -1);
+ /* exit_sem raced with IPC_RMID, nothing to do */
+ if (sma->sem_perm.deleted) {
+ sem_unlock(sma, -1);
+ rcu_read_unlock();
+ continue;
+ }
un = __lookup_undo(ulp, semid);
if (un == NULL) {
/* exit_sem raced with IPC_RMID+semget() that created
@@ -2061,6 +2135,14 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
struct sem_array *sma = it;
time_t sem_otime;
+ /*
+ * The proc interface isn't aware of sem_lock(), it calls
+ * ipc_lock_object() directly (in sysvipc_find_ipc).
+ * In order to stay compatible with sem_lock(), we must wait until
+ * all simple semop() calls have left their critical regions.
+ */
+ sem_wait_array(sma);
+
sem_otime = get_semotime(sma);
return seq_printf(s,
diff --git a/ipc/util.c b/ipc/util.c
index fdb8ae74077..7684f41bce7 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -17,12 +17,27 @@
* Pavel Emelianov <xemul@openvz.org>
*
* General sysv ipc locking scheme:
- * when doing ipc id lookups, take the ids->rwsem
- * rcu_read_lock()
- * obtain the ipc object (kern_ipc_perm)
- * perform security, capabilities, auditing and permission checks, etc.
- * acquire the ipc lock (kern_ipc_perm.lock) throught ipc_lock_object()
- * perform data updates (ie: SET, RMID, LOCK/UNLOCK commands)
+ * rcu_read_lock()
+ * obtain the ipc object (kern_ipc_perm) by looking up the id in an idr
+ * tree.
+ * - perform initial checks (capabilities, auditing and permission,
+ * etc).
+ * - perform read-only operations, such as STAT, INFO commands.
+ * acquire the ipc lock (kern_ipc_perm.lock) through
+ * ipc_lock_object()
+ * - perform data updates, such as SET, RMID commands and
+ * mechanism-specific operations (semop/semtimedop,
+ * msgsnd/msgrcv, shmat/shmdt).
+ * drop the ipc lock, through ipc_unlock_object().
+ * rcu_read_unlock()
+ *
+ * The ids->rwsem must be taken when:
+ * - creating, removing and iterating the existing entries in ipc
+ * identifier sets.
+ * - iterating through files under /proc/sysvipc/
+ *
+ * Note that sems have a special fast path that avoids kern_ipc_perm.lock -
+ * see sem_lock().
*/
#include <linux/mm.h>