From 5e9d527591421ccdb16acb8c23662231135d8686 Mon Sep 17 00:00:00 2001
From: Manfred Spraul
Date: Mon, 30 Sep 2013 13:45:04 -0700
Subject: ipc/sem.c: fix race in sem_lock()

The exclusion of complex operations in sem_lock() is insufficient: after
acquiring the per-semaphore lock, a simple op must first check that
sem_perm.lock is not locked and only after that test check complex_count.
The current code does it the other way around - and that creates a race.
Details are below.

The patch is a complete rewrite of sem_lock(), based in part on the code
from Mike Galbraith.  It removes all gotos and all loops and thus the
risk of livelocks.

I have tested the patch (together with the next one) on my i3 laptop and
it didn't cause any problems.

The bug is probably also present in 3.10 and 3.11, but for these kernels
it might be simpler just to move the test of sma->complex_count after the
spin_is_locked() test.

Details of the bug:

Assume:
 - sma->complex_count = 0.
 - Thread 1: semtimedop(complex op that must sleep)
 - Thread 2: semtimedop(simple op).

Pseudo-Trace:

Thread 1: sem_lock(): acquire sem_perm.lock
Thread 1: sem_lock(): check for ongoing simple ops
			Nothing ongoing, thread 2 is still before sem_lock().
Thread 1: try_atomic_semop()
	<<< preempted.

Thread 2: sem_lock():
	static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
				      int nsops)
	{
		int locknum;
	 again:
		if (nsops == 1 && !sma->complex_count) {
			struct sem *sem = sma->sem_base + sops->sem_num;

			/* Lock just the semaphore we are interested in. */
			spin_lock(&sem->lock);

			/*
			 * If sma->complex_count was set while we were spinning,
			 * we may need to look at things we did not lock here.
			 */
			if (unlikely(sma->complex_count)) {
				spin_unlock(&sem->lock);
				goto lock_array;
			}
<<<<<<<<<
<<< complex_count is still 0.
<<<
<<< Here it is preempted
<<<<<<<<<

Thread 1: try_atomic_semop() returns, notices that it must sleep.
Thread 1: increases sma->complex_count.
Thread 1: drops sem_perm.lock

Thread 2:
			/*
			 * Another process is holding the global lock on the
			 * sem_array; we cannot enter our critical section,
			 * but have to wait for the global lock to be released.
			 */
			if (unlikely(spin_is_locked(&sma->sem_perm.lock))) {
				spin_unlock(&sem->lock);
				spin_unlock_wait(&sma->sem_perm.lock);
				goto again;
			}
	<<< sem_perm.lock already dropped, thus no "goto again;"

			locknum = sops->sem_num;

Signed-off-by: Manfred Spraul
Cc: Mike Galbraith
Cc: Rik van Riel
Cc: Davidlohr Bueso
Cc: [3.10+]
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 ipc/sem.c | 122 +++++++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 78 insertions(+), 44 deletions(-)

diff --git a/ipc/sem.c b/ipc/sem.c
index 19c8b980d1fe..4a92c0447ad6 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -252,71 +252,105 @@ static void sem_rcu_free(struct rcu_head *head)
 	ipc_rcu_free(head);
 }
 
+/*
+ * Wait until all currently ongoing simple ops have completed.
+ * Caller must own sem_perm.lock.
+ * New simple ops cannot start, because simple ops first check
+ * that sem_perm.lock is free.
+ */
+static void sem_wait_array(struct sem_array *sma)
+{
+	int i;
+	struct sem *sem;
+
+	for (i = 0; i < sma->sem_nsems; i++) {
+		sem = sma->sem_base + i;
+		spin_unlock_wait(&sem->lock);
+	}
+}
+
 /*
  * If the request contains only one semaphore operation, and there are
  * no complex transactions pending, lock only the semaphore involved.
  * Otherwise, lock the entire semaphore array, since we either have
  * multiple semaphores in our own semops, or we need to look at
  * semaphores from other pending complex operations.
- *
- * Carefully guard against sma->complex_count changing between zero
- * and non-zero while we are spinning for the lock. The value of
- * sma->complex_count cannot change while we are holding the lock,
- * so sem_unlock should be fine.
- *
- * The global lock path checks that all the local locks have been released,
- * checking each local lock once. This means that the local lock paths
- * cannot start their critical sections while the global lock is held.
  */
 static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
 			      int nsops)
 {
-	int locknum;
- again:
-	if (nsops == 1 && !sma->complex_count) {
-		struct sem *sem = sma->sem_base + sops->sem_num;
+	struct sem *sem;
 
-		/* Lock just the semaphore we are interested in. */
-		spin_lock(&sem->lock);
+	if (nsops != 1) {
+		/* Complex operation - acquire a full lock */
+		ipc_lock_object(&sma->sem_perm);
 
-		/*
-		 * If sma->complex_count was set while we were spinning,
-		 * we may need to look at things we did not lock here.
+		/* And wait until all simple ops that are processed
+		 * right now have dropped their locks.
 		 */
-		if (unlikely(sma->complex_count)) {
-			spin_unlock(&sem->lock);
-			goto lock_array;
-		}
+		sem_wait_array(sma);
+		return -1;
+	}
+
+	/*
+	 * Only one semaphore affected - try to optimize locking.
+	 * The rules are:
+	 * - optimized locking is possible if no complex operation
+	 *   is either enqueued or processed right now.
+	 * - The test for enqueued complex ops is simple:
+	 *      sma->complex_count != 0
+	 * - Testing for complex ops that are processed right now is
+	 *   a bit more difficult. Complex ops acquire the full lock
+	 *   and first wait that the running simple ops have completed.
+	 *   (see above)
+	 *   Thus: If we own a simple lock and the global lock is free
+	 *	and complex_count is now 0, then it will stay 0 and
+	 *	thus just locking sem->lock is sufficient.
+	 */
+	sem = sma->sem_base + sops->sem_num;
 
+	if (sma->complex_count == 0) {
 		/*
-		 * Another process is holding the global lock on the
-		 * sem_array; we cannot enter our critical section,
-		 * but have to wait for the global lock to be released.
+		 * It appears that no complex operation is around.
+		 * Acquire the per-semaphore lock.
 		 */
-		if (unlikely(spin_is_locked(&sma->sem_perm.lock))) {
-			spin_unlock(&sem->lock);
-			spin_unlock_wait(&sma->sem_perm.lock);
-			goto again;
+		spin_lock(&sem->lock);
+
+		/* Then check that the global lock is free */
+		if (!spin_is_locked(&sma->sem_perm.lock)) {
+			/* spin_is_locked() is not a memory barrier */
+			smp_mb();
+
+			/* Now repeat the test of complex_count:
+			 * It can't change anymore until we drop sem->lock.
+			 * Thus: if is now 0, then it will stay 0.
+			 */
+			if (sma->complex_count == 0) {
+				/* fast path successful! */
+				return sops->sem_num;
+			}
 		}
+		spin_unlock(&sem->lock);
+	}
 
-		locknum = sops->sem_num;
+	/* slow path: acquire the full lock */
+	ipc_lock_object(&sma->sem_perm);
+
+	if (sma->complex_count == 0) {
+		/* False alarm:
+		 * There is no complex operation, thus we can switch
+		 * back to the fast path.
+		 */
+		spin_lock(&sem->lock);
+		ipc_unlock_object(&sma->sem_perm);
+		return sops->sem_num;
 	} else {
-		int i;
-		/*
-		 * Lock the semaphore array, and wait for all of the
-		 * individual semaphore locks to go away.  The code
-		 * above ensures no new single-lock holders will enter
-		 * their critical section while the array lock is held.
+		/* Not a false alarm, thus complete the sequence for a
+		 * full lock.
 		 */
-	lock_array:
-		ipc_lock_object(&sma->sem_perm);
-		for (i = 0; i < sma->sem_nsems; i++) {
-			struct sem *sem = sma->sem_base + i;
-			spin_unlock_wait(&sem->lock);
-		}
-		locknum = -1;
+		sem_wait_array(sma);
+		return -1;
 	}
-	return locknum;
 }
 
 static inline void sem_unlock(struct sem_array *sma, int locknum)
-- cgit v1.2.3
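The locking protocol that this rewrite establishes can be modelled in plain userspace C11, which may make the ordering argument easier to follow. The sketch below is only an illustration and not from the kernel tree: every toy_* name and NSEMS are invented here, and sequentially consistent atomics stand in for the kernel's spinlocks, spin_unlock_wait() and smp_mb(). What it mirrors is the fast path's order of checks: take the per-semaphore lock first, verify the global lock is free, and only then re-check complex_count.

/* toy_semlock.c - rough userspace model of the two-level locking scheme.
 * Build: cc -std=c11 toy_semlock.c
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NSEMS 4

struct toy_lock { atomic_int locked; };

static void toy_lock_acquire(struct toy_lock *l)
{
	int expected = 0;

	/* simple test-and-set spinlock */
	while (!atomic_compare_exchange_weak(&l->locked, &expected, 1))
		expected = 0;
}

static void toy_lock_release(struct toy_lock *l)
{
	atomic_store(&l->locked, 0);
}

static bool toy_lock_is_locked(struct toy_lock *l)
{
	return atomic_load(&l->locked) != 0;
}

/* stand-in for spin_unlock_wait(): spin until the lock is seen free */
static void toy_lock_wait_unlocked(struct toy_lock *l)
{
	while (toy_lock_is_locked(l))
		;
}

struct toy_array {
	struct toy_lock global;		/* models sma->sem_perm.lock */
	struct toy_lock sem[NSEMS];	/* models the per-semaphore locks */
	atomic_int complex_count;	/* models sma->complex_count */
};

/* Returns the slot that was locked, or -1 if the whole array was locked. */
static int toy_lock_array(struct toy_array *a, int slot, bool complex)
{
	if (complex) {
		/* complex op: global lock first, then drain the simple ops */
		toy_lock_acquire(&a->global);
		for (int i = 0; i < NSEMS; i++)
			toy_lock_wait_unlocked(&a->sem[i]);
		return -1;
	}

	if (atomic_load(&a->complex_count) == 0) {
		toy_lock_acquire(&a->sem[slot]);
		/*
		 * Order matters, exactly as in the patch: with sem[slot]
		 * held, first make sure the global lock is free, and only
		 * then re-check complex_count.  The seq_cst atomics play
		 * the role of the kernel's smp_mb().
		 */
		if (!toy_lock_is_locked(&a->global) &&
		    atomic_load(&a->complex_count) == 0)
			return slot;			/* fast path */
		toy_lock_release(&a->sem[slot]);
	}

	/* slow path: take the full lock, then downgrade if possible */
	toy_lock_acquire(&a->global);
	if (atomic_load(&a->complex_count) == 0) {
		toy_lock_acquire(&a->sem[slot]);
		toy_lock_release(&a->global);
		return slot;
	}
	for (int i = 0; i < NSEMS; i++)
		toy_lock_wait_unlocked(&a->sem[i]);
	return -1;
}

static void toy_unlock_array(struct toy_array *a, int locknum)
{
	if (locknum == -1)
		toy_lock_release(&a->global);
	else
		toy_lock_release(&a->sem[locknum]);
}

int main(void)
{
	static struct toy_array a;	/* zero-initialized: unlocked, count 0 */
	int locknum = toy_lock_array(&a, 2, false);

	printf("locked %s\n", locknum == -1 ? "whole array" : "a single slot");
	toy_unlock_array(&a, locknum);
	return 0;
}

A real stress test would run many threads mixing simple and complex operations; this sketch only shows the shape of the fast path, the back-off, and the slow-path downgrade.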
From 6d07b68ce16ae9535955ba2059dedba5309c3ca1 Mon Sep 17 00:00:00 2001
From: Manfred Spraul
Date: Mon, 30 Sep 2013 13:45:06 -0700
Subject: ipc/sem.c: optimize sem_lock()

Operations that need access to the whole array must guarantee that there
are no simple operations ongoing.  Right now this is achieved by
spin_unlock_wait(sem->lock) on all semaphores.

If complex_count is nonzero, then this spin_unlock_wait() is not
necessary: it was already performed by the thread that increased
complex_count.  Even though sem_perm.lock was dropped in between, no
simple operation could have started, because simple operations cannot
start while complex_count is nonzero.

Signed-off-by: Manfred Spraul
Cc: Mike Galbraith
Cc: Rik van Riel
Reviewed-by: Davidlohr Bueso
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 ipc/sem.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/ipc/sem.c b/ipc/sem.c
index 4a92c0447ad6..e20658d76bb5 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -257,12 +257,20 @@ static void sem_rcu_free(struct rcu_head *head)
  * Caller must own sem_perm.lock.
  * New simple ops cannot start, because simple ops first check
  * that sem_perm.lock is free.
+ * that a) sem_perm.lock is free and b) complex_count is 0.
  */
 static void sem_wait_array(struct sem_array *sma)
 {
 	int i;
 	struct sem *sem;
 
+	if (sma->complex_count) {
+		/* The thread that increased sma->complex_count waited on
+		 * all sem->lock locks. Thus we don't need to wait again.
+		 */
+		return;
+	}
+
 	for (i = 0; i < sma->sem_nsems; i++) {
 		sem = sma->sem_base + i;
 		spin_unlock_wait(&sem->lock);
-- cgit v1.2.3
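As a rough illustration of why this shortcut is safe, the same idea can be added to the toy model sketched after the first patch above (toy_array, toy_lock_wait_unlocked() and NSEMS are the invented names from that sketch, not kernel APIs):

/* Builds on the earlier toy model; mirrors the shortcut added to
 * sem_wait_array() by this patch.
 */
static void toy_wait_array(struct toy_array *a)
{
	/*
	 * If complex_count is already nonzero, the thread that raised it
	 * did so while holding the global lock and has already drained
	 * the per-semaphore locks.  No simple op can have started since,
	 * because simple ops back off as soon as they see a nonzero
	 * complex_count, so waiting again would be redundant.
	 */
	if (atomic_load(&a->complex_count) != 0)
		return;

	for (int i = 0; i < NSEMS; i++)
		toy_lock_wait_unlocked(&a->sem[i]);
}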
From d8c633766ad88527f25d9f81a5c2f083d78a2b39 Mon Sep 17 00:00:00 2001
From: Manfred Spraul
Date: Mon, 30 Sep 2013 13:45:07 -0700
Subject: ipc/sem.c: synchronize the proc interface

The proc interface is not aware of sem_lock(); it calls
ipc_lock_object() directly.  This means that simple semop() operations
can run in parallel with the proc interface.

Right now this is harmless, because the implementation doesn't do
anything that requires proper synchronization.  But it is fragile and
should therefore be fixed.

Signed-off-by: Manfred Spraul
Cc: Davidlohr Bueso
Cc: Mike Galbraith
Cc: Rik van Riel
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 ipc/sem.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/ipc/sem.c b/ipc/sem.c
index e20658d76bb5..cd6a733011a2 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -2103,6 +2103,14 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
 	struct sem_array *sma = it;
 	time_t sem_otime;
 
+	/*
+	 * The proc interface isn't aware of sem_lock(), it calls
+	 * ipc_lock_object() directly (in sysvipc_find_ipc).
+	 * In order to stay compatible with sem_lock(), we must wait until
+	 * all simple semop() calls have left their critical regions.
+	 */
+	sem_wait_array(sma);
+
 	sem_otime = get_semotime(sma);
 
 	return seq_printf(s,
-- cgit v1.2.3
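For context, the interface being synchronized here is the text file /proc/sysvipc/sem, one line per semaphore array, produced by sysvipc_sem_proc_show(). A minimal reader is ordinary userspace C and needs nothing kernel-specific; the column list in the comment reflects the fields that function prints:

/* read_sysvipc_sem.c - dump the interface this patch synchronizes */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sysvipc/sem", "r");
	char line[256];

	if (!f) {
		perror("/proc/sysvipc/sem");
		return 1;
	}
	/* One line per semaphore array: key, semid, perms, nsems, uid,
	 * gid, cuid, cgid, otime, ctime.
	 */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}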
From 0e8c665699e953fa58dc1b0b0d09e5dce7343cc7 Mon Sep 17 00:00:00 2001
From: Manfred Spraul
Date: Mon, 30 Sep 2013 13:45:25 -0700
Subject: ipc/sem.c: update sem_otime for all operations

In commit 0a2b9d4c7967 ("ipc/sem.c: move wake_up_process out of the
spinlock section"), the update of the semaphore's sem_otime (last semop
time) was moved to one central position (do_smart_update).

But do_smart_update() is only called for operations that modify the
array, so wait-for-zero semops no longer update sem_otime.

The fix is simple: non-alter operations must also update sem_otime.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Manfred Spraul
Reported-by: Jia He
Tested-by: Jia He
Cc: Davidlohr Bueso
Cc: Mike Galbraith
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 ipc/sem.c | 42 +++++++++++++++++++++++++++++-------------
 1 file changed, 29 insertions(+), 13 deletions(-)

diff --git a/ipc/sem.c b/ipc/sem.c
index cd6a733011a2..8c4f59b0204a 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -917,6 +917,24 @@ again:
 	return semop_completed;
 }
 
+/**
+ * set_semotime(sma, sops) - set sem_otime
+ * @sma: semaphore array
+ * @sops: operations that modified the array, may be NULL
+ *
+ * sem_otime is replicated to avoid cache line trashing.
+ * This function sets one instance to the current time.
+ */
+static void set_semotime(struct sem_array *sma, struct sembuf *sops)
+{
+	if (sops == NULL) {
+		sma->sem_base[0].sem_otime = get_seconds();
+	} else {
+		sma->sem_base[sops[0].sem_num].sem_otime =
+						get_seconds();
+	}
+}
+
 /**
  * do_smart_update(sma, sops, nsops, otime, pt) - optimized update_queue
  * @sma: semaphore array
@@ -967,17 +985,10 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop
 			}
 		}
 	}
-	if (otime) {
-		if (sops == NULL) {
-			sma->sem_base[0].sem_otime = get_seconds();
-		} else {
-			sma->sem_base[sops[0].sem_num].sem_otime =
-						get_seconds();
-		}
-	}
+	if (otime)
+		set_semotime(sma, sops);
 }
 
-
 /* The following counts are associated to each semaphore:
  *   semncnt        number of tasks waiting on semval being nonzero
  *   semzcnt        number of tasks waiting on semval being zero
@@ -1839,12 +1850,17 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 
 	error = perform_atomic_semop(sma, sops, nsops, un,
 					task_tgid_vnr(current));
-	if (error <= 0) {
-		if (alter && error == 0)
+	if (error == 0) {
+		/* If the operation was successful, then do
+		 * the required updates.
+		 */
+		if (alter)
 			do_smart_update(sma, sops, nsops, 1, &tasks);
-
-		goto out_unlock_free;
+		else
+			set_semotime(sma, sops);
 	}
+	if (error <= 0)
+		goto out_unlock_free;
 
 	/* We need to sleep on this operation, so we put the current
 	 * task into the pending queue and go to sleep.
-- cgit v1.2.3
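The effect of this fix is visible from userspace: after a successful wait-for-zero semop(), semctl(IPC_STAT) should now report a non-zero sem_otime. A rough test program along those lines is sketched below (the file name and the minimal error handling are ad hoc; note that on Linux the caller has to declare union semun itself):

/* otime_test.c - rough check that a wait-for-zero semop updates sem_otime */
#include <stdio.h>
#include <sys/ipc.h>
#include <sys/sem.h>
#include <sys/types.h>

union semun {			/* must be declared by the caller on Linux */
	int val;
	struct semid_ds *buf;
	unsigned short *array;
};

int main(void)
{
	struct semid_ds ds;
	union semun arg = { .buf = &ds };
	struct sembuf op = { .sem_num = 0, .sem_op = 0, .sem_flg = 0 };
	int id = semget(IPC_PRIVATE, 1, 0600);

	if (id < 0) {
		perror("semget");
		return 1;
	}

	/* Wait-for-zero: succeeds immediately, the new semaphore is 0. */
	if (semop(id, &op, 1) < 0)
		perror("semop");

	if (semctl(id, 0, IPC_STAT, arg) < 0)
		perror("semctl(IPC_STAT)");
	else
		printf("sem_otime = %ld (stayed 0 for wait-for-zero before this fix)\n",
		       (long)ds.sem_otime);

	semctl(id, 0, IPC_RMID);
	return 0;
}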
From 4271b05a227dc6175b66c3d9941aeab09048aeb2 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso
Date: Mon, 30 Sep 2013 13:45:26 -0700
Subject: ipc,msg: prevent race with rmid in msgsnd,msgrcv

This fixes a race in both msgrcv() and msgsnd() between finding the msg
and actually dealing with the queue, as another thread can delete the
msqid underneath us if we are preempted before acquiring the
kern_ipc_perm.lock.

Manfred illustrates this nicely:

Assume a preemptible kernel that is preempted just after

    msq = msq_obtain_object_check(ns, msqid)

in do_msgrcv().  The only lock that is held is rcu_read_lock().

Now the other thread processes IPC_RMID.  When the first task is
resumed, it will happily wait for messages on a deleted queue.

Fix this by checking whether the queue has been deleted after taking the
lock.

Signed-off-by: Davidlohr Bueso
Reported-by: Manfred Spraul
Cc: Rik van Riel
Cc: Mike Galbraith
Cc: [3.11]
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 ipc/msg.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/ipc/msg.c b/ipc/msg.c
index 9e4310c546ae..558aa91186b6 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -695,6 +695,12 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
 		if (ipcperms(ns, &msq->q_perm, S_IWUGO))
 			goto out_unlock0;
 
+		/* raced with RMID? */
+		if (msq->q_perm.deleted) {
+			err = -EIDRM;
+			goto out_unlock0;
+		}
+
 		err = security_msg_queue_msgsnd(msq, msg, msgflg);
 		if (err)
 			goto out_unlock0;
@@ -901,6 +907,13 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl
 			goto out_unlock1;
 
 		ipc_lock_object(&msq->q_perm);
+
+		/* raced with RMID? */
+		if (msq->q_perm.deleted) {
+			msg = ERR_PTR(-EIDRM);
+			goto out_unlock0;
+		}
+
 		msg = find_msg(msq, &msgtyp, mode);
 		if (!IS_ERR(msg)) {
 			/*
-- cgit v1.2.3
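The user-visible behaviour that the new q_perm.deleted checks protect can be exercised with a small two-process demo: a receiver blocked in msgrcv() must come back with EIDRM once the queue is removed, rather than keep sleeping on a deleted queue. The sketch below is illustrative only; the sleep() is a crude way of making the child block first:

/* msg_rmid_race.c - sketch of the msgrcv() vs. IPC_RMID interaction */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

struct toy_msg {
	long mtype;
	char mtext[64];
};

int main(void)
{
	int id = msgget(IPC_PRIVATE, 0600);
	pid_t pid;

	if (id < 0) {
		perror("msgget");
		return 1;
	}

	pid = fork();
	if (pid == 0) {
		struct toy_msg m;

		/* Blocks: the queue is empty. */
		if (msgrcv(id, &m, sizeof(m.mtext), 0, 0) < 0)
			printf("msgrcv: %s (expect EIDRM)\n", strerror(errno));
		_exit(0);
	}

	sleep(1);			/* crude: let the child block first */
	if (msgctl(id, IPC_RMID, NULL) < 0)
		perror("msgctl(IPC_RMID)");
	wait(NULL);
	return 0;
}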