From: Nick Piggin <nickpiggin@yahoo.com.au>

When a process sleeps on contention, a "waiter" object is allocated on its
stack, and put onto a list.

Right before a sleeping process is woken, it gets a flag cleared
which allows it to break out of its sleeping loop:
-------------
        /* wait to be given the lock */
        for (;;) {
                if (!waiter->flags)
                        break;
                schedule();
                WARN_ON(waiter->flags);
                set_task_state(tsk, TASK_UNINTERRUPTIBLE);
        }

        tsk->state = TASK_RUNNING;
-------------

So, a spurious wakeup after the flag is cleared and before the "real"
wakeup can cause the waiter object to go off the stack while still in use. 
Or, if the flag is cleared and the real wake occurs *after* a spurious
wakeup but before schedule(), we lose that wakeup.

So fix this by using the completion API.  Significantly simplifies the
code.  

(Such a spurious wakeup can happen if, say, a process parks itself on a
waitqueue and then executes down_read())


---

 25-akpm/lib/rwsem-spinlock.c |   46 ++++++++++---------------------------------
 25-akpm/lib/rwsem.c          |   22 ++++++--------------
 2 files changed, 18 insertions(+), 50 deletions(-)

diff -puN lib/rwsem.c~de-racify-rwsem-take-2 lib/rwsem.c
--- 25/lib/rwsem.c~de-racify-rwsem-take-2	2004-04-14 19:39:05.574640824 -0700
+++ 25-akpm/lib/rwsem.c	2004-04-14 19:39:10.724857872 -0700
@@ -7,10 +7,12 @@
 #include <linux/sched.h>
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/completion.h>
 
 struct rwsem_waiter {
 	struct list_head list;
 	struct task_struct *task;
+	struct completion granted;
 	unsigned int flags;
 #define RWSEM_WAITING_FOR_READ	0x00000001
 #define RWSEM_WAITING_FOR_WRITE	0x00000002
@@ -72,8 +74,7 @@ try_again:
 		goto readers_only;
 
 	list_del(&waiter->list);
-	waiter->flags = 0;
-	wake_up_process(waiter->task);
+	complete(&waiter->granted);
 	goto out;
 
 	/* don't want to wake any writers */
@@ -109,8 +110,7 @@ readers_only:
 	for (; loop > 0; loop--) {
 		waiter = list_entry(next, struct rwsem_waiter, list);
 		next = waiter->list.next;
-		waiter->flags = 0;
-		wake_up_process(waiter->task);
+		complete(&waiter->granted);
 	}
 
 	sem->wait_list.next = next;
@@ -130,18 +130,17 @@ undo:
 /*
  * wait for a lock to be granted
  */
-static inline struct rw_semaphore *
+static struct rw_semaphore *
 rwsem_down_failed_common(struct rw_semaphore *sem,
 			struct rwsem_waiter *waiter, signed long adjustment)
 {
 	struct task_struct *tsk = current;
 	signed long count;
 
-	set_task_state(tsk, TASK_UNINTERRUPTIBLE);
-
 	/* set up my own style of waitqueue */
 	spin_lock(&sem->wait_lock);
 	waiter->task = tsk;
+	init_completion(&waiter->granted);
 
 	list_add_tail(&waiter->list, &sem->wait_list);
 
@@ -159,14 +158,7 @@ rwsem_down_failed_common(struct rw_semap
 	spin_unlock(&sem->wait_lock);
 
 	/* wait to be given the lock */
-	for (;;) {
-		if (!waiter->flags)
-			break;
-		schedule();
-		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
-	}
-
-	tsk->state = TASK_RUNNING;
+	wait_for_completion(&waiter->granted);
 
 	return sem;
 }
diff -puN lib/rwsem-spinlock.c~de-racify-rwsem-take-2 lib/rwsem-spinlock.c
--- 25/lib/rwsem-spinlock.c~de-racify-rwsem-take-2	2004-04-14 19:39:05.575640672 -0700
+++ 25-akpm/lib/rwsem-spinlock.c	2004-04-14 19:39:05.581639760 -0700
@@ -8,10 +8,12 @@
 #include <linux/rwsem.h>
 #include <linux/sched.h>
 #include <linux/module.h>
+#include <linux/completion.h>
 
 struct rwsem_waiter {
 	struct list_head list;
 	struct task_struct *task;
+	struct completion granted;
 	unsigned int flags;
 #define RWSEM_WAITING_FOR_READ	0x00000001
 #define RWSEM_WAITING_FOR_WRITE	0x00000002
@@ -73,8 +75,7 @@ __rwsem_do_wake(struct rw_semaphore *sem
 	if (waiter->flags & RWSEM_WAITING_FOR_WRITE) {
 		sem->activity = -1;
 		list_del(&waiter->list);
-		waiter->flags = 0;
-		wake_up_process(waiter->task);
+		complete(&waiter->granted);
 		goto out;
 	}
 
@@ -86,8 +87,7 @@ dont_wake_writers:
 		struct list_head *next = waiter->list.next;
 
 		list_del(&waiter->list);
-		waiter->flags = 0;
-		wake_up_process(waiter->task);
+		complete(&waiter->granted);
 		woken++;
 		if (list_empty(&sem->wait_list))
 			break;
@@ -114,8 +114,7 @@ __rwsem_wake_one_writer(struct rw_semaph
 	waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
 	list_del(&waiter->list);
 
-	waiter->flags = 0;
-	wake_up_process(waiter->task);
+	complete(&waiter->granted);
 	return sem;
 }
 
@@ -125,7 +124,6 @@ __rwsem_wake_one_writer(struct rw_semaph
 void fastcall __down_read(struct rw_semaphore *sem)
 {
 	struct rwsem_waiter waiter;
-	struct task_struct *tsk;
 
 	rwsemtrace(sem, "Entering __down_read");
 
@@ -138,11 +136,9 @@ void fastcall __down_read(struct rw_sema
 		goto out;
 	}
 
-	tsk = current;
-	set_task_state(tsk, TASK_UNINTERRUPTIBLE);
-
 	/* set up my own style of waitqueue */
-	waiter.task = tsk;
+	init_completion(&waiter.granted);
+	waiter.task = current;
 	waiter.flags = RWSEM_WAITING_FOR_READ;
 
 	list_add_tail(&waiter.list, &sem->wait_list);
@@ -150,15 +146,7 @@ void fastcall __down_read(struct rw_sema
 	/* we don't need to touch the semaphore struct anymore */
 	spin_unlock(&sem->wait_lock);
 
-	/* wait to be given the lock */
-	for (;;) {
-		if (!waiter.flags)
-			break;
-		schedule();
-		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
-	}
-
-	tsk->state = TASK_RUNNING;
+	wait_for_completion(&waiter.granted);
 
 out:
 	rwsemtrace(sem, "Leaving __down_read");
@@ -194,7 +182,6 @@ int fastcall __down_read_trylock(struct 
 void fastcall __down_write(struct rw_semaphore *sem)
 {
 	struct rwsem_waiter waiter;
-	struct task_struct *tsk;
 
 	rwsemtrace(sem, "Entering __down_write");
 
@@ -207,11 +194,9 @@ void fastcall __down_write(struct rw_sem
 		goto out;
 	}
 
-	tsk = current;
-	set_task_state(tsk, TASK_UNINTERRUPTIBLE);
-
 	/* set up my own style of waitqueue */
-	waiter.task = tsk;
+	init_completion(&waiter.granted);
+	waiter.task = current;
 	waiter.flags = RWSEM_WAITING_FOR_WRITE;
 
 	list_add_tail(&waiter.list, &sem->wait_list);
@@ -219,16 +204,7 @@ void fastcall __down_write(struct rw_sem
 	/* we don't need to touch the semaphore struct anymore */
 	spin_unlock(&sem->wait_lock);
 
-	/* wait to be given the lock */
-	for (;;) {
-		if (!waiter.flags)
-			break;
-		schedule();
-		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
-	}
-
-	tsk->state = TASK_RUNNING;
-
+	wait_for_completion(&waiter.granted);
 out:
 	rwsemtrace(sem, "Leaving __down_write");
 }

_