From: "Hu, Boris" <boris.hu@intel.com>

Split futex global spinlock futex_lock into hash bucket spinlocks.
Add a check against recursively locking the same bucket spinlock (fix by
Jakub Jelinek).

Ulrich has regression tested this, and measured a 6% speedup on a 4-way
machine with a futex-intensive benchmark.



 kernel/futex.c |  117 ++++++++++++++++++++++++++++++++++-----------------------
 1 files changed, 70 insertions(+), 47 deletions(-)

diff -puN kernel/futex.c~futex_lock-splitup kernel/futex.c
--- 25/kernel/futex.c~futex_lock-splitup	2003-09-16 22:25:51.000000000 -0700
+++ 25-akpm/kernel/futex.c	2003-09-16 22:25:51.000000000 -0700
@@ -33,7 +33,7 @@
 #include <linux/poll.h>
 #include <linux/fs.h>
 #include <linux/file.h>
-#include <linux/hash.h>
+#include <linux/jhash.h>
 #include <linux/init.h>
 #include <linux/futex.h>
 #include <linux/mount.h>
@@ -44,6 +44,7 @@
 /*
  * Futexes are matched on equal values of this key.
  * The key type depends on whether it's a shared or private mapping.
+ * Don't rearrange members without looking at hash_futex().
  */
 union futex_key {
 	struct {
@@ -79,9 +80,15 @@ struct futex_q {
 	struct file *filp;
 };
 
-/* The key for the hash is the address + index + offset within page */
-static struct list_head futex_queues[1<<FUTEX_HASHBITS];
-static spinlock_t futex_lock = SPIN_LOCK_UNLOCKED;
+/*
+ * Split the global futex_lock into every hash list lock.
+ */
+struct futex_hash_bucket {
+       spinlock_t              lock;
+       struct list_head       chain;
+};
+
+static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
 
 /* Futex-fs vfsmount entry: */
 static struct vfsmount *futex_mnt;
@@ -89,11 +96,12 @@ static struct vfsmount *futex_mnt;
 /*
  * We hash on the keys returned from get_futex_key (see below).
  */
-static inline struct list_head *hash_futex(union futex_key *key)
+static struct futex_hash_bucket *hash_futex(union futex_key *key)
 {
-	return &futex_queues[hash_long(key->both.word
-				       + (unsigned long) key->both.ptr
-				       + key->both.offset, FUTEX_HASHBITS)];
+	u32 hash = jhash2((u32*)&key->both.word,
+			  (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
+			  key->both.offset);
+	return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
 }
 
 /*
@@ -214,6 +222,7 @@ static int get_futex_key(unsigned long u
 static int futex_wake(unsigned long uaddr, int num)
 {
 	struct list_head *i, *next, *head;
+	struct futex_hash_bucket *bh;
 	union futex_key key;
 	int ret;
 
@@ -223,9 +232,10 @@ static int futex_wake(unsigned long uadd
 	if (unlikely(ret != 0))
 		goto out;
 
-	head = hash_futex(&key);
+	bh = hash_futex(&key);
+	spin_lock(&bh->lock);
+	head = &bh->chain;
 
-	spin_lock(&futex_lock);
 	list_for_each_safe(i, next, head) {
 		struct futex_q *this = list_entry(i, struct futex_q, list);
 
@@ -239,7 +249,7 @@ static int futex_wake(unsigned long uadd
 				break;
 		}
 	}
-	spin_unlock(&futex_lock);
+	spin_unlock(&bh->lock);
 
 out:
 	up_read(&current->mm->mmap_sem);
@@ -254,6 +264,7 @@ static int futex_requeue(unsigned long u
 				int nr_wake, int nr_requeue)
 {
 	struct list_head *i, *next, *head1, *head2;
+	struct futex_hash_bucket *bh1, *bh2;
 	union futex_key key1, key2;
 	int ret;
 
@@ -266,10 +277,19 @@ static int futex_requeue(unsigned long u
 	if (unlikely(ret != 0))
 		goto out;
 
-	head1 = hash_futex(&key1);
-	head2 = hash_futex(&key2);
+	bh1 = hash_futex(&key1);
+	bh2 = hash_futex(&key2);
+	if (bh1 < bh2) {
+		spin_lock(&bh1->lock);
+		spin_lock(&bh2->lock);
+	} else {
+		spin_lock(&bh2->lock);
+		if (bh1 > bh2)
+			spin_lock(&bh1->lock);
+	}
+	head1 = &bh1->chain;
+	head2 = &bh2->chain;
 
-	spin_lock(&futex_lock);
 	list_for_each_safe(i, next, head1) {
 		struct futex_q *this = list_entry(i, struct futex_q, list);
 
@@ -291,8 +311,14 @@ static int futex_requeue(unsigned long u
 			}
 		}
 	}
-	spin_unlock(&futex_lock);
-
+	if (bh1 < bh2) {
+		spin_unlock(&bh2->lock);
+		spin_unlock(&bh1->lock);
+	} else {
+		if (bh1 > bh2)
+			spin_unlock(&bh1->lock);
+		spin_unlock(&bh2->lock);
+	}
 out:
 	up_read(&current->mm->mmap_sem);
 	return ret;
@@ -301,28 +327,30 @@ out:
 static inline void queue_me(struct futex_q *q, union futex_key *key,
 			    int fd, struct file *filp)
 {
-	struct list_head *head = hash_futex(key);
+	struct futex_hash_bucket *bh = hash_futex(key);
+	struct list_head *head = &bh->chain;
 
 	q->key = *key;
 	q->fd = fd;
 	q->filp = filp;
 
-	spin_lock(&futex_lock);
+	spin_lock(&bh->lock);
 	list_add_tail(&q->list, head);
-	spin_unlock(&futex_lock);
+	spin_unlock(&bh->lock);
 }
 
 /* Return 1 if we were still queued (ie. 0 means we were woken) */
 static inline int unqueue_me(struct futex_q *q)
 {
+	struct futex_hash_bucket *bh = hash_futex(&q->key);
 	int ret = 0;
 
-	spin_lock(&futex_lock);
+	spin_lock(&bh->lock);
 	if (!list_empty(&q->list)) {
 		list_del(&q->list);
 		ret = 1;
 	}
-	spin_unlock(&futex_lock);
+	spin_unlock(&bh->lock);
 	return ret;
 }
 
@@ -332,8 +360,8 @@ static int futex_wait(unsigned long uadd
 	int ret, curval;
 	union futex_key key;
 	struct futex_q q;
+	struct futex_hash_bucket *bh = NULL;
 
- try_again:
 	init_waitqueue_head(&q.waiters);
 
 	down_read(&current->mm->mmap_sem);
@@ -367,25 +395,26 @@ static int futex_wait(unsigned long uadd
 	/*
 	 * There might have been scheduling since the queue_me(), as we
 	 * cannot hold a spinlock across the get_user() in case it
-	 * faults.  So we cannot just set TASK_INTERRUPTIBLE state when
+	 * faults, and we cannot just set TASK_INTERRUPTIBLE state when
 	 * queueing ourselves into the futex hash.  This code thus has to
-	 * rely on the futex_wake() code doing a wakeup after removing
-	 * the waiter from the list.
+	 * rely on the futex_wake() code removing us from hash when it
+	 * wakes us up.
 	 */
 	add_wait_queue(&q.waiters, &wait);
-	spin_lock(&futex_lock);
+	bh = hash_futex(&key);
+	spin_lock(&bh->lock);
 	set_current_state(TASK_INTERRUPTIBLE);
 
 	if (unlikely(list_empty(&q.list))) {
 		/*
 		 * We were woken already.
 		 */
-		spin_unlock(&futex_lock);
+		spin_unlock(&bh->lock);
 		set_current_state(TASK_RUNNING);
 		return 0;
 	}
 
-	spin_unlock(&futex_lock);
+	spin_unlock(&bh->lock);
 	time = schedule_timeout(time);
 	set_current_state(TASK_RUNNING);
 
@@ -394,26 +423,17 @@ static int futex_wait(unsigned long uadd
 	 * we are the only user of it.
 	 */
 
-	/*
-	 * Were we woken or interrupted for a valid reason?
-	 */
-	ret = unqueue_me(&q);
-	if (ret == 0)
+	/* If we were woken (and unqueued), we succeeded, whatever. */
+	if (!unqueue_me(&q))
 		return 0;
 	if (time == 0)
 		return -ETIMEDOUT;
-	if (signal_pending(current))
-		return -EINTR;
-
-	/*
-	 * No, it was a spurious wakeup.  Try again.  Should never happen. :)
-	 */
-	goto try_again;
+	/* A spurious wakeup should never happen. */
+	WARN_ON(!signal_pending(current));
+	return -EINTR;
 
  out_unqueue:
-	/*
-	 * Were we unqueued anyway?
-	 */
+	/* If we were woken (and unqueued), we succeeded, whatever. */
 	if (!unqueue_me(&q))
 		ret = 0;
  out_release_sem:
@@ -435,13 +455,14 @@ static unsigned int futex_poll(struct fi
 			       struct poll_table_struct *wait)
 {
 	struct futex_q *q = filp->private_data;
+	struct futex_hash_bucket *bh = hash_futex(&q->key);
 	int ret = 0;
 
 	poll_wait(filp, &q->waiters, wait);
-	spin_lock(&futex_lock);
+	spin_lock(&bh->lock);
 	if (list_empty(&q->list))
 		ret = POLLIN | POLLRDNORM;
-	spin_unlock(&futex_lock);
+	spin_unlock(&bh->lock);
 
 	return ret;
 }
@@ -587,8 +608,10 @@ static int __init init(void)
 	register_filesystem(&futex_fs_type);
 	futex_mnt = kern_mount(&futex_fs_type);
 
-	for (i = 0; i < ARRAY_SIZE(futex_queues); i++)
-		INIT_LIST_HEAD(&futex_queues[i]);
+	for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
+		INIT_LIST_HEAD(&futex_queues[i].chain);
+		futex_queues[i].lock = SPIN_LOCK_UNLOCKED;
+	}
 	return 0;
 }
 __initcall(init);

_