From: Stephen Tweedie <sct@redhat.com>

There is a race in the existing reservations code: while we are allocating
from a reservation, another thread may be discarding the fd's existing
reservation as part of a separate allocation.

To avoid locking the reservation during the bitmap search for a free block, we
just take a temporary copy of the existing reservation during that search.  To
minimise the cost, we use a seqlock to protect the per-filp reservation, using
a seqlock read when taking the temporary copy and locking for write only when
taking out a new reservation.

Fixes http://bugme.osdl.org/show_bug.cgi?id=3171

Signed-off-by: Stephen Tweedie <sct@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 25-akpm/fs/ext3/balloc.c          |   35 ++++++++++++++++++++++++-----------
 25-akpm/fs/ext3/ialloc.c          |    3 ++-
 25-akpm/fs/ext3/inode.c           |    1 +
 25-akpm/fs/ext3/super.c           |    2 +-
 25-akpm/include/linux/ext3_fs_i.h |    4 +++-
 5 files changed, 31 insertions(+), 14 deletions(-)

diff -puN fs/ext3/balloc.c~ext3-reservations-smp-protect-the-reservation-during-allocation fs/ext3/balloc.c
--- 25/fs/ext3/balloc.c~ext3-reservations-smp-protect-the-reservation-during-allocation	2004-09-26 17:18:05.770781160 -0700
+++ 25-akpm/fs/ext3/balloc.c	2004-09-26 17:18:05.782779336 -0700
@@ -250,7 +250,7 @@ static void rsv_window_remove(struct sup
 {
 	rsv->rsv_start = 0;
 	rsv->rsv_end = 0;
-	rsv->rsv_alloc_hit = 0;
+	atomic_set(&rsv->rsv_alloc_hit, 0);
 	rb_erase(&rsv->rsv_node, &EXT3_SB(sb)->s_rsv_window_root);
 }
 
@@ -856,7 +856,8 @@ static int alloc_new_reservation(struct 
 		if (my_rsv->rsv_end + 1 > start_block)
 			start_block = my_rsv->rsv_end + 1;
 		search_head = my_rsv;
-		if ((my_rsv->rsv_alloc_hit > (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
+		if ((atomic_read(&my_rsv->rsv_alloc_hit) >
+		     (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
 			/*
 			 * if we previously allocation hit ration is greater than half
 			 * we double the size of reservation window next time
@@ -1035,27 +1036,39 @@ ext3_try_to_allocate_with_rsv(struct sup
 	 * then we could go to allocate from the reservation window directly.
 	 */
 	while (1) {
-		if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
-			!goal_in_my_reservation(&my_rsv->rsv_window,
-						goal, group, sb)) {
+		struct reserve_window rsv_copy;
+		unsigned int seq;
+
+		do {
+			seq = read_seqbegin(&my_rsv->rsv_seqlock);
+			rsv_copy._rsv_start = my_rsv->rsv_start;
+			rsv_copy._rsv_end = my_rsv->rsv_end;
+		} while (read_seqretry(&my_rsv->rsv_seqlock, seq));
+
+		if (rsv_is_empty(&rsv_copy) || (ret < 0) ||
+			!goal_in_my_reservation(&rsv_copy, goal, group, sb)) {
 			spin_lock(rsv_lock);
+			write_seqlock(&my_rsv->rsv_seqlock);
 			ret = alloc_new_reservation(my_rsv, goal, sb,
 							group, bitmap_bh);
+			rsv_copy._rsv_start = my_rsv->rsv_start;
+			rsv_copy._rsv_end = my_rsv->rsv_end;
+			write_sequnlock(&my_rsv->rsv_seqlock);
 			spin_unlock(rsv_lock);
 			if (ret < 0)
 				break;			/* failed */
 
-			if (!goal_in_my_reservation(&my_rsv->rsv_window,
-						    goal, group, sb))
+			if (!goal_in_my_reservation(&rsv_copy, goal, group, sb))
 				goal = -1;
 		}
-		if ((my_rsv->rsv_start >= group_first_block + EXT3_BLOCKS_PER_GROUP(sb))
-			|| (my_rsv->rsv_end < group_first_block))
+		if ((rsv_copy._rsv_start >= group_first_block + EXT3_BLOCKS_PER_GROUP(sb))
+		    || (rsv_copy._rsv_end < group_first_block))
 			BUG();
 		ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal,
-					   &my_rsv->rsv_window);
+					   &rsv_copy);
 		if (ret >= 0) {
-			my_rsv->rsv_alloc_hit++;
+			if (!read_seqretry(&my_rsv->rsv_seqlock, seq))
+				atomic_inc(&my_rsv->rsv_alloc_hit);
 			break;				/* succeed */
 		}
 	}
diff -puN fs/ext3/ialloc.c~ext3-reservations-smp-protect-the-reservation-during-allocation fs/ext3/ialloc.c
--- 25/fs/ext3/ialloc.c~ext3-reservations-smp-protect-the-reservation-during-allocation	2004-09-26 17:18:05.771781008 -0700
+++ 25-akpm/fs/ext3/ialloc.c	2004-09-26 17:18:05.782779336 -0700
@@ -585,7 +585,8 @@ got:
 	ei->i_rsv_window.rsv_start = 0;
 	ei->i_rsv_window.rsv_end = 0;
 	atomic_set(&ei->i_rsv_window.rsv_goal_size, EXT3_DEFAULT_RESERVE_BLOCKS);
-	ei->i_rsv_window.rsv_alloc_hit = 0;
+	atomic_set(&ei->i_rsv_window.rsv_alloc_hit, 0);
+	seqlock_init(&ei->i_rsv_window.rsv_seqlock);
 	ei->i_block_group = group;
 
 	ext3_set_inode_flags(inode);
diff -puN fs/ext3/inode.c~ext3-reservations-smp-protect-the-reservation-during-allocation fs/ext3/inode.c
--- 25/fs/ext3/inode.c~ext3-reservations-smp-protect-the-reservation-during-allocation	2004-09-26 17:18:05.773780704 -0700
+++ 25-akpm/fs/ext3/inode.c	2004-09-26 17:18:05.785778880 -0700
@@ -2461,6 +2461,7 @@ void ext3_read_inode(struct inode * inod
 	ei->i_rsv_window.rsv_start = 0;
 	ei->i_rsv_window.rsv_end= 0;
 	atomic_set(&ei->i_rsv_window.rsv_goal_size, EXT3_DEFAULT_RESERVE_BLOCKS);
+	seqlock_init(&ei->i_rsv_window.rsv_seqlock);
 	/*
 	 * NOTE! The in-memory inode i_data array is in little-endian order
 	 * even on big-endian machines: we do NOT byteswap the block numbers!
diff -puN fs/ext3/super.c~ext3-reservations-smp-protect-the-reservation-during-allocation fs/ext3/super.c
--- 25/fs/ext3/super.c~ext3-reservations-smp-protect-the-reservation-during-allocation	2004-09-26 17:18:05.776780248 -0700
+++ 25-akpm/fs/ext3/super.c	2004-09-26 17:18:05.787778576 -0700
@@ -1486,7 +1486,7 @@ static int ext3_fill_super (struct super
 	 * _much_ simpler. */
 	sbi->s_rsv_window_head.rsv_start = 0;
 	sbi->s_rsv_window_head.rsv_end = 0;
-	sbi->s_rsv_window_head.rsv_alloc_hit = 0;
+	atomic_set(&sbi->s_rsv_window_head.rsv_alloc_hit, 0);
 	atomic_set(&sbi->s_rsv_window_head.rsv_goal_size, 0);
 	rsv_window_add(sb, &sbi->s_rsv_window_head);
 
diff -puN include/linux/ext3_fs_i.h~ext3-reservations-smp-protect-the-reservation-during-allocation include/linux/ext3_fs_i.h
--- 25/include/linux/ext3_fs_i.h~ext3-reservations-smp-protect-the-reservation-during-allocation	2004-09-26 17:18:05.777780096 -0700
+++ 25-akpm/include/linux/ext3_fs_i.h	2004-09-26 17:18:05.788778424 -0700
@@ -18,6 +18,7 @@
 
 #include <linux/rwsem.h>
 #include <linux/rbtree.h>
+#include <linux/seqlock.h>
 
 struct reserve_window {
 	__u32			_rsv_start;	/* First byte reserved */
@@ -27,7 +28,8 @@ struct reserve_window {
 struct reserve_window_node {
 	struct rb_node	 	rsv_node;
 	atomic_t		rsv_goal_size;
-	__u32			rsv_alloc_hit;
+	atomic_t		rsv_alloc_hit;
+	seqlock_t		rsv_seqlock;
 	struct reserve_window	rsv_window;
 };
 
_