From: Ben Slusky <sluskyb@paranoiacs.org>

Modify loop_copy_bio so that a failed page allocation no longer throws away
the pages it has already obtained; this fixes the deadlock under high I/O.
The new loop_recycle_buffer subroutine lets the loop driver complete a write
request in several passes when it gets fewer pages than it wanted.


 drivers/block/loop.c |  137 +++++++++++++++++++++++++++++++++++++++++----------
 1 files changed, 112 insertions(+), 25 deletions(-)

diff -puN drivers/block/loop.c~loop-recycle drivers/block/loop.c
--- 25/drivers/block/loop.c~loop-recycle	2003-12-29 22:33:22.000000000 -0800
+++ 25-akpm/drivers/block/loop.c	2003-12-29 22:33:22.000000000 -0800
@@ -426,7 +426,7 @@ static int loop_end_io_transfer(struct b
 	if (bio->bi_size)
 		return 1;
 
-	if (err || bio_rw(bio) == WRITE) {
+	if (err || (bio_rw(bio) == WRITE && bio->bi_vcnt == rbh->bi_vcnt)) {
 		bio_endio(rbh, rbh->bi_size, err);
 		if (atomic_dec_and_test(&lo->lo_pending))
 			up(&lo->lo_bh_mutex);
@@ -444,11 +444,11 @@ static struct bio *loop_copy_bio(struct 
 	int i;
 
 	if (bio_rw(rbh) != WRITE) {
-		bio = bio_clone(rbh, __GFP_NOWARN);
+		bio = bio_clone(rbh, GFP_NOIO);
 		return bio;
 	}
 
-	bio = bio_alloc(__GFP_NOWARN, rbh->bi_vcnt);
+	bio = bio_alloc(GFP_NOIO, rbh->bi_vcnt);
 	if (!bio)
 		return NULL;
 
@@ -458,27 +458,26 @@ static struct bio *loop_copy_bio(struct 
 	bio_for_each_segment(bv, rbh, i) {
 		struct bio_vec *bbv = &bio->bi_io_vec[i];
 
-		bbv->bv_page = alloc_page(__GFP_NOWARN|__GFP_HIGHMEM);
+		/* We need one page; the rest we can live without */
+		bbv->bv_page = alloc_page((bio->bi_vcnt ? __GFP_NOWARN : GFP_NOIO) | __GFP_HIGHMEM);
 		if (bbv->bv_page == NULL)
-			goto oom;
+			break;
 
-		bbv->bv_len = bv->bv_len;
 		bbv->bv_offset = bv->bv_offset;
+		bio->bi_size += (bbv->bv_len = bv->bv_len);
+		bio->bi_vcnt++;
+	}
+
+	/* Can't get anything done if we didn't get any pages */
+	if (unlikely(!bio->bi_vcnt)) {
+		bio_put(bio);
+		return NULL;
 	}
 
-	bio->bi_idx = rbh->bi_idx;
-	bio->bi_vcnt = rbh->bi_vcnt;
-	bio->bi_size = rbh->bi_size;
+	bio->bi_vcnt += (bio->bi_idx = rbh->bi_idx);
 	bio->bi_rw = rbh->bi_rw;
 
 	return bio;
-
-oom:
-	while (--i >= 0)
-		__free_page(bio->bi_io_vec[i].bv_page);
-
-	bio_put(bio);
-	return NULL;
 }
 
 static struct bio *loop_get_buffer(struct loop_device *lo, struct bio *rbh)
@@ -499,8 +498,10 @@ static struct bio *loop_get_buffer(struc
 		if (flags & PF_MEMALLOC)
 			current->flags |= PF_MEMALLOC;
 
-		if (bio == NULL)
+		if (unlikely(bio == NULL)) {
+			printk("loop: alloc failed\n");
 			blk_congestion_wait(WRITE, HZ/10);
+		}
 	} while (bio == NULL);
 
 	bio->bi_end_io = loop_end_io_transfer;
@@ -511,6 +512,71 @@ static struct bio *loop_get_buffer(struc
 	return bio;
 }
 
+static void loop_recycle_buffer(struct loop_device *lo, struct bio *bio)
+{
+	struct bio *rbh = bio->bi_private;
+	struct bio_vec *bv, *bbv, *rbv;
+	int i, flags, nvecs = bio->bi_vcnt - bio->bi_idx;
+
+	/*
+	 * Reuse a completed write bio (and its pages) for the next part of rbh.
+	 * Comments in ll_rw_blk.c reserve for generic_make_request the right to
+	 * "change bi_dev and bi_sector for remaps as it sees fit", so reset
+	 * bi_bdev and recompute the starting sector for the next write.
+	 */
+	bio->bi_bdev = lo->lo_device;
+	bio->bi_sector = rbh->bi_sector + (lo->lo_offset >> 9);
+	/* Clean up the flags too: keep BIO_UPTODATE and ~(BIO_POOL_MASK - 1) */
+	bio->bi_flags &= (~(BIO_POOL_MASK - 1) | (1 << BIO_UPTODATE));
+
+	/*
+	 * Move each page nvecs slots ahead, sized for rbh's segment there.
+	 */
+	__bio_for_each_segment(bv, bio, i, rbh->bi_idx) {
+		rbv = &rbh->bi_io_vec[i];
+		bbv = bv + nvecs;
+
+		/* Advance start sector past each segment already written */
+		bio->bi_sector += rbv->bv_len >> 9;
+		if (i < bio->bi_idx)
+			continue;
+
+		if (i + nvecs < rbh->bi_vcnt) {
+			bbv->bv_page = bv->bv_page;
+			bbv->bv_offset = rbv[nvecs].bv_offset;
+			bio->bi_size += (bbv->bv_len = rbv[nvecs].bv_len);
+		} else
+			__free_page(bv->bv_page);
+		memset(bv, 0, sizeof(*bv));
+	}
+
+	bio->bi_idx = bio->bi_vcnt;
+	bio->bi_vcnt += nvecs;
+	bio->bi_vcnt = min(bio->bi_vcnt, rbh->bi_vcnt);
+
+	/*
+	 * If we need more pages, try to get some.
+	 * Clear PF_MEMALLOC to avoid consuming all available memory.
+	 */
+	flags = current->flags;
+	current->flags &= ~PF_MEMALLOC;
+
+	__bio_for_each_segment(rbv, rbh, i, bio->bi_vcnt) {
+		bv = &bio->bi_io_vec[i];
+
+		bv->bv_page = alloc_page(__GFP_NOWARN|__GFP_HIGHMEM);
+		if (bv->bv_page == NULL)
+			break;
+
+		bv->bv_offset = rbv->bv_offset;
+		bio->bi_size += (bv->bv_len = rbv->bv_len);
+		bio->bi_vcnt++;
+	}
+
+	if (flags & PF_MEMALLOC)
+		current->flags |= PF_MEMALLOC;
+}
+
 static int loop_transfer_bio(struct loop_device *lo,
 			     struct bio *to_bio, struct bio *from_bio)
 {
@@ -520,12 +586,14 @@ static int loop_transfer_bio(struct loop
 
 	IV = from_bio->bi_sector + (lo->lo_offset >> 9);
 
-	bio_for_each_segment(from_bvec, from_bio, i) {
-		to_bvec = &to_bio->bi_io_vec[i];
-		ret |= lo_do_transfer(lo, bio_data_dir(to_bio),
+	__bio_for_each_segment(to_bvec, to_bio, i, from_bio->bi_idx) {
+		from_bvec = &from_bio->bi_io_vec[i];
+		if (i >= to_bio->bi_idx) {
+			ret |= lo_do_transfer(lo, bio_data_dir(to_bio),
 				      to_bvec->bv_page, to_bvec->bv_offset,
 				      from_bvec->bv_page, from_bvec->bv_offset,
 				      from_bvec->bv_len, IV);
+		}
 		IV += from_bvec->bv_len >> 9;
 	}
 
@@ -592,16 +660,30 @@ inactive:
 static inline void loop_handle_bio(struct loop_device *lo, struct bio *bio)
 {
 	int ret;
+	struct bio *rbh;
 
-	/*
-	 * For block backed loop, we know this is a READ
-	 */
 	if (lo->lo_flags & LO_FLAGS_DO_BMAP) {
 		ret = do_bio_filebacked(lo, bio);
 		bio_endio(bio, bio->bi_size, ret);
-	} else {
-		struct bio *rbh = bio->bi_private;
+	} else if (bio_rw(bio) == WRITE) {
+		/*
+		 * Write complete, but more pages remain;
+		 * encrypt and write some more pages
+		 */
+		loop_recycle_buffer(lo, bio);
+
+		rbh = bio->bi_private;
+		if ((ret = loop_transfer_bio(lo, bio, rbh))) {
+			bio_endio(bio, bio->bi_size, ret);
+			return;
+		}
 
+		generic_make_request(bio);
+	} else {
+		/*
+		 * Read complete; do decryption now
+		 */
+		rbh = bio->bi_private;
 		ret = loop_transfer_bio(lo, bio, rbh);
 
 		bio_endio(rbh, rbh->bi_size, ret);
@@ -619,6 +701,7 @@ static int loop_thread(void *data)
 {
 	struct loop_device *lo = data;
 	struct bio *bio;
+	int x;
 
 	daemonize("loop%d", lo->lo_number);
 
@@ -653,7 +736,11 @@ static int loop_thread(void *data)
 			printk("loop: missing bio\n");
 			continue;
 		}
+
+		x = (lo->lo_flags & LO_FLAGS_DO_BMAP) || bio_rw(bio) != WRITE;
 		loop_handle_bio(lo, bio);
+		if (!x)
+			continue;
 
 		/*
 		 * upped both for pending work and tear-down, lo_pending

_