From: Nick Piggin <nickpiggin@yahoo.com.au>

Introduce the notion of cooperating processes (those that submit requests
close to one another), and use these statistics to make better choices about
whether or not to do anticipatory waiting.

Help and analysis from Seetharami Seelam <seelam@cs.utep.edu>

Performance testing from Seelam:

I set up my system and executed a couple of tests that I used for OLS.  I
tested three configurations: plain AS, the cooperative process patch as merged
in the -mm tree (called "Nick" below), and the cooperative patch with
modifications to as_update_iohist (called "Seelam" below).

I used a dual-processor (2.28GHz Pentium 4 Xeon) system, with 1 GB main memory
and 1 MB L2 cache, running Linux 2.6.9.  Only a single processor was used for
the experiments.  I used a 7.2K RPM Maxtor 10GB drive configured with an ext2
file system.

Experiment 1 (ex1) consists of reading one Linux source tree using

  find . -type f -exec cat '{}' ';' > /dev/null.

Experiment 2 (ex2) consists of reading two disjoint Linux source trees  
using

  find . -type f -exec cat '{}' ';' > /dev/null.

Experiment 3 (ex3) consists of a streaming read of a 2GB file in the background
and one instance of the chunk reads from Experiment 1.

Timings for reading the Linux source are shown below:

             AS                     Nick          Seelam
ex1:      0m25.813s               0m27.859s      0m27.640s
ex2:      1m11.468s               1m13.918s      1m5.869s
ex3:      81m44.352s             10m38.572s      6m47.994s

The difference between the Nick and Seelam numbers in Experiment 3 must be due
to the code in as_update_iohist.  (akpm: that's not part of this patch.  So
this patch is "Nick").

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 drivers/block/as-iosched.c |  269 +++++++++++++++++++++++----------------------
 1 files changed, 141 insertions(+), 128 deletions(-)

diff -puN drivers/block/as-iosched.c~as-cooperating-processes drivers/block/as-iosched.c
--- devel/drivers/block/as-iosched.c~as-cooperating-processes	2005-09-10 23:59:24.000000000 -0700
+++ devel-akpm/drivers/block/as-iosched.c	2005-09-10 23:59:24.000000000 -0700
@@ -4,7 +4,7 @@
  *  Anticipatory & deadline i/o scheduler.
  *
  *  Copyright (C) 2002 Jens Axboe <axboe@suse.de>
- *                     Nick Piggin <piggin@cyberone.com.au>
+ *                     Nick Piggin <nickpiggin@yahoo.com.au>
  *
  */
 #include <linux/kernel.h>
@@ -103,6 +103,9 @@ struct as_data {
 
 	unsigned long exit_prob;	/* probability a task will exit while
 					   being waited on */
+	unsigned long exit_no_coop;	/* probablility an exitted task will
+					   not be part of a later cooperating
+					   request */
 	unsigned long new_ttime_total; 	/* mean thinktime on new proc */
 	unsigned long new_ttime_mean;
 	u64 new_seek_total;		/* mean seek on new proc */
@@ -629,34 +632,145 @@ static void as_antic_timeout(unsigned lo
 			/* process anticipated on has exitted or timed out*/
 			ad->exit_prob = (7*ad->exit_prob + 256)/8;
 		}
+		if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
+			/* process not "saved" by a cooperating request */
+			ad->exit_no_coop = (7*ad->exit_no_coop + 256)/8;
+		}
 	}
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
+static void as_update_thinktime(struct as_data *ad, struct as_io_context *aic, unsigned long ttime)
+{
+	/* fixed point: 1.0 == 1<<8 */
+	if (aic->ttime_samples == 0) {
+		ad->new_ttime_total = (7*ad->new_ttime_total + 256*ttime) / 8;
+		ad->new_ttime_mean = ad->new_ttime_total / 256;
+
+		ad->exit_prob = (7*ad->exit_prob)/8;
+	}
+	aic->ttime_samples = (7*aic->ttime_samples + 256) / 8;
+	aic->ttime_total = (7*aic->ttime_total + 256*ttime) / 8;
+	aic->ttime_mean = (aic->ttime_total + 128) / aic->ttime_samples;
+}
+
+static void as_update_seekdist(struct as_data *ad, struct as_io_context *aic, sector_t sdist)
+{
+	u64 total;
+
+	if (aic->seek_samples == 0) {
+		ad->new_seek_total = (7*ad->new_seek_total + 256*(u64)sdist)/8;
+		ad->new_seek_mean = ad->new_seek_total / 256;
+	}
+
+	/*
+	 * Don't allow the seek distance to get too large from the
+	 * odd fragment, pagein, etc
+	 */
+	if (aic->seek_samples <= 60) /* second&third seek */
+		sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*1024);
+	else
+		sdist = min(sdist, (aic->seek_mean * 4)	+ 2*1024*64);
+
+	aic->seek_samples = (7*aic->seek_samples + 256) / 8;
+	aic->seek_total = (7*aic->seek_total + (u64)256*sdist) / 8;
+	total = aic->seek_total + (aic->seek_samples/2);
+	do_div(total, aic->seek_samples);
+	aic->seek_mean = (sector_t)total;
+}
+
+/*
+ * as_update_iohist keeps a decaying histogram of IO thinktimes, and
+ * updates @aic->ttime_mean based on that. It is called when a new
+ * request is queued.
+ */
+static void as_update_iohist(struct as_data *ad, struct as_io_context *aic, struct request *rq)
+{
+	struct as_rq *arq = RQ_DATA(rq);
+	int data_dir = arq->is_sync;
+	unsigned long thinktime = 0;
+	sector_t seek_dist;
+
+	if (aic == NULL)
+		return;
+
+	if (data_dir == REQ_SYNC) {
+		unsigned long in_flight = atomic_read(&aic->nr_queued)
+					+ atomic_read(&aic->nr_dispatched);
+		spin_lock(&aic->lock);
+		if (test_bit(AS_TASK_IORUNNING, &aic->state) ||
+			test_bit(AS_TASK_IOSTARTED, &aic->state)) {
+			/* Calculate read -> read thinktime */
+			if (test_bit(AS_TASK_IORUNNING, &aic->state)
+							&& in_flight == 0) {
+				thinktime = jiffies - aic->last_end_request;
+				thinktime = min(thinktime, MAX_THINKTIME-1);
+			}
+			as_update_thinktime(ad, aic, thinktime);
+
+			/* Calculate read -> read seek distance */
+			if (aic->last_request_pos < rq->sector)
+				seek_dist = rq->sector - aic->last_request_pos;
+			else
+				seek_dist = aic->last_request_pos - rq->sector;
+			as_update_seekdist(ad, aic, seek_dist);
+		}
+		aic->last_request_pos = rq->sector + rq->nr_sectors;
+		set_bit(AS_TASK_IOSTARTED, &aic->state);
+		spin_unlock(&aic->lock);
+	}
+}
+
 /*
  * as_close_req decides if one request is considered "close" to the
  * previous one issued.
  */
-static int as_close_req(struct as_data *ad, struct as_rq *arq)
+static int as_close_req(struct as_data *ad, struct as_io_context *aic, struct as_rq *arq)
 {
 	unsigned long delay;	/* milliseconds */
 	sector_t last = ad->last_sector[ad->batch_data_dir];
 	sector_t next = arq->request->sector;
 	sector_t delta; /* acceptable close offset (in sectors) */
+	sector_t s;
 
 	if (ad->antic_status == ANTIC_OFF || !ad->ioc_finished)
 		delay = 0;
 	else
 		delay = ((jiffies - ad->antic_start) * 1000) / HZ;
 
-	if (delay <= 1)
-		delta = 64;
+	if (delay == 0)
+		delta = 8192;
 	else if (delay <= 20 && delay <= ad->antic_expire)
-		delta = 64 << (delay-1);
+		delta = 8192 << delay;
 	else
 		return 1;
 
-	return (last - (delta>>1) <= next) && (next <= last + delta);
+	if ((last <= next + (delta>>1)) && (next <= last + delta))
+		return 1;
+
+	if (last < next)
+		s = next - last;
+	else
+		s = last - next;
+
+	if (aic->seek_samples == 0) {
+		/*
+		 * Process has just started IO. Use past statistics to
+		 * guage success possibility
+		 */
+		if (ad->new_seek_mean > s) {
+			/* this request is better than what we're expecting */
+			return 1;
+		}
+
+	} else {
+		if (aic->seek_mean > s) {
+			/* this request is better than what we're expecting */
+			return 1;
+		}
+	}
+
+	return 0;
 }
 
 /*
@@ -676,7 +790,6 @@ static int as_can_break_anticipation(str
 {
 	struct io_context *ioc;
 	struct as_io_context *aic;
-	sector_t s;
 
 	ioc = ad->io_context;
 	BUG_ON(!ioc);
@@ -698,13 +811,6 @@ static int as_can_break_anticipation(str
 	if (!aic)
 		return 0;
 
-	if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
-		/* process anticipated on has exitted */
-		if (aic->ttime_samples == 0)
-			ad->exit_prob = (7*ad->exit_prob + 256)/8;
-		return 1;
-	}
-
 	if (atomic_read(&aic->nr_queued) > 0) {
 		/* process has more requests queued */
 		return 1;
@@ -715,57 +821,45 @@ static int as_can_break_anticipation(str
 		return 1;
 	}
 
-	if (arq && arq->is_sync == REQ_SYNC && as_close_req(ad, arq)) {
+	if (arq && arq->is_sync == REQ_SYNC && as_close_req(ad, aic, arq)) {
 		/*
 		 * Found a close request that is not one of ours.
 		 *
-		 * This makes close requests from another process reset
-		 * our thinktime delay. Is generally useful when there are
+		 * This makes close requests from another process update
+		 * our IO history. Is generally useful when there are
 		 * two or more cooperating processes working in the same
 		 * area.
 		 */
-		spin_lock(&aic->lock);
-		aic->last_end_request = jiffies;
-		spin_unlock(&aic->lock);
+		if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
+			if (aic->ttime_samples == 0)
+				ad->exit_prob = (7*ad->exit_prob + 256)/8;
+
+			ad->exit_no_coop = (7*ad->exit_no_coop)/8;
+		}
+
+		as_update_iohist(ad, aic, arq->request);
 		return 1;
 	}
 
+	if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
+		/* process anticipated on has exitted */
+		if (aic->ttime_samples == 0)
+			ad->exit_prob = (7*ad->exit_prob + 256)/8;
+
+		if (ad->exit_no_coop > 128)
+			return 1;
+	}
 
 	if (aic->ttime_samples == 0) {
 		if (ad->new_ttime_mean > ad->antic_expire)
 			return 1;
-		if (ad->exit_prob > 128)
+		if (ad->exit_prob * ad->exit_no_coop > 128*256)
 			return 1;
 	} else if (aic->ttime_mean > ad->antic_expire) {
 		/* the process thinks too much between requests */
 		return 1;
 	}
 
-	if (!arq)
-		return 0;
-
-	if (ad->last_sector[REQ_SYNC] < arq->request->sector)
-		s = arq->request->sector - ad->last_sector[REQ_SYNC];
-	else
-		s = ad->last_sector[REQ_SYNC] - arq->request->sector;
-
-	if (aic->seek_samples == 0) {
-		/*
-		 * Process has just started IO. Use past statistics to
-		 * guage success possibility
-		 */
-		if (ad->new_seek_mean > s) {
-			/* this request is better than what we're expecting */
-			return 1;
-		}
-
-	} else {
-		if (aic->seek_mean > s) {
-			/* this request is better than what we're expecting */
-			return 1;
-		}
-	}
-
 	return 0;
 }
 
@@ -805,88 +899,6 @@ static int as_can_anticipate(struct as_d
 	return 1;
 }
 
-static void as_update_thinktime(struct as_data *ad, struct as_io_context *aic, unsigned long ttime)
-{
-	/* fixed point: 1.0 == 1<<8 */
-	if (aic->ttime_samples == 0) {
-		ad->new_ttime_total = (7*ad->new_ttime_total + 256*ttime) / 8;
-		ad->new_ttime_mean = ad->new_ttime_total / 256;
-
-		ad->exit_prob = (7*ad->exit_prob)/8;
-	}
-	aic->ttime_samples = (7*aic->ttime_samples + 256) / 8;
-	aic->ttime_total = (7*aic->ttime_total + 256*ttime) / 8;
-	aic->ttime_mean = (aic->ttime_total + 128) / aic->ttime_samples;
-}
-
-static void as_update_seekdist(struct as_data *ad, struct as_io_context *aic, sector_t sdist)
-{
-	u64 total;
-
-	if (aic->seek_samples == 0) {
-		ad->new_seek_total = (7*ad->new_seek_total + 256*(u64)sdist)/8;
-		ad->new_seek_mean = ad->new_seek_total / 256;
-	}
-
-	/*
-	 * Don't allow the seek distance to get too large from the
-	 * odd fragment, pagein, etc
-	 */
-	if (aic->seek_samples <= 60) /* second&third seek */
-		sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*1024);
-	else
-		sdist = min(sdist, (aic->seek_mean * 4)	+ 2*1024*64);
-
-	aic->seek_samples = (7*aic->seek_samples + 256) / 8;
-	aic->seek_total = (7*aic->seek_total + (u64)256*sdist) / 8;
-	total = aic->seek_total + (aic->seek_samples/2);
-	do_div(total, aic->seek_samples);
-	aic->seek_mean = (sector_t)total;
-}
-
-/*
- * as_update_iohist keeps a decaying histogram of IO thinktimes, and
- * updates @aic->ttime_mean based on that. It is called when a new
- * request is queued.
- */
-static void as_update_iohist(struct as_data *ad, struct as_io_context *aic, struct request *rq)
-{
-	struct as_rq *arq = RQ_DATA(rq);
-	int data_dir = arq->is_sync;
-	unsigned long thinktime;
-	sector_t seek_dist;
-
-	if (aic == NULL)
-		return;
-
-	if (data_dir == REQ_SYNC) {
-		unsigned long in_flight = atomic_read(&aic->nr_queued)
-					+ atomic_read(&aic->nr_dispatched);
-		spin_lock(&aic->lock);
-		if (test_bit(AS_TASK_IORUNNING, &aic->state) ||
-			test_bit(AS_TASK_IOSTARTED, &aic->state)) {
-			/* Calculate read -> read thinktime */
-			if (test_bit(AS_TASK_IORUNNING, &aic->state)
-							&& in_flight == 0) {
-				thinktime = jiffies - aic->last_end_request;
-				thinktime = min(thinktime, MAX_THINKTIME-1);
-			} else
-				thinktime = 0;
-			as_update_thinktime(ad, aic, thinktime);
-
-			/* Calculate read -> read seek distance */
-			if (aic->last_request_pos < rq->sector)
-				seek_dist = rq->sector - aic->last_request_pos;
-			else
-				seek_dist = aic->last_request_pos - rq->sector;
-			as_update_seekdist(ad, aic, seek_dist);
-		}
-		aic->last_request_pos = rq->sector + rq->nr_sectors;
-		set_bit(AS_TASK_IOSTARTED, &aic->state);
-		spin_unlock(&aic->lock);
-	}
-}
-
 /*
  * as_update_arq must be called whenever a request (arq) is added to
  * the sort_list. This function keeps caches up to date, and checks if the
@@ -1952,6 +1964,7 @@ static ssize_t as_est_show(struct as_dat
 	int pos = 0;
 
 	pos += sprintf(page+pos, "%lu %% exit probability\n", 100*ad->exit_prob/256);
+	pos += sprintf(page+pos, "%lu %% probability of exiting without a cooperating process submitting IO\n", 100*ad->exit_no_coop/256);
 	pos += sprintf(page+pos, "%lu ms new thinktime\n", ad->new_ttime_mean);
 	pos += sprintf(page+pos, "%llu sectors new seek distance\n", (unsigned long long)ad->new_seek_mean);
 
_