From: Nick Piggin <piggin@cyberone.com.au> This one gathers better statistics about the new process problem. It improves estimation for initial process IO. That is, better calculations for whether it will be worth waiting after a process submits its first read. This is done with a per queue average thinktime and seek time for a second read submitted from a process. When combined with 3/3, numbers are around the same as mm1 for most long lived tasks, but much better for things like the top 4 benchmarks. Probably wants rwhron and the OSDL database guys to give it some testing. test 2.6.0-test9-mm1 2.6.0-test9-mm1-np Cat kernel source during seq read 0:26.89 0:24.75 Cat kernel source during seq write 9:17.80 0:23.48 ls -lr kernel source during seq read 0:11.03 0:14.68 ls -lr kernel source during seq write 0:49.95 0:08.06 contest no_load 143s 0 loads 144s 0 loads contest io_load 193s 40.2 loads 193s 40.1 loads contest read_load 186s 11.6 loads 190s 10.5 loads contest list_load 201s 5.0 loads 200s 5.0 loads pgbench 1 client 31.3TPS 31.5TPS pgbench 4 clients 37.7TPS 37.5TPS pgbench 16 clients 42.1TPS 48.1TPS 25-akpm/drivers/block/as-iosched.c | 209 +++++++++++++++++++++++-------------- 1 files changed, 131 insertions(+), 78 deletions(-) diff -puN drivers/block/as-iosched.c~as-new-process-estimation drivers/block/as-iosched.c --- 25/drivers/block/as-iosched.c~as-new-process-estimation Fri Nov 7 15:25:38 2003 +++ 25-akpm/drivers/block/as-iosched.c Fri Nov 7 15:25:38 2003 @@ -70,6 +70,7 @@ /* Bits in as_io_context.state */ enum as_io_states { AS_TASK_RUNNING=0, /* Process has not exitted */ + AS_TASK_IOSTARTED, /* Process has started some IO */ AS_TASK_IORUNNING, /* Process has completed some IO */ }; @@ -99,6 +100,14 @@ struct as_data { sector_t last_sector[2]; /* last REQ_SYNC & REQ_ASYNC sectors */ struct list_head *dispatch; /* driver dispatch queue */ struct list_head *hash; /* request hash */ + + unsigned long exit_prob; /* probability a task will exit while + being 
waited on */ + unsigned long new_ttime_total; /* mean thinktime on new proc */ + unsigned long new_ttime_mean; + u64 new_seek_total; /* mean seek on new proc */ + sector_t new_seek_mean; + unsigned long current_batch_expires; unsigned long last_check_fifo[2]; int changed_batch; /* 1: waiting for old batch to end */ @@ -186,6 +195,7 @@ static void free_as_io_context(struct as /* Called when the task exits */ static void exit_as_io_context(struct as_io_context *aic) { + WARN_ON(!test_bit(AS_TASK_RUNNING, &aic->state)); clear_bit(AS_TASK_RUNNING, &aic->state); } @@ -608,8 +618,15 @@ static void as_antic_timeout(unsigned lo spin_lock_irqsave(q->queue_lock, flags); if (ad->antic_status == ANTIC_WAIT_REQ || ad->antic_status == ANTIC_WAIT_NEXT) { + struct as_io_context *aic = ad->io_context->aic; + ad->antic_status = ANTIC_FINISHED; kblockd_schedule_work(&ad->antic_work); + + if (aic->ttime_samples == 0) { + /* process anticipated on has exited or timed out */ + ad->exit_prob = (7*ad->exit_prob + 256)/8; + } } spin_unlock_irqrestore(q->queue_lock, flags); } @@ -623,7 +640,7 @@ static int as_close_req(struct as_data * unsigned long delay; /* milliseconds */ sector_t last = ad->last_sector[ad->batch_data_dir]; sector_t next = arq->request->sector; - sector_t delta; /* acceptable close offset (in sectors) */ + sector_t delta; /* acceptable close offset (in sectors) */ if (ad->antic_status == ANTIC_OFF || !ad->ioc_finished) delay = 0; @@ -657,6 +674,15 @@ static int as_can_break_anticipation(str { struct io_context *ioc; struct as_io_context *aic; + sector_t s; + + ioc = ad->io_context; + BUG_ON(!ioc); + + if (arq && ioc == arq->io_context) { + /* request from same process */ + return 1; + } if (arq && arq->is_sync == REQ_SYNC && as_close_req(ad, arq)) { /* close request */ @@ -671,20 +697,14 @@ static int as_can_break_anticipation(str return 1; } - ioc = ad->io_context; - BUG_ON(!ioc); - - if (arq && ioc == arq->io_context) { - /* request from same process */ - return 1; - } 
- aic = ioc->aic; if (!aic) return 0; if (!test_bit(AS_TASK_RUNNING, &aic->state)) { /* process anticipated on has exitted */ + if (aic->ttime_samples == 0) + ad->exit_prob = (7*ad->exit_prob + 256)/8; return 1; } @@ -698,27 +718,36 @@ static int as_can_break_anticipation(str return 1; } - if (aic->seek_samples == 0 || aic->ttime_samples == 0) { - /* - * Process has just started IO. Don't anticipate. - * TODO! Must fix this up. - */ - return 1; - } - - if (aic->ttime_mean > ad->antic_expire) { + if (aic->ttime_samples == 0) { + if (ad->new_ttime_mean > ad->antic_expire) + return 1; + if (ad->exit_prob > 128) + return 1; + } else if (aic->ttime_mean > ad->antic_expire) { /* the process thinks too much between requests */ return 1; } - if (arq && aic->seek_samples) { - sector_t s; - if (ad->last_sector[REQ_SYNC] < arq->request->sector) - s = arq->request->sector - ad->last_sector[REQ_SYNC]; - else - s = ad->last_sector[REQ_SYNC] - arq->request->sector; + if (!arq) + return 0; + + if (ad->last_sector[REQ_SYNC] < arq->request->sector) + s = arq->request->sector - ad->last_sector[REQ_SYNC]; + else + s = ad->last_sector[REQ_SYNC] - arq->request->sector; + + if (aic->seek_samples == 0) { + /* + * Process has just started IO. 
Use past statistics to + * gauge success possibility + */ + if (ad->new_seek_mean/2 > s) { + /* this request is better than what we're expecting */ + return 1; + } - if (aic->seek_mean > (s>>1)) { + } else { + if (aic->seek_mean/2 > s) { /* this request is better than what we're expecting */ return 1; } @@ -763,12 +792,51 @@ static int as_can_anticipate(struct as_d return 1; } +static void as_update_thinktime(struct as_data *ad, struct as_io_context *aic, unsigned long ttime) +{ + /* fixed point: 1.0 == 1<<8 */ + if (aic->ttime_samples == 0) { + ad->new_ttime_total = (7*ad->new_ttime_total + 256*ttime) / 8; + ad->new_ttime_mean = ad->new_ttime_total / 256; + + ad->exit_prob = (7*ad->exit_prob)/8; + } + aic->ttime_samples = (7*aic->ttime_samples + 256) / 8; + aic->ttime_total = (7*aic->ttime_total + 256*ttime) / 8; + aic->ttime_mean = (aic->ttime_total + 128) / aic->ttime_samples; +} + +static void as_update_seekdist(struct as_data *ad, struct as_io_context *aic, sector_t sdist) +{ + u64 total; + + if (aic->seek_samples == 0) { + ad->new_seek_total = (7*ad->new_seek_total + 256*(u64)sdist)/8; + ad->new_seek_mean = ad->new_seek_total / 256; + } + + /* + * Don't allow the seek distance to get too large from the + * odd fragment, pagein, etc + */ + if (aic->seek_samples <= 60) /* second&third seek */ + sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*1024); + else + sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*64); + + aic->seek_samples = (7*aic->seek_samples + 256) / 8; + aic->seek_total = (7*aic->seek_total + (u64)256*sdist) / 8; + total = aic->seek_total + (aic->seek_samples/2); + do_div(total, aic->seek_samples); + aic->seek_mean = (sector_t)total; +} + /* * as_update_iohist keeps a decaying histogram of IO thinktimes, and * updates @aic->ttime_mean based on that. It is called when a new * request is queued. 
*/ -static void as_update_iohist(struct as_io_context *aic, struct request *rq) +static void as_update_iohist(struct as_data *ad, struct as_io_context *aic, struct request *rq) { struct as_rq *arq = RQ_DATA(rq); int data_dir = arq->is_sync; @@ -779,60 +847,29 @@ static void as_update_iohist(struct as_i return; if (data_dir == REQ_SYNC) { + unsigned long in_flight = atomic_read(&aic->nr_queued) + + atomic_read(&aic->nr_dispatched); spin_lock(&aic->lock); - - if (test_bit(AS_TASK_IORUNNING, &aic->state) - && !atomic_read(&aic->nr_queued) - && !atomic_read(&aic->nr_dispatched)) { + if (test_bit(AS_TASK_IORUNNING, &aic->state) || + test_bit(AS_TASK_IOSTARTED, &aic->state)) { /* Calculate read -> read thinktime */ - thinktime = jiffies - aic->last_end_request; - thinktime = min(thinktime, MAX_THINKTIME-1); - /* fixed point: 1.0 == 1<<8 */ - aic->ttime_samples += 256; - aic->ttime_total += 256*thinktime; - if (aic->ttime_samples) - /* fixed point factor is cancelled here */ - aic->ttime_mean = (aic->ttime_total + 128) - / aic->ttime_samples; - aic->ttime_samples = (aic->ttime_samples>>1) - + (aic->ttime_samples>>2); - aic->ttime_total = (aic->ttime_total>>1) - + (aic->ttime_total>>2); - } - - /* Calculate read -> read seek distance */ - if (!aic->seek_samples) - seek_dist = 0; - else if (aic->last_request_pos < rq->sector) - seek_dist = rq->sector - aic->last_request_pos; - else - seek_dist = aic->last_request_pos - rq->sector; - + if (test_bit(AS_TASK_IORUNNING, &aic->state) + && in_flight == 0) { + thinktime = jiffies - aic->last_end_request; + thinktime = min(thinktime, MAX_THINKTIME-1); + } else + thinktime = 0; + as_update_thinktime(ad, aic, thinktime); + + /* Calculate read -> read seek distance */ + if (aic->last_request_pos < rq->sector) + seek_dist = rq->sector - aic->last_request_pos; + else + seek_dist = aic->last_request_pos - rq->sector; + as_update_seekdist(ad, aic, seek_dist); + } aic->last_request_pos = rq->sector + rq->nr_sectors; - - /* - * Don't allow 
the seek distance to get too large from the - * odd fragment, pagein, etc - */ - if (aic->seek_samples < 400) /* second&third seek */ - seek_dist = min(seek_dist, (aic->seek_mean * 4) - + 2*1024*1024); - else - seek_dist = min(seek_dist, (aic->seek_mean * 4) - + 2*1024*64); - - aic->seek_samples += 256; - aic->seek_total += (u64)256*seek_dist; - if (aic->seek_samples) { - u64 total = aic->seek_total + (aic->seek_samples>>1); - do_div(total, aic->seek_samples); - aic->seek_mean = (sector_t)total; - } - aic->seek_samples = (aic->seek_samples>>1) - + (aic->seek_samples>>2); - aic->seek_total = (aic->seek_total>>1) - + (aic->seek_total>>2); - + set_bit(AS_TASK_IOSTARTED, &aic->state); spin_unlock(&aic->lock); } } @@ -1376,8 +1413,8 @@ static void as_add_request(struct as_dat arq->io_context = as_get_io_context(); if (arq->io_context) { + as_update_iohist(ad, arq->io_context->aic, arq->request); atomic_inc(&arq->io_context->aic->nr_queued); - as_update_iohist(arq->io_context->aic, arq->request); } alias = as_add_arq_rb(ad, arq); @@ -1885,6 +1922,17 @@ as_var_store(unsigned long *var, const c return count; } +static ssize_t as_est_show(struct as_data *ad, char *page) +{ + int pos = 0; + + pos += sprintf(page+pos, "%lu %% exit probability\n", 100*ad->exit_prob/256); + pos += sprintf(page+pos, "%lu ms new thinktime\n", ad->new_ttime_mean); + pos += sprintf(page+pos, "%llu sectors new seek distance\n", (unsigned long long)ad->new_seek_mean); + + return pos; +} + #define SHOW_FUNCTION(__FUNC, __VAR) \ static ssize_t __FUNC(struct as_data *ad, char *page) \ { \ @@ -1916,6 +1964,10 @@ STORE_FUNCTION(as_write_batchexpire_stor &ad->batch_expire[REQ_ASYNC], 0, INT_MAX); #undef STORE_FUNCTION +static struct as_fs_entry as_est_entry = { + .attr = {.name = "est_time", .mode = S_IRUGO }, + .show = as_est_show, +}; static struct as_fs_entry as_readexpire_entry = { .attr = {.name = "read_expire", .mode = S_IRUGO | S_IWUSR }, .show = as_readexpire_show, @@ -1943,6 +1995,7 @@ static 
struct as_fs_entry as_write_batch }; static struct attribute *default_attrs[] = { + &as_est_entry.attr, &as_readexpire_entry.attr, &as_writeexpire_entry.attr, &as_anticexpire_entry.attr, _