From: Nick Piggin <piggin@cyberone.com.au>

This really should get into 2.6 sometime. Not only because people want
to have fun switching them around, but because different queues have
different needs: flash devices, nbd may want noop, a database raid
deadline, AS for your system disk.

Anyway, q->elevator is now dynamically allocated. Allocation and freeing
are handled by elevator.c. kobject / sysfs stuff is also handled there.

A side effect is it introduces blk_wait_free_list which can be used for
drivers to wait for the queue to become empty. I think this was needed for
safe disk hotplug.



 Documentation/block/biodoc.txt   |   17 +++
 drivers/block/as-iosched.c       |  125 ++++++++++++++------------
 drivers/block/cfq-iosched.c      |   78 ++++++++--------
 drivers/block/deadline-iosched.c |   53 ++++++-----
 drivers/block/elevator.c         |   91 +++++++++++++-----
 drivers/block/ll_rw_blk.c        |  187 +++++++++++++++++++++++++++++++++------
 include/linux/blkdev.h           |   21 +++-
 include/linux/elevator.h         |   10 +-
 8 files changed, 413 insertions(+), 169 deletions(-)

diff -puN Documentation/block/biodoc.txt~elv-select Documentation/block/biodoc.txt
--- 25/Documentation/block/biodoc.txt~elv-select	2003-10-17 19:07:35.000000000 -0700
+++ 25-akpm/Documentation/block/biodoc.txt	2003-10-17 19:07:35.000000000 -0700
@@ -969,10 +969,23 @@ elevator_set_req_fn
 elevator_put_req_fn		Must be used to allocate and free any elevator
 				specific storate for a request.
 
-elevator_init_fn
-elevator_exit_fn		Allocate and free any elevator specific storage
+elevator_alloc_fn
+elevator_release_fn		Allocate and free any elevator specific storage
 				for a queue.
 
+elevator_init_fn
+elevator_exit_fn		Initialise and shutdown and elevator with an
+				associated queue. init must not fail - failing
+				routines must be performed in elevator_alloc.
+				Queue will be empty before exit is called and
+				no future requests will be inserted.
+
+4.1a Calling order for startup and shutdown functions.
+elevator_alloc_fn
+elevator_init_fn
+elevator_exit_fn
+elevator_release_fn
+
 4.2 I/O scheduler implementation
 The generic i/o scheduler algorithm attempts to sort/merge/batch requests for
 optimal disk scan and request servicing performance (based on generic
diff -puN drivers/block/as-iosched.c~elv-select drivers/block/as-iosched.c
--- 25/drivers/block/as-iosched.c~elv-select	2003-10-17 19:07:35.000000000 -0700
+++ 25-akpm/drivers/block/as-iosched.c	2003-10-17 19:07:35.000000000 -0700
@@ -606,7 +606,7 @@ static void as_antic_stop(struct as_data
 static void as_antic_timeout(unsigned long data)
 {
 	struct request_queue *q = (struct request_queue *)data;
-	struct as_data *ad = q->elevator.elevator_data;
+	struct as_data *ad = q->elevator->elevator_data;
 	unsigned long flags;
 
 	spin_lock_irqsave(q->queue_lock, flags);
@@ -906,7 +906,7 @@ void update_write_batch(struct as_data *
  */
 static void as_completed_request(request_queue_t *q, struct request *rq)
 {
-	struct as_data *ad = q->elevator.elevator_data;
+	struct as_data *ad = q->elevator->elevator_data;
 	struct as_rq *arq = RQ_DATA(rq);
 	struct as_io_context *aic;
 
@@ -979,7 +979,7 @@ static void as_remove_queued_request(req
 {
 	struct as_rq *arq = RQ_DATA(rq);
 	const int data_dir = arq->is_sync;
-	struct as_data *ad = q->elevator.elevator_data;
+	struct as_data *ad = q->elevator->elevator_data;
 
 	WARN_ON(arq->state != AS_RQ_QUEUED);
 
@@ -1305,7 +1305,7 @@ fifo_expired:
 
 static struct request *as_next_request(request_queue_t *q)
 {
-	struct as_data *ad = q->elevator.elevator_data;
+	struct as_data *ad = q->elevator->elevator_data;
 	struct request *rq = NULL;
 
 	/*
@@ -1412,7 +1412,7 @@ static void as_add_request(struct as_dat
  */
 static void as_requeue_request(request_queue_t *q, struct request *rq)
 {
-	struct as_data *ad = q->elevator.elevator_data;
+	struct as_data *ad = q->elevator->elevator_data;
 	struct as_rq *arq = RQ_DATA(rq);
 
 	if (arq) {
@@ -1434,7 +1434,7 @@ static void as_requeue_request(request_q
 static void
 as_insert_request(request_queue_t *q, struct request *rq, int where)
 {
-	struct as_data *ad = q->elevator.elevator_data;
+	struct as_data *ad = q->elevator->elevator_data;
 	struct as_rq *arq = RQ_DATA(rq);
 
 	/* barriers must flush the reorder queue */
@@ -1475,7 +1475,7 @@ as_insert_request(request_queue_t *q, st
  */
 static int as_queue_empty(request_queue_t *q)
 {
-	struct as_data *ad = q->elevator.elevator_data;
+	struct as_data *ad = q->elevator->elevator_data;
 
 	if (!list_empty(&ad->fifo_list[REQ_ASYNC])
 		|| !list_empty(&ad->fifo_list[REQ_SYNC])
@@ -1514,7 +1514,7 @@ as_latter_request(request_queue_t *q, st
 static int
 as_merge(request_queue_t *q, struct request **req, struct bio *bio)
 {
-	struct as_data *ad = q->elevator.elevator_data;
+	struct as_data *ad = q->elevator->elevator_data;
 	sector_t rb_key = bio->bi_sector + bio_sectors(bio);
 	struct request *__rq;
 	int ret;
@@ -1569,7 +1569,7 @@ out_insert:
 
 static void as_merged_request(request_queue_t *q, struct request *req)
 {
-	struct as_data *ad = q->elevator.elevator_data;
+	struct as_data *ad = q->elevator->elevator_data;
 	struct as_rq *arq = RQ_DATA(req);
 
 	/*
@@ -1614,7 +1614,7 @@ static void
 as_merged_requests(request_queue_t *q, struct request *req,
 			 struct request *next)
 {
-	struct as_data *ad = q->elevator.elevator_data;
+	struct as_data *ad = q->elevator->elevator_data;
 	struct as_rq *arq = RQ_DATA(req);
 	struct as_rq *anext = RQ_DATA(next);
 
@@ -1700,7 +1700,7 @@ static void as_work_handler(void *data)
 
 static void as_put_request(request_queue_t *q, struct request *rq)
 {
-	struct as_data *ad = q->elevator.elevator_data;
+	struct as_data *ad = q->elevator->elevator_data;
 	struct as_rq *arq = RQ_DATA(rq);
 
 	if (!arq) {
@@ -1714,7 +1714,7 @@ static void as_put_request(request_queue
 
 static int as_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
 {
-	struct as_data *ad = q->elevator.elevator_data;
+	struct as_data *ad = q->elevator->elevator_data;
 	struct as_rq *arq = mempool_alloc(ad->arq_pool, gfp_mask);
 
 	if (arq) {
@@ -1735,7 +1735,7 @@ static int as_set_request(request_queue_
 static int as_may_queue(request_queue_t *q, int rw)
 {
 	int ret = 0;
-	struct as_data *ad = q->elevator.elevator_data;
+	struct as_data *ad = q->elevator->elevator_data;
 	struct io_context *ioc;
 	if (ad->antic_status == ANTIC_WAIT_REQ ||
 			ad->antic_status == ANTIC_WAIT_NEXT) {
@@ -1748,54 +1748,18 @@ static int as_may_queue(request_queue_t 
 	return ret;
 }
 
-static void as_exit(request_queue_t *q, elevator_t *e)
-{
-	struct as_data *ad = e->elevator_data;
-
-	del_timer_sync(&ad->antic_timer);
-	kblockd_flush();
-
-	BUG_ON(!list_empty(&ad->fifo_list[REQ_SYNC]));
-	BUG_ON(!list_empty(&ad->fifo_list[REQ_ASYNC]));
-
-	mempool_destroy(ad->arq_pool);
-	put_io_context(ad->io_context);
-	kfree(ad->hash);
-	kfree(ad);
-}
-
 /*
  * initialize elevator private data (as_data), and alloc a arq for
  * each request on the free lists
  */
-static int as_init(request_queue_t *q, elevator_t *e)
+static void as_init(request_queue_t *q, elevator_t *e)
 {
-	struct as_data *ad;
+	struct as_data *ad = e->elevator_data;
 	int i;
 
-	if (!arq_pool)
-		return -ENOMEM;
-
-	ad = kmalloc(sizeof(*ad), GFP_KERNEL);
-	if (!ad)
-		return -ENOMEM;
-	memset(ad, 0, sizeof(*ad));
-
+	q->elevator = e;
 	ad->q = q; /* Identify what queue the data belongs to */
 
-	ad->hash = kmalloc(sizeof(struct list_head)*AS_HASH_ENTRIES,GFP_KERNEL);
-	if (!ad->hash) {
-		kfree(ad);
-		return -ENOMEM;
-	}
-
-	ad->arq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, arq_pool);
-	if (!ad->arq_pool) {
-		kfree(ad->hash);
-		kfree(ad);
-		return -ENOMEM;
-	}
-
 	/* anticipatory scheduling helpers */
 	ad->antic_timer.function = as_antic_timeout;
 	ad->antic_timer.data = (unsigned long)q;
@@ -1815,7 +1779,6 @@ static int as_init(request_queue_t *q, e
 	ad->antic_expire = default_antic_expire;
 	ad->batch_expire[REQ_SYNC] = default_read_batch_expire;
 	ad->batch_expire[REQ_ASYNC] = default_write_batch_expire;
-	e->elevator_data = ad;
 
 	ad->current_batch_expires = jiffies + ad->batch_expire[REQ_SYNC];
 	ad->write_batch_count = ad->batch_expire[REQ_ASYNC] / 10;
@@ -1823,8 +1786,58 @@ static int as_init(request_queue_t *q, e
 		ad->write_batch_count = 2;
 
 	ad->new_success = 512;
+}
 
-	return 0;
+static void as_exit(request_queue_t *q, elevator_t *e)
+{
+	struct as_data *ad = e->elevator_data;
+
+	BUG_ON(!as_queue_empty(ad->q));
+	put_io_context(ad->io_context);
+}
+
+static int as_alloc(elevator_t *e)
+{
+	struct as_data *ad;
+
+	if (!arq_pool)
+		return -ENOMEM;
+
+	ad = kmalloc(sizeof(*ad), GFP_KERNEL);
+	if (!ad)
+		return -ENOMEM;
+	memset(ad, 0, sizeof(*ad));
+
+	ad->hash = kmalloc(sizeof(struct list_head)*AS_HASH_ENTRIES,GFP_KERNEL);
+	if (!ad->hash) {
+		kfree(ad);
+		return -ENOMEM;
+	}
+
+	ad->arq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab,
+						mempool_free_slab, arq_pool);
+	if (!ad->arq_pool) {
+		kfree(ad->hash);
+		kfree(ad);
+		return -ENOMEM;
+	}
+
+	e->elevator_data = ad;
+
+	return 0;
+}
+
+static void as_release(elevator_t *e)
+{
+	struct as_data *ad = e->elevator_data;
+
+	del_timer_sync(&ad->antic_timer);
+	kblockd_flush();
+
+	mempool_destroy(ad->arq_pool);
+	kfree(ad->hash);
+
+	kfree(ad);
 }
 
 /*
@@ -1925,7 +1938,7 @@ static struct attribute *default_attrs[]
 	NULL,
 };
 
-#define to_as(atr) container_of((atr), struct as_fs_entry, attr)
+#define to_as(ATR) container_of((ATR), struct as_fs_entry, attr)
 
 static ssize_t
 as_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
@@ -1992,6 +2005,8 @@ elevator_t iosched_as = {
 	.elevator_may_queue_fn =	as_may_queue,
 	.elevator_init_fn =		as_init,
 	.elevator_exit_fn =		as_exit,
+	.elevator_alloc_fn =		as_alloc,
+	.elevator_release_fn =		as_release,
 
 	.elevator_ktype =		&as_ktype,
 	.elevator_name =		"anticipatory",
diff -puN drivers/block/cfq-iosched.c~elv-select drivers/block/cfq-iosched.c
--- 25/drivers/block/cfq-iosched.c~elv-select	2003-10-17 19:07:35.000000000 -0700
+++ 25-akpm/drivers/block/cfq-iosched.c	2003-10-17 19:07:35.000000000 -0700
@@ -238,7 +238,7 @@ out:
 
 static void cfq_remove_request(request_queue_t *q, struct request *rq)
 {
-	struct cfq_data *cfqd = q->elevator.elevator_data;
+	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct cfq_rq *crq = RQ_DATA(rq);
 
 	if (crq) {
@@ -259,7 +259,7 @@ static void cfq_remove_request(request_q
 static int
 cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
 {
-	struct cfq_data *cfqd = q->elevator.elevator_data;
+	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct request *__rq;
 	int ret;
 
@@ -297,7 +297,7 @@ out_insert:
 
 static void cfq_merged_request(request_queue_t *q, struct request *req)
 {
-	struct cfq_data *cfqd = q->elevator.elevator_data;
+	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct cfq_rq *crq = RQ_DATA(req);
 
 	cfq_del_crq_hash(crq);
@@ -393,7 +393,7 @@ restart:
 
 static struct request *cfq_next_request(request_queue_t *q)
 {
-	struct cfq_data *cfqd = q->elevator.elevator_data;
+	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct request *rq;
 
 	if (!list_empty(cfqd->dispatch)) {
@@ -483,7 +483,7 @@ static void cfq_enqueue(struct cfq_data 
 static void
 cfq_insert_request(request_queue_t *q, struct request *rq, int where)
 {
-	struct cfq_data *cfqd = q->elevator.elevator_data;
+	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct cfq_rq *crq = RQ_DATA(rq);
 
 	switch (where) {
@@ -514,7 +514,7 @@ cfq_insert_request(request_queue_t *q, s
 
 static int cfq_queue_empty(request_queue_t *q)
 {
-	struct cfq_data *cfqd = q->elevator.elevator_data;
+	struct cfq_data *cfqd = q->elevator->elevator_data;
 
 	if (list_empty(cfqd->dispatch) && list_empty(&cfqd->rr_list))
 		return 1;
@@ -548,7 +548,7 @@ cfq_latter_request(request_queue_t *q, s
 
 static int cfq_may_queue(request_queue_t *q, int rw)
 {
-	struct cfq_data *cfqd = q->elevator.elevator_data;
+	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct cfq_queue *cfqq;
 	int ret = 1;
 
@@ -573,7 +573,7 @@ out:
 
 static void cfq_put_request(request_queue_t *q, struct request *rq)
 {
-	struct cfq_data *cfqd = q->elevator.elevator_data;
+	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct cfq_rq *crq = RQ_DATA(rq);
 
 	if (crq) {
@@ -587,7 +587,7 @@ static void cfq_put_request(request_queu
 
 static int cfq_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
 {
-	struct cfq_data *cfqd = q->elevator.elevator_data;
+	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct cfq_rq *crq = mempool_alloc(cfqd->crq_pool, gfp_mask);
 
 	if (crq) {
@@ -602,28 +602,14 @@ static int cfq_set_request(request_queue
 	return 1;
 }
 
-static void cfq_exit(request_queue_t *q, elevator_t *e)
-{
-	struct cfq_data *cfqd = e->elevator_data;
-
-	e->elevator_data = NULL;
-	mempool_destroy(cfqd->crq_pool);
-	kfree(cfqd->crq_hash);
-	kfree(cfqd->cfq_hash);
-	kfree(cfqd);
-}
-
-static int cfq_init(request_queue_t *q, elevator_t *e)
+static int cfq_alloc(elevator_t *e)
 {
 	struct cfq_data *cfqd;
-	int i;
 
 	cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL);
 	if (!cfqd)
 		return -ENOMEM;
-
 	memset(cfqd, 0, sizeof(*cfqd));
-	INIT_LIST_HEAD(&cfqd->rr_list);
 
 	cfqd->crq_hash = kmalloc(sizeof(struct list_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL);
 	if (!cfqd->crq_hash)
@@ -637,13 +623,43 @@ static int cfq_init(request_queue_t *q, 
 	if (!cfqd->crq_pool)
 		goto out_crqpool;
 
+	e->elevator_data = cfqd;
+
+	return 0;
+out_crqpool:
+	kfree(cfqd->cfq_hash);
+out_cfqhash:
+	kfree(cfqd->crq_hash);
+out_crqhash:
+	kfree(cfqd);
+	return -ENOMEM;
+
+}
+
+static void cfq_release(elevator_t *e)
+{
+	struct cfq_data *cfqd = e->elevator_data;
+
+	e->elevator_data = NULL;
+	mempool_destroy(cfqd->crq_pool);
+	kfree(cfqd->crq_hash);
+	kfree(cfqd->cfq_hash);
+	kfree(cfqd);
+}
+
+static void cfq_init(request_queue_t *q, elevator_t *e)
+{
+	struct cfq_data *cfqd = e->elevator_data;
+	int i;
+
+	INIT_LIST_HEAD(&cfqd->rr_list);
+
 	for (i = 0; i < CFQ_MHASH_ENTRIES; i++)
 		INIT_LIST_HEAD(&cfqd->crq_hash[i]);
 	for (i = 0; i < CFQ_QHASH_ENTRIES; i++)
 		INIT_LIST_HEAD(&cfqd->cfq_hash[i]);
 
 	cfqd->dispatch = &q->queue_head;
-	e->elevator_data = cfqd;
 
 	/*
 	 * just set it to some high value, we want anyone to be able to queue
@@ -651,15 +667,6 @@ static int cfq_init(request_queue_t *q, 
 	 */
 	cfqd->max_queued = q->nr_requests;
 	q->nr_requests = 8192;
-
-	return 0;
-out_crqpool:
-	kfree(cfqd->cfq_hash);
-out_cfqhash:
-	kfree(cfqd->crq_hash);
-out_crqhash:
-	kfree(cfqd);
-	return -ENOMEM;
 }
 
 static int __init cfq_slab_setup(void)
@@ -701,7 +708,8 @@ elevator_t iosched_cfq = {
 	.elevator_put_req_fn =		cfq_put_request,
 	.elevator_may_queue_fn =	cfq_may_queue,
 	.elevator_init_fn =		cfq_init,
-	.elevator_exit_fn =		cfq_exit,
+	.elevator_alloc_fn =		cfq_alloc,
+	.elevator_release_fn =		cfq_release,
 };
 
 EXPORT_SYMBOL(iosched_cfq);
diff -puN drivers/block/deadline-iosched.c~elv-select drivers/block/deadline-iosched.c
--- 25/drivers/block/deadline-iosched.c~elv-select	2003-10-17 19:07:35.000000000 -0700
+++ 25-akpm/drivers/block/deadline-iosched.c	2003-10-17 19:07:35.000000000 -0700
@@ -289,7 +289,7 @@ deadline_find_first_drq(struct deadline_
 static inline void
 deadline_add_request(struct request_queue *q, struct request *rq)
 {
-	struct deadline_data *dd = q->elevator.elevator_data;
+	struct deadline_data *dd = q->elevator->elevator_data;
 	struct deadline_rq *drq = RQ_DATA(rq);
 
 	const int data_dir = rq_data_dir(drq->request);
@@ -317,7 +317,7 @@ static void deadline_remove_request(requ
 	struct deadline_rq *drq = RQ_DATA(rq);
 
 	if (drq) {
-		struct deadline_data *dd = q->elevator.elevator_data;
+		struct deadline_data *dd = q->elevator->elevator_data;
 
 		list_del_init(&drq->fifo);
 		deadline_remove_merge_hints(q, drq);
@@ -328,7 +328,7 @@ static void deadline_remove_request(requ
 static int
 deadline_merge(request_queue_t *q, struct request **req, struct bio *bio)
 {
-	struct deadline_data *dd = q->elevator.elevator_data;
+	struct deadline_data *dd = q->elevator->elevator_data;
 	struct request *__rq;
 	int ret;
 
@@ -383,7 +383,7 @@ out_insert:
 
 static void deadline_merged_request(request_queue_t *q, struct request *req)
 {
-	struct deadline_data *dd = q->elevator.elevator_data;
+	struct deadline_data *dd = q->elevator->elevator_data;
 	struct deadline_rq *drq = RQ_DATA(req);
 
 	/*
@@ -407,7 +407,7 @@ static void
 deadline_merged_requests(request_queue_t *q, struct request *req,
 			 struct request *next)
 {
-	struct deadline_data *dd = q->elevator.elevator_data;
+	struct deadline_data *dd = q->elevator->elevator_data;
 	struct deadline_rq *drq = RQ_DATA(req);
 	struct deadline_rq *dnext = RQ_DATA(next);
 
@@ -604,7 +604,7 @@ dispatch_request:
 
 static struct request *deadline_next_request(request_queue_t *q)
 {
-	struct deadline_data *dd = q->elevator.elevator_data;
+	struct deadline_data *dd = q->elevator->elevator_data;
 	struct request *rq;
 
 	/*
@@ -625,7 +625,7 @@ dispatch:
 static void
 deadline_insert_request(request_queue_t *q, struct request *rq, int where)
 {
-	struct deadline_data *dd = q->elevator.elevator_data;
+	struct deadline_data *dd = q->elevator->elevator_data;
 
 	/* barriers must flush the reorder queue */
 	if (unlikely(rq->flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)
@@ -653,7 +653,7 @@ deadline_insert_request(request_queue_t 
 
 static int deadline_queue_empty(request_queue_t *q)
 {
-	struct deadline_data *dd = q->elevator.elevator_data;
+	struct deadline_data *dd = q->elevator->elevator_data;
 
 	if (!list_empty(&dd->fifo_list[WRITE])
 	    || !list_empty(&dd->fifo_list[READ])
@@ -687,7 +687,7 @@ deadline_latter_request(request_queue_t 
 	return NULL;
 }
 
-static void deadline_exit(request_queue_t *q, elevator_t *e)
+static void deadline_release(elevator_t *e)
 {
 	struct deadline_data *dd = e->elevator_data;
 
@@ -699,14 +699,9 @@ static void deadline_exit(request_queue_
 	kfree(dd);
 }
 
-/*
- * initialize elevator private data (deadline_data), and alloc a drq for
- * each request on the free lists
- */
-static int deadline_init(request_queue_t *q, elevator_t *e)
+static int deadline_alloc(elevator_t *e)
 {
 	struct deadline_data *dd;
-	int i;
 
 	if (!drq_pool)
 		return -ENOMEM;
@@ -722,13 +717,30 @@ static int deadline_init(request_queue_t
 		return -ENOMEM;
 	}
 
-	dd->drq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, drq_pool);
+	dd->drq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab,
+						mempool_free_slab, drq_pool);
 	if (!dd->drq_pool) {
 		kfree(dd->hash);
 		kfree(dd);
 		return -ENOMEM;
 	}
 
+	e->elevator_data = dd;
+
+	return 0;
+}
+
+/*
+ * initialize elevator private data (deadline_data), and alloc a drq for
+ * each request on the free lists
+ */
+static void deadline_init(request_queue_t *q, elevator_t *e)
+{
+	struct deadline_data *dd = e->elevator_data;
+	int i;
+
+	q->elevator = e;
+
 	for (i = 0; i < DL_HASH_ENTRIES; i++)
 		INIT_LIST_HEAD(&dd->hash[i]);
 
@@ -742,13 +754,11 @@ static int deadline_init(request_queue_t
 	dd->writes_starved = writes_starved;
 	dd->front_merges = 1;
 	dd->fifo_batch = fifo_batch;
-	e->elevator_data = dd;
-	return 0;
 }
 
 static void deadline_put_request(request_queue_t *q, struct request *rq)
 {
-	struct deadline_data *dd = q->elevator.elevator_data;
+	struct deadline_data *dd = q->elevator->elevator_data;
 	struct deadline_rq *drq = RQ_DATA(rq);
 
 	if (drq) {
@@ -760,7 +770,7 @@ static void deadline_put_request(request
 static int
 deadline_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
 {
-	struct deadline_data *dd = q->elevator.elevator_data;
+	struct deadline_data *dd = q->elevator->elevator_data;
 	struct deadline_rq *drq;
 
 	drq = mempool_alloc(dd->drq_pool, gfp_mask);
@@ -931,7 +941,8 @@ elevator_t iosched_deadline = {
 	.elevator_set_req_fn =		deadline_set_request,
 	.elevator_put_req_fn = 		deadline_put_request,
 	.elevator_init_fn =		deadline_init,
-	.elevator_exit_fn =		deadline_exit,
+	.elevator_alloc_fn =		deadline_alloc,
+	.elevator_release_fn =		deadline_release,
 
 	.elevator_ktype =		&deadline_ktype,
 	.elevator_name =		"deadline",
diff -puN drivers/block/elevator.c~elv-select drivers/block/elevator.c
--- 25/drivers/block/elevator.c~elv-select	2003-10-17 19:07:35.000000000 -0700
+++ 25-akpm/drivers/block/elevator.c	2003-10-17 19:07:35.000000000 -0700
@@ -89,29 +89,60 @@ inline int elv_try_last_merge(request_qu
 /*
  * general block -> elevator interface starts here
  */
-int elevator_init(request_queue_t *q, elevator_t *type)
+void elevator_init(request_queue_t *q, elevator_t *e)
 {
-	elevator_t *e = &q->elevator;
-
-	memcpy(e, type, sizeof(*e));
+	q->elevator = e;
 
 	INIT_LIST_HEAD(&q->queue_head);
 	q->last_merge = NULL;
 
 	if (e->elevator_init_fn)
-		return e->elevator_init_fn(q, e);
-
-	return 0;
+		e->elevator_init_fn(q, e);
 }
 
 void elevator_exit(request_queue_t *q)
 {
-	elevator_t *e = &q->elevator;
+	elevator_t *e = q->elevator;
+
+	BUG_ON(q->rq.count[READ] || q->rq.count[WRITE]);
 
 	if (e->elevator_exit_fn)
 		e->elevator_exit_fn(q, e);
 }
 
+elevator_t *elevator_alloc(elevator_t *type)
+{
+	elevator_t *e = kmalloc(sizeof(*type), GFP_KERNEL);
+
+	if (e == NULL)
+		goto out_err;
+
+	memcpy(e, type, sizeof(*e));
+
+	if (e->elevator_alloc_fn)
+		if (e->elevator_alloc_fn(e))
+			goto out_alloc;
+
+	return e;
+
+out_alloc:
+	kfree(e);
+out_err:
+	return NULL;
+}
+
+void elevator_release(struct kobject *kobj)
+{
+	elevator_t *e = container_of(kobj, elevator_t, kobj);
+
+	printk(KERN_INFO "releasing %s io scheduler\n", e->elevator_name);
+
+	if (e->elevator_release_fn)
+		e->elevator_release_fn(e);
+
+	kfree(e);
+}
+
 int elevator_global_init(void)
 {
 	return 0;
@@ -119,7 +150,7 @@ int elevator_global_init(void)
 
 int elv_merge(request_queue_t *q, struct request **req, struct bio *bio)
 {
-	elevator_t *e = &q->elevator;
+	elevator_t *e = q->elevator;
 
 	if (e->elevator_merge_fn)
 		return e->elevator_merge_fn(q, req, bio);
@@ -129,7 +160,7 @@ int elv_merge(request_queue_t *q, struct
 
 void elv_merged_request(request_queue_t *q, struct request *rq)
 {
-	elevator_t *e = &q->elevator;
+	elevator_t *e = q->elevator;
 
 	if (e->elevator_merged_fn)
 		e->elevator_merged_fn(q, rq);
@@ -138,7 +169,7 @@ void elv_merged_request(request_queue_t 
 void elv_merge_requests(request_queue_t *q, struct request *rq,
 			     struct request *next)
 {
-	elevator_t *e = &q->elevator;
+	elevator_t *e = q->elevator;
 
 	if (q->last_merge == next)
 		q->last_merge = NULL;
@@ -153,8 +184,8 @@ void elv_requeue_request(request_queue_t
 	 * if iosched has an explicit requeue hook, then use that. otherwise
 	 * just put the request at the front of the queue
 	 */
-	if (q->elevator.elevator_requeue_req_fn)
-		q->elevator.elevator_requeue_req_fn(q, rq);
+	if (q->elevator->elevator_requeue_req_fn)
+		q->elevator->elevator_requeue_req_fn(q, rq);
 	else
 		__elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 0);
 }
@@ -165,7 +196,7 @@ void __elv_add_request(request_queue_t *
 	if (plug)
 		blk_plug_device(q);
 
-	q->elevator.elevator_add_req_fn(q, rq, where);
+	q->elevator->elevator_add_req_fn(q, rq, where);
 }
 
 void elv_add_request(request_queue_t *q, struct request *rq, int where,
@@ -180,7 +211,7 @@ void elv_add_request(request_queue_t *q,
 
 static inline struct request *__elv_next_request(request_queue_t *q)
 {
-	return q->elevator.elevator_next_req_fn(q);
+	return q->elevator->elevator_next_req_fn(q);
 }
 
 struct request *elv_next_request(request_queue_t *q)
@@ -225,7 +256,7 @@ struct request *elv_next_request(request
 
 void elv_remove_request(request_queue_t *q, struct request *rq)
 {
-	elevator_t *e = &q->elevator;
+	elevator_t *e = q->elevator;
 
 	/*
 	 * the main clearing point for q->last_merge is on retrieval of
@@ -243,7 +274,7 @@ void elv_remove_request(request_queue_t 
 
 int elv_queue_empty(request_queue_t *q)
 {
-	elevator_t *e = &q->elevator;
+	elevator_t *e = q->elevator;
 
 	if (e->elevator_queue_empty_fn)
 		return e->elevator_queue_empty_fn(q);
@@ -255,7 +286,7 @@ struct request *elv_latter_request(reque
 {
 	struct list_head *next;
 
-	elevator_t *e = &q->elevator;
+	elevator_t *e = q->elevator;
 
 	if (e->elevator_latter_req_fn)
 		return e->elevator_latter_req_fn(q, rq);
@@ -271,7 +302,7 @@ struct request *elv_former_request(reque
 {
 	struct list_head *prev;
 
-	elevator_t *e = &q->elevator;
+	elevator_t *e = q->elevator;
 
 	if (e->elevator_former_req_fn)
 		return e->elevator_former_req_fn(q, rq);
@@ -285,7 +316,7 @@ struct request *elv_former_request(reque
 
 int elv_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
 {
-	elevator_t *e = &q->elevator;
+	elevator_t *e = q->elevator;
 
 	if (e->elevator_set_req_fn)
 		return e->elevator_set_req_fn(q, rq, gfp_mask);
@@ -296,7 +327,7 @@ int elv_set_request(request_queue_t *q, 
 
 void elv_put_request(request_queue_t *q, struct request *rq)
 {
-	elevator_t *e = &q->elevator;
+	elevator_t *e = q->elevator;
 
 	if (e->elevator_put_req_fn)
 		e->elevator_put_req_fn(q, rq);
@@ -304,7 +335,7 @@ void elv_put_request(request_queue_t *q,
 
 int elv_may_queue(request_queue_t *q, int rw)
 {
-	elevator_t *e = &q->elevator;
+	elevator_t *e = q->elevator;
 
 	if (e->elevator_may_queue_fn)
 		return e->elevator_may_queue_fn(q, rw);
@@ -314,24 +345,32 @@ int elv_may_queue(request_queue_t *q, in
 
 void elv_completed_request(request_queue_t *q, struct request *rq)
 {
-	elevator_t *e = &q->elevator;
+	elevator_t *e = q->elevator;
 
 	if (e->elevator_completed_req_fn)
 		e->elevator_completed_req_fn(q, rq);
 }
 
+static struct kobj_type default_ktype = {
+	.release = &elevator_release,
+};
+
 int elv_register_queue(struct request_queue *q)
 {
 	elevator_t *e;
 
-	e = &q->elevator;
+	e = q->elevator;
 
 	e->kobj.parent = kobject_get(&q->kobj);
 	if (!e->kobj.parent)
 		return -EBUSY;
 
 	snprintf(e->kobj.name, KOBJ_NAME_LEN, "%s", "iosched");
-	e->kobj.ktype = e->elevator_ktype;
+	if (e->elevator_ktype) {
+		e->elevator_ktype->release = &elevator_release;
+		e->kobj.ktype = e->elevator_ktype;
+	} else
+		e->kobj.ktype = &default_ktype;
 
 	return kobject_register(&e->kobj);
 }
@@ -339,7 +378,7 @@ int elv_register_queue(struct request_qu
 void elv_unregister_queue(struct request_queue *q)
 {
 	if (q) {
-		elevator_t * e = &q->elevator;
+		elevator_t *e = q->elevator;
 		kobject_unregister(&e->kobj);
 		kobject_put(&q->kobj);
 	}
diff -puN drivers/block/ll_rw_blk.c~elv-select drivers/block/ll_rw_blk.c
--- 25/drivers/block/ll_rw_blk.c~elv-select	2003-10-17 19:07:35.000000000 -0700
+++ 25-akpm/drivers/block/ll_rw_blk.c	2003-10-17 19:07:35.000000000 -0700
@@ -1259,6 +1259,45 @@ out:
 EXPORT_SYMBOL(blk_run_queues);
 
 /**
+ * blk_wait_free_list
+ * @q: the request queue to wait on
+ *
+ * Description:
+ *   Synchronously wait until all requests have been emptied out of the queue.
+ *   Must be called with the queue marked QUEUE_FLAG_DEAD, or after
+ *   calling blk_set_queue_drain().
+ **/
+static void blk_wait_free_list(request_queue_t *q)
+{
+	DEFINE_WAIT(wait);
+	struct request_list *rl = &q->rq;
+
+	if (!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)
+		&& !blk_queue_drain(q)) {
+		WARN_ON(1);
+		/* It might be racy to set this here. Caller should be fixed */
+		set_bit(QUEUE_FLAG_DRAIN, &q->queue_flags);
+	}
+
+	prepare_to_wait(&rl->empty, &wait, TASK_UNINTERRUPTIBLE);
+
+	if (rl->count[READ] || rl->count[WRITE]
+		|| waitqueue_active(&rl->wait[READ])
+		|| waitqueue_active(&rl->wait[WRITE])) {
+
+		spin_unlock_irq(q->queue_lock);
+		wake_up_all(&q->rq.wait[READ]);
+		wake_up_all(&q->rq.wait[WRITE]);
+		io_schedule();
+		spin_lock_irq(q->queue_lock);
+	}
+
+	finish_wait(&rl->empty, &wait);
+
+	WARN_ON(rl->count[READ] || rl->count[WRITE]);
+}
+
+/**
  * blk_cleanup_queue: - release a &request_queue_t when it is no longer needed
  * @q:    the request queue to be released
  *
@@ -1280,7 +1319,8 @@ void blk_cleanup_queue(request_queue_t *
 	if (!atomic_dec_and_test(&q->refcnt))
 		return;
 
-	elevator_exit(q);
+	if (q->elevator)
+		elevator_exit(q);
 
 	del_timer_sync(&q->unplug_timer);
 	kblockd_flush();
@@ -1303,8 +1343,10 @@ static int blk_init_free_list(request_qu
 	rl->count[READ] = rl->count[WRITE] = 0;
 	init_waitqueue_head(&rl->wait[READ]);
 	init_waitqueue_head(&rl->wait[WRITE]);
+	init_waitqueue_head(&rl->empty);
 
-	rl->rq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, request_cachep);
+	rl->rq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab,
+					mempool_free_slab, request_cachep);
 
 	if (!rl->rq_pool)
 		return -ENOMEM;
@@ -1328,30 +1370,38 @@ static elevator_t *chosen_elevator =
 #error "You must have at least 1 I/O scheduler selected"
 #endif
 
-#if defined(CONFIG_IOSCHED_AS) || defined(CONFIG_IOSCHED_DEADLINE) || defined (CONFIG_IOSCHED_NOOP)
-static int __init elevator_setup(char *str)
+elevator_t *str_to_elv(const char *str)
 {
 #ifdef CONFIG_IOSCHED_DEADLINE
-	if (!strcmp(str, "deadline"))
-		chosen_elevator = &iosched_deadline;
+	if (!strncmp(str, "deadline", strlen("deadline")))
+		return &iosched_deadline;
 #endif
 #ifdef CONFIG_IOSCHED_AS
-	if (!strcmp(str, "as"))
-		chosen_elevator = &iosched_as;
+	if (!strncmp(str, "as", strlen("as")))
+		return &iosched_as;
 #endif
 #ifdef CONFIG_IOSCHED_CFQ
-	if (!strcmp(str, "cfq"))
-		chosen_elevator = &iosched_cfq;
+	if (!strncmp(str, "cfq", strlen("cfq")))
+		return &iosched_cfq;
 #endif
 #ifdef CONFIG_IOSCHED_NOOP
-	if (!strcmp(str, "noop"))
-		chosen_elevator = &elevator_noop;
+	if (!strncmp(str, "noop", strlen("noop")))
+		return &elevator_noop;
 #endif
+
+	return NULL;
+}
+
+static int __init elevator_setup(char *str)
+{
+	elevator_t *e = str_to_elv(str);
+	if (e != NULL)
+		chosen_elevator = e;
+
 	return 1;
 }
 
 __setup("elevator=", elevator_setup);
-#endif /* CONFIG_IOSCHED_AS || CONFIG_IOSCHED_DEADLINE || CONFIG_IOSCHED_NOOP */
 
 request_queue_t *blk_alloc_queue(int gfp_mask)
 {
@@ -1402,7 +1452,7 @@ EXPORT_SYMBOL(blk_alloc_queue);
 request_queue_t *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
 {
 	request_queue_t *q;
-	static int printed;
+	elevator_t *e;
 
 	q = blk_alloc_queue(GFP_KERNEL);
 	if (!q)
@@ -1411,14 +1461,12 @@ request_queue_t *blk_init_queue(request_
 	if (blk_init_free_list(q))
 		goto out_init;
 
-	if (!printed) {
-		printed = 1;
-		printk("Using %s io scheduler\n", chosen_elevator->elevator_name);
-	}
-
-	if (elevator_init(q, chosen_elevator))
+	e = elevator_alloc(chosen_elevator);
+	if (!e)
 		goto out_elv;
 
+	elevator_init(q, e);
+
 	q->request_fn		= rfn;
 	q->back_merge_fn       	= ll_back_merge_fn;
 	q->front_merge_fn      	= ll_front_merge_fn;
@@ -1530,6 +1578,10 @@ static void freed_request(request_queue_
 		if (!waitqueue_active(&rl->wait[rw]))
 			blk_clear_queue_full(q, rw);
 	}
+	if (unlikely(waitqueue_active(&rl->empty))) {
+		if (rl->count[READ] == 0 && rl->count[WRITE] == 0)
+			wake_up_all(&rl->empty);
+	}
 }
 
 #define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist)
@@ -1543,6 +1595,11 @@ static struct request *get_request(reque
 	struct io_context *ioc = get_io_context(gfp_mask);
 
 	spin_lock_irq(q->queue_lock);
+	if (blk_queue_drain(q)) {
+		spin_unlock_irq(q->queue_lock);
+		goto out;
+	}
+
 	if (rl->count[rw]+1 >= q->nr_requests) {
 		/*
 		 * The queue will fill after this allocation, so set it as
@@ -2796,6 +2853,79 @@ queue_var_store(unsigned long *var, cons
 	return count;
 }
 
+static ssize_t queue_elevator_show(struct request_queue *q, char *page)
+{
+	return sprintf(page, "%s\n", q->elevator->elevator_name);
+}
+
+static ssize_t
+queue_elevator_store(struct request_queue *q, const char *page, size_t count)
+{
+	elevator_t *type, *elv;
+	unsigned long flags;
+	static DECLARE_MUTEX(switch_mutex);
+
+	down(&switch_mutex);
+
+	type = str_to_elv(page);
+	if (type == NULL) {
+		goto out;
+	}
+
+	elv = elevator_alloc(type);
+	if (!elv) {
+		goto out;
+	}
+
+	spin_lock_irqsave(q->queue_lock, flags);
+
+	/* Wait for the request list to empty */
+	blk_set_queue_drain(q);
+	blk_wait_free_list(q);
+
+	/* Stop old elevator */
+	elevator_exit(q);
+
+	/* Unlock here should be OK. The elevator should not be entered because
+	 * the queue is drained, and blocked... */
+	spin_unlock_irqrestore(q->queue_lock, flags);
+	elv_unregister_queue(q);
+	spin_lock_irqsave(q->queue_lock, flags);
+
+	/* Start new one */
+	elevator_init(q, elv);
+	printk(KERN_INFO "elevator_init %s\n", q->elevator->elevator_name);
+
+	spin_unlock_irqrestore(q->queue_lock, flags);
+	if (elv_register_queue(q)) {
+		/*
+		 * Can't do much about it now... failure should not cause the
+		 * device to stop working or future elevator selection to stop
+		 * working though.
+		 */
+		printk(KERN_INFO "elv_register_queue failed\n");
+		WARN_ON(1);
+	}
+	spin_lock_irqsave(q->queue_lock, flags);
+
+	/* Unblock the request list and wake waiters */
+	blk_clear_queue_drain(q);
+	wake_up_all(&q->rq.wait[READ]);
+	wake_up_all(&q->rq.wait[WRITE]);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	blk_run_queue(q);
+out:
+	up(&switch_mutex);
+	return count;
+}
+
+static struct queue_sysfs_entry queue_elevator_entry = {
+	.attr = {.name = "io_scheduler", .mode = S_IRUGO | S_IWUSR },
+	.show = queue_elevator_show,
+	.store = queue_elevator_store,
+};
+
 static ssize_t queue_requests_show(struct request_queue *q, char *page)
 {
 	return queue_var_show(q->nr_requests, (page));
@@ -2844,6 +2974,7 @@ static struct queue_sysfs_entry queue_re
 
 static struct attribute *default_attrs[] = {
 	&queue_requests_entry.attr,
+	&queue_elevator_entry.attr,
 	NULL,
 };
 
@@ -2900,16 +3031,19 @@ int blk_register_queue(struct gendisk *d
 		return -EBUSY;
 
 	snprintf(q->kobj.name, KOBJ_NAME_LEN, "%s", "queue");
-	q->kobj.ktype = &queue_ktype;
+	if (q->elevator)
+		q->kobj.ktype = &queue_ktype;
 
 	ret = kobject_register(&q->kobj);
 	if (ret < 0)
 		return ret;
 
-	ret = elv_register_queue(q);
-	if (ret) {
-		kobject_unregister(&q->kobj);
-		return ret;
+	if (q->elevator) {
+		ret = elv_register_queue(q);
+		if (ret) {
+			kobject_unregister(&q->kobj);
+			return ret;
+		}
 	}
 
 	return 0;
@@ -2920,7 +3054,8 @@ void blk_unregister_queue(struct gendisk
 	request_queue_t *q = disk->queue;
 
 	if (q) {
-		elv_unregister_queue(q);
+		if (q->elevator)
+			elv_unregister_queue(q);
 
 		kobject_unregister(&q->kobj);
 		kobject_put(&disk->kobj);
diff -puN include/linux/blkdev.h~elv-select include/linux/blkdev.h
--- 25/include/linux/blkdev.h~elv-select	2003-10-17 19:07:35.000000000 -0700
+++ 25-akpm/include/linux/blkdev.h	2003-10-17 19:07:35.000000000 -0700
@@ -80,6 +80,7 @@ struct request_list {
 	int count[2];
 	mempool_t *rq_pool;
 	wait_queue_head_t wait[2];
+	wait_queue_head_t empty;
 };
 
 /*
@@ -272,7 +273,7 @@ struct request_queue
 	 */
 	struct list_head	queue_head;
 	struct request		*last_merge;
-	elevator_t		elevator;
+	elevator_t		*elevator;
 
 	/*
 	 * the queue request freelist, one for reads and one for writes
@@ -366,7 +367,8 @@ struct request_queue
 #define QUEUE_FLAG_STOPPED	2	/* queue is stopped */
 #define	QUEUE_FLAG_READFULL	3	/* write queue has been filled */
 #define QUEUE_FLAG_WRITEFULL	4	/* read queue has been filled */
-#define QUEUE_FLAG_DEAD		5	/* queue being torn down */
+#define QUEUE_FLAG_DRAIN	5	/* queue being drained */
+#define QUEUE_FLAG_DEAD		6	/* queue being torn down */
 
 #define blk_queue_plugged(q)	!list_empty(&(q)->plug_list)
 #define blk_queue_tagged(q)	test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
@@ -406,6 +408,21 @@ static inline void blk_clear_queue_full(
 		clear_bit(QUEUE_FLAG_WRITEFULL, &q->queue_flags);
 }
 
+static inline int blk_queue_drain(struct request_queue *q)
+{
+	return test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags);
+}
+
+static inline void blk_set_queue_drain(struct request_queue *q)
+{
+	set_bit(QUEUE_FLAG_DRAIN, &q->queue_flags);
+}
+
+static inline void blk_clear_queue_drain(struct request_queue *q)
+{
+	clear_bit(QUEUE_FLAG_DRAIN, &q->queue_flags);
+}
+
 
 /*
  * mergeable request must not have _NOMERGE or _BARRIER bit set, nor may
diff -puN include/linux/elevator.h~elv-select include/linux/elevator.h
--- 25/include/linux/elevator.h~elv-select	2003-10-17 19:07:35.000000000 -0700
+++ 25-akpm/include/linux/elevator.h	2003-10-17 19:07:35.000000000 -0700
@@ -21,8 +21,10 @@ typedef int (elevator_may_queue_fn) (req
 typedef int (elevator_set_req_fn) (request_queue_t *, struct request *, int);
 typedef void (elevator_put_req_fn) (request_queue_t *, struct request *);
 
-typedef int (elevator_init_fn) (request_queue_t *, elevator_t *);
+typedef void (elevator_init_fn) (request_queue_t *, elevator_t *);
 typedef void (elevator_exit_fn) (request_queue_t *, elevator_t *);
+typedef int (elevator_alloc_fn) (elevator_t *);
+typedef void (elevator_release_fn) (elevator_t *);
 
 struct elevator_s
 {
@@ -48,6 +50,8 @@ struct elevator_s
 
 	elevator_init_fn *elevator_init_fn;
 	elevator_exit_fn *elevator_exit_fn;
+	elevator_alloc_fn *elevator_alloc_fn;
+	elevator_release_fn *elevator_release_fn;
 
 	void *elevator_data;
 
@@ -99,8 +103,10 @@ extern elevator_t iosched_as;
  */
 extern elevator_t iosched_cfq;
 
-extern int elevator_init(request_queue_t *, elevator_t *);
+extern void elevator_init(request_queue_t *, elevator_t *);
 extern void elevator_exit(request_queue_t *);
+extern elevator_t *elevator_alloc(elevator_t *);
+extern void elevator_release(struct kobject *);
 extern inline int elv_rq_merge_ok(struct request *, struct bio *);
 extern inline int elv_try_merge(struct request *, struct bio *);
 extern inline int elv_try_last_merge(request_queue_t *, struct bio *);

_