patch-2.4.15 linux/drivers/md/multipath.c
- Lines: 633
- Date: Mon Nov 12 09:51:56 2001
- Orig file: v2.4.14/linux/drivers/md/multipath.c
- Orig date: Tue Oct 9 17:06:51 2001
diff -u --recursive --new-file v2.4.14/linux/drivers/md/multipath.c linux/drivers/md/multipath.c
@@ -7,10 +7,7 @@
*
* MULTIPATH management functions.
*
- * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
- *
- * Fixes to reconstruction by Jakob Østergaard" <jakob@ostenfeld.dk>
- * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
+ * derived from raid1.c.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -33,6 +30,9 @@
#define MAX_WORK_PER_DISK 128
+#define NR_RESERVED_BUFS 32
+
+
/*
* The following can be used to debug the driver
*/
@@ -53,147 +53,55 @@
static int multipath_diskop(mddev_t *mddev, mdp_disk_t **d, int state);
-struct buffer_head *multipath_alloc_bh(multipath_conf_t *conf, int cnt)
-{
- /* return a linked list of "cnt" struct buffer_heads.
- * don't take any off the free list unless we know we can
- * get all we need, otherwise we could deadlock
- */
- struct buffer_head *bh=NULL;
-
- while(cnt) {
- struct buffer_head *t;
- md_spin_lock_irq(&conf->device_lock);
- if (conf->freebh_cnt >= cnt)
- while (cnt) {
- t = conf->freebh;
- conf->freebh = t->b_next;
- t->b_next = bh;
- bh = t;
- t->b_state = 0;
- conf->freebh_cnt--;
- cnt--;
- }
- md_spin_unlock_irq(&conf->device_lock);
- if (cnt == 0)
- break;
- t = (struct buffer_head *)kmalloc(sizeof(struct buffer_head), GFP_NOIO);
- if (t) {
- memset(t, 0, sizeof(*t));
- t->b_next = bh;
- bh = t;
- cnt--;
- } else {
- PRINTK("waiting for %d bh\n", cnt);
- wait_event(conf->wait_buffer, conf->freebh_cnt >= cnt);
- }
- }
- return bh;
-}
-
-static inline void multipath_free_bh(multipath_conf_t *conf, struct buffer_head *bh)
-{
- unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
- while (bh) {
- struct buffer_head *t = bh;
- bh=bh->b_next;
- if (t->b_pprev == NULL)
- kfree(t);
- else {
- t->b_next= conf->freebh;
- conf->freebh = t;
- conf->freebh_cnt++;
- }
- }
- spin_unlock_irqrestore(&conf->device_lock, flags);
- wake_up(&conf->wait_buffer);
-}
-
-static int multipath_grow_bh(multipath_conf_t *conf, int cnt)
-{
- /* allocate cnt buffer_heads, possibly less if kalloc fails */
- int i = 0;
-
- while (i < cnt) {
- struct buffer_head *bh;
- bh = kmalloc(sizeof(*bh), GFP_KERNEL);
- if (!bh) break;
- memset(bh, 0, sizeof(*bh));
-
- md_spin_lock_irq(&conf->device_lock);
- bh->b_pprev = &conf->freebh;
- bh->b_next = conf->freebh;
- conf->freebh = bh;
- conf->freebh_cnt++;
- md_spin_unlock_irq(&conf->device_lock);
-
- i++;
- }
- return i;
-}
-
-static int multipath_shrink_bh(multipath_conf_t *conf, int cnt)
-{
- /* discard cnt buffer_heads, if we can find them */
- int i = 0;
- md_spin_lock_irq(&conf->device_lock);
- while ((i < cnt) && conf->freebh) {
- struct buffer_head *bh = conf->freebh;
- conf->freebh = bh->b_next;
- kfree(bh);
- i++;
- conf->freebh_cnt--;
- }
- md_spin_unlock_irq(&conf->device_lock);
- return i;
-}
-
static struct multipath_bh *multipath_alloc_mpbh(multipath_conf_t *conf)
{
- struct multipath_bh *r1_bh = NULL;
+ struct multipath_bh *mp_bh = NULL;
do {
md_spin_lock_irq(&conf->device_lock);
- if (conf->freer1) {
- r1_bh = conf->freer1;
- conf->freer1 = r1_bh->next_r1;
- r1_bh->next_r1 = NULL;
- r1_bh->state = 0;
- r1_bh->bh_req.b_state = 0;
+ if (!conf->freer1_blocked && conf->freer1) {
+ mp_bh = conf->freer1;
+ conf->freer1 = mp_bh->next_mp;
+ conf->freer1_cnt--;
+ mp_bh->next_mp = NULL;
+ mp_bh->state = (1 << MPBH_PreAlloc);
+ mp_bh->bh_req.b_state = 0;
}
md_spin_unlock_irq(&conf->device_lock);
- if (r1_bh)
- return r1_bh;
- r1_bh = (struct multipath_bh *) kmalloc(sizeof(struct multipath_bh),
+ if (mp_bh)
+ return mp_bh;
+ mp_bh = (struct multipath_bh *) kmalloc(sizeof(struct multipath_bh),
GFP_NOIO);
- if (r1_bh) {
- memset(r1_bh, 0, sizeof(*r1_bh));
- return r1_bh;
- }
- wait_event(conf->wait_buffer, conf->freer1);
+ if (mp_bh) {
+ memset(mp_bh, 0, sizeof(*mp_bh));
+ return mp_bh;
+ }
+ conf->freer1_blocked = 1;
+ wait_disk_event(conf->wait_buffer,
+ !conf->freer1_blocked ||
+ conf->freer1_cnt > NR_RESERVED_BUFS/2
+ );
+ conf->freer1_blocked = 0;
} while (1);
}
-static inline void multipath_free_mpbh(struct multipath_bh *r1_bh)
+static inline void multipath_free_mpbh(struct multipath_bh *mp_bh)
{
- struct buffer_head *bh = r1_bh->multipath_bh_list;
- multipath_conf_t *conf = mddev_to_conf(r1_bh->mddev);
-
- r1_bh->multipath_bh_list = NULL;
+ multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev);
- if (test_bit(MPBH_PreAlloc, &r1_bh->state)) {
+ if (test_bit(MPBH_PreAlloc, &mp_bh->state)) {
unsigned long flags;
spin_lock_irqsave(&conf->device_lock, flags);
- r1_bh->next_r1 = conf->freer1;
- conf->freer1 = r1_bh;
+ mp_bh->next_mp = conf->freer1;
+ conf->freer1 = mp_bh;
+ conf->freer1_cnt++;
spin_unlock_irqrestore(&conf->device_lock, flags);
+ wake_up(&conf->wait_buffer);
} else {
- kfree(r1_bh);
+ kfree(mp_bh);
}
- multipath_free_bh(conf, bh);
}
static int multipath_grow_mpbh (multipath_conf_t *conf, int cnt)
@@ -201,18 +109,15 @@
int i = 0;
while (i < cnt) {
- struct multipath_bh *r1_bh;
- r1_bh = (struct multipath_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL);
- if (!r1_bh)
+ struct multipath_bh *mp_bh;
+ mp_bh = (struct multipath_bh*)kmalloc(sizeof(*mp_bh), GFP_KERNEL);
+ if (!mp_bh)
break;
- memset(r1_bh, 0, sizeof(*r1_bh));
-
- md_spin_lock_irq(&conf->device_lock);
- set_bit(MPBH_PreAlloc, &r1_bh->state);
- r1_bh->next_r1 = conf->freer1;
- conf->freer1 = r1_bh;
- md_spin_unlock_irq(&conf->device_lock);
+ memset(mp_bh, 0, sizeof(*mp_bh));
+ set_bit(MPBH_PreAlloc, &mp_bh->state);
+ mp_bh->mddev = conf->mddev;
+ multipath_free_mpbh(mp_bh);
i++;
}
return i;
@@ -222,29 +127,15 @@
{
md_spin_lock_irq(&conf->device_lock);
while (conf->freer1) {
- struct multipath_bh *r1_bh = conf->freer1;
- conf->freer1 = r1_bh->next_r1;
- kfree(r1_bh);
+ struct multipath_bh *mp_bh = conf->freer1;
+ conf->freer1 = mp_bh->next_mp;
+ conf->freer1_cnt--;
+ kfree(mp_bh);
}
md_spin_unlock_irq(&conf->device_lock);
}
-
-static inline void multipath_free_buf(struct multipath_bh *r1_bh)
-{
- unsigned long flags;
- struct buffer_head *bh = r1_bh->multipath_bh_list;
- multipath_conf_t *conf = mddev_to_conf(r1_bh->mddev);
- r1_bh->multipath_bh_list = NULL;
-
- spin_lock_irqsave(&conf->device_lock, flags);
- r1_bh->next_r1 = conf->freebuf;
- conf->freebuf = r1_bh;
- spin_unlock_irqrestore(&conf->device_lock, flags);
- multipath_free_bh(conf, bh);
-}
-
static int multipath_map (mddev_t *mddev, kdev_t *rdev)
{
multipath_conf_t *conf = mddev_to_conf(mddev);
@@ -266,77 +157,45 @@
return (-1);
}
-static void multipath_reschedule_retry (struct multipath_bh *r1_bh)
+static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
{
unsigned long flags;
- mddev_t *mddev = r1_bh->mddev;
+ mddev_t *mddev = mp_bh->mddev;
multipath_conf_t *conf = mddev_to_conf(mddev);
md_spin_lock_irqsave(&retry_list_lock, flags);
if (multipath_retry_list == NULL)
multipath_retry_tail = &multipath_retry_list;
- *multipath_retry_tail = r1_bh;
- multipath_retry_tail = &r1_bh->next_r1;
- r1_bh->next_r1 = NULL;
+ *multipath_retry_tail = mp_bh;
+ multipath_retry_tail = &mp_bh->next_mp;
+ mp_bh->next_mp = NULL;
md_spin_unlock_irqrestore(&retry_list_lock, flags);
md_wakeup_thread(conf->thread);
}
-static void inline io_request_done(unsigned long sector, multipath_conf_t *conf, int phase)
-{
- unsigned long flags;
- spin_lock_irqsave(&conf->segment_lock, flags);
- if (sector < conf->start_active)
- conf->cnt_done--;
- else if (sector >= conf->start_future && conf->phase == phase)
- conf->cnt_future--;
- else if (!--conf->cnt_pending)
- wake_up(&conf->wait_ready);
-
- spin_unlock_irqrestore(&conf->segment_lock, flags);
-}
-
-static void inline sync_request_done (unsigned long sector, multipath_conf_t *conf)
-{
- unsigned long flags;
- spin_lock_irqsave(&conf->segment_lock, flags);
- if (sector >= conf->start_ready)
- --conf->cnt_ready;
- else if (sector >= conf->start_active) {
- if (!--conf->cnt_active) {
- conf->start_active = conf->start_ready;
- wake_up(&conf->wait_done);
- }
- }
- spin_unlock_irqrestore(&conf->segment_lock, flags);
-}
-
/*
* multipath_end_bh_io() is called when we have finished servicing a multipathed
* operation and are ready to return a success/failure code to the buffer
* cache layer.
*/
-static void multipath_end_bh_io (struct multipath_bh *r1_bh, int uptodate)
+static void multipath_end_bh_io (struct multipath_bh *mp_bh, int uptodate)
{
- struct buffer_head *bh = r1_bh->master_bh;
-
- io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev),
- test_bit(MPBH_SyncPhase, &r1_bh->state));
+ struct buffer_head *bh = mp_bh->master_bh;
bh->b_end_io(bh, uptodate);
- multipath_free_mpbh(r1_bh);
+ multipath_free_mpbh(mp_bh);
}
void multipath_end_request (struct buffer_head *bh, int uptodate)
{
- struct multipath_bh * r1_bh = (struct multipath_bh *)(bh->b_private);
+ struct multipath_bh * mp_bh = (struct multipath_bh *)(bh->b_private);
/*
* this branch is our 'one multipath IO has finished' event handler:
*/
if (!uptodate)
- md_error (r1_bh->mddev, bh->b_dev);
+ md_error (mp_bh->mddev, bh->b_dev);
else
/*
* Set MPBH_Uptodate in our master buffer_head, so that
@@ -347,11 +206,11 @@
* user-side. So if something waits for IO, then it will
* wait for the 'master' buffer_head.
*/
- set_bit (MPBH_Uptodate, &r1_bh->state);
+ set_bit (MPBH_Uptodate, &mp_bh->state);
if (uptodate) {
- multipath_end_bh_io(r1_bh, uptodate);
+ multipath_end_bh_io(mp_bh, uptodate);
return;
}
/*
@@ -359,20 +218,13 @@
*/
printk(KERN_ERR "multipath: %s: rescheduling block %lu\n",
partition_name(bh->b_dev), bh->b_blocknr);
- multipath_reschedule_retry(r1_bh);
+ multipath_reschedule_retry(mp_bh);
return;
}
/*
* This routine returns the disk from which the requested read should
- * be done. It bookkeeps the last read position for every disk
- * in array and when new read requests come, the disk which last
- * position is nearest to the request, is chosen.
- *
- * TODO: now if there are 2 multipaths in the same 2 devices, performance
- * degrades dramatically because position is multipath, not device based.
- * This should be changed to be device based. Also atomic sequential
- * reads should be somehow balanced.
+ * be done.
*/
static int multipath_read_balance (multipath_conf_t *conf)
@@ -391,7 +243,7 @@
{
multipath_conf_t *conf = mddev_to_conf(mddev);
struct buffer_head *bh_req;
- struct multipath_bh * r1_bh;
+ struct multipath_bh * mp_bh;
struct multipath_info *multipath;
if (!buffer_locked(bh))
@@ -406,45 +258,25 @@
if (rw == READA)
rw = READ;
- r1_bh = multipath_alloc_mpbh (conf);
-
- spin_lock_irq(&conf->segment_lock);
- wait_event_lock_irq(conf->wait_done,
- bh->b_rsector < conf->start_active ||
- bh->b_rsector >= conf->start_future,
- conf->segment_lock);
- if (bh->b_rsector < conf->start_active)
- conf->cnt_done++;
- else {
- conf->cnt_future++;
- if (conf->phase)
- set_bit(MPBH_SyncPhase, &r1_bh->state);
- }
- spin_unlock_irq(&conf->segment_lock);
-
- /*
- * i think the read and write branch should be separated completely,
- * since we want to do read balancing on the read side for example.
- * Alternative implementations? :) --mingo
- */
+ mp_bh = multipath_alloc_mpbh (conf);
- r1_bh->master_bh = bh;
- r1_bh->mddev = mddev;
- r1_bh->cmd = rw;
+ mp_bh->master_bh = bh;
+ mp_bh->mddev = mddev;
+ mp_bh->cmd = rw;
/*
* read balancing logic:
*/
multipath = conf->multipaths + multipath_read_balance(conf);
- bh_req = &r1_bh->bh_req;
+ bh_req = &mp_bh->bh_req;
memcpy(bh_req, bh, sizeof(*bh));
bh_req->b_blocknr = bh->b_rsector;
bh_req->b_dev = multipath->dev;
bh_req->b_rdev = multipath->dev;
/* bh_req->b_rsector = bh->n_rsector; */
bh_req->b_end_io = multipath_end_request;
- bh_req->b_private = r1_bh;
+ bh_req->b_private = mp_bh;
generic_make_request (rw, bh_req);
return 0;
}
@@ -540,12 +372,10 @@
mdp_disk_t *spare;
mdp_super_t *sb = mddev->sb;
-// MD_BUG();
spare = get_spare(mddev);
if (spare) {
err = multipath_diskop(mddev, &spare, DISKOP_SPARE_WRITE);
printk("got DISKOP_SPARE_WRITE err: %d. (spare_faulty(): %d)\n", err, disk_faulty(spare));
-// MD_BUG();
}
if (!err && !disk_faulty(spare)) {
multipath_diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
@@ -553,7 +383,6 @@
mark_disk_active(spare);
sb->active_disks++;
sb->spare_disks--;
-// MD_BUG();
}
}
}
@@ -697,7 +526,6 @@
case DISKOP_SPARE_WRITE:
sdisk = conf->multipaths + spare_disk;
sdisk->operational = 1;
- sdisk->write_only = 1;
break;
/*
* Deactivate a spare disk:
@@ -705,7 +533,6 @@
case DISKOP_SPARE_INACTIVE:
sdisk = conf->multipaths + spare_disk;
sdisk->operational = 0;
- sdisk->write_only = 0;
break;
/*
* Activate (mark read-write) the (now sync) spare disk,
@@ -757,10 +584,6 @@
spare_rdev = find_rdev_nr(mddev, spare_desc->number);
failed_rdev = find_rdev_nr(mddev, failed_desc->number);
xchg_values(spare_rdev->desc_nr, failed_rdev->desc_nr);
-// if (failed_rdev->alias_device)
-// MD_BUG();
-// if (!spare_rdev->alias_device)
-// MD_BUG();
spare_rdev->alias_device = 0;
failed_rdev->alias_device = 1;
@@ -788,7 +611,6 @@
* this really activates the spare.
*/
fdisk->spare = 0;
- fdisk->write_only = 0;
/*
* if we activate a spare, we definitely replace a
@@ -828,10 +650,8 @@
adisk->dev = MKDEV(added_desc->major,added_desc->minor);
adisk->operational = 0;
- adisk->write_only = 0;
adisk->spare = 1;
adisk->used_slot = 1;
- adisk->head_position = 0;
conf->nr_disks++;
break;
@@ -865,7 +685,7 @@
static void multipathd (void *data)
{
- struct multipath_bh *r1_bh;
+ struct multipath_bh *mp_bh;
struct buffer_head *bh;
unsigned long flags;
mddev_t *mddev;
@@ -874,31 +694,31 @@
for (;;) {
md_spin_lock_irqsave(&retry_list_lock, flags);
- r1_bh = multipath_retry_list;
- if (!r1_bh)
+ mp_bh = multipath_retry_list;
+ if (!mp_bh)
break;
- multipath_retry_list = r1_bh->next_r1;
+ multipath_retry_list = mp_bh->next_mp;
md_spin_unlock_irqrestore(&retry_list_lock, flags);
- mddev = r1_bh->mddev;
+ mddev = mp_bh->mddev;
if (mddev->sb_dirty) {
printk(KERN_INFO "dirty sb detected, updating.\n");
mddev->sb_dirty = 0;
md_update_sb(mddev);
}
- bh = &r1_bh->bh_req;
+ bh = &mp_bh->bh_req;
dev = bh->b_dev;
multipath_map (mddev, &bh->b_dev);
if (bh->b_dev == dev) {
printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
- multipath_end_bh_io(r1_bh, 0);
+ multipath_end_bh_io(mp_bh, 0);
} else {
printk (REDIRECT_SECTOR,
partition_name(bh->b_dev), bh->b_blocknr);
bh->b_rdev = bh->b_dev;
bh->b_rsector = bh->b_blocknr;
- generic_make_request (r1_bh->cmd, bh);
+ generic_make_request (mp_bh->cmd, bh);
}
}
md_spin_unlock_irqrestore(&retry_list_lock, flags);
@@ -1016,7 +836,7 @@
mdp_disk_t *desc, *desc2;
mdk_rdev_t *rdev, *def_rdev = NULL;
struct md_list_head *tmp;
- int start_recovery = 0, num_rdevs = 0;
+ int num_rdevs = 0;
MOD_INC_USE_COUNT;
@@ -1072,12 +892,9 @@
disk->number = desc->number;
disk->raid_disk = desc->raid_disk;
disk->dev = rdev->dev;
- disk->sect_limit = MAX_WORK_PER_DISK;
disk->operational = 0;
- disk->write_only = 0;
disk->spare = 1;
disk->used_slot = 1;
- disk->head_position = 0;
mark_disk_sync(desc);
if (disk_active(desc)) {
@@ -1135,10 +952,7 @@
conf->mddev = mddev;
conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
- conf->segment_lock = MD_SPIN_LOCK_UNLOCKED;
init_waitqueue_head(&conf->wait_buffer);
- init_waitqueue_head(&conf->wait_done);
- init_waitqueue_head(&conf->wait_ready);
if (!conf->working_disks) {
printk(NONE_OPERATIONAL, mdidx(mddev));
@@ -1150,17 +964,17 @@
* As a minimum, 1 mpbh and raid_disks buffer_heads
* would probably get us by in tight memory situations,
* but a few more is probably a good idea.
- * For now, try 16 mpbh and 16*raid_disks bufferheads
- * This will allow at least 16 concurrent reads or writes
- * even if kmalloc starts failing
+ * For now, try NR_RESERVED_BUFS mpbh and
+ * NR_RESERVED_BUFS*raid_disks bufferheads
+ * This will allow at least NR_RESERVED_BUFS concurrent
+ * reads or writes even if kmalloc starts failing
*/
- if (multipath_grow_mpbh(conf, 16) < 16 ||
- multipath_grow_bh(conf, 16*conf->raid_disks)< 16*conf->raid_disks) {
+ if (multipath_grow_mpbh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS) {
printk(MEM_ERROR, mdidx(mddev));
goto out_free_conf;
}
- if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN))) {
+ if ((sb->state & (1 << MD_SB_CLEAN))) {
/*
* we do sanity checks even if the device says
* it's clean ...
@@ -1202,7 +1016,6 @@
out_free_conf:
multipath_shrink_mpbh(conf);
- multipath_shrink_bh(conf, conf->freebh_cnt);
kfree(conf);
mddev->private = NULL;
out:
@@ -1228,7 +1041,6 @@
md_unregister_thread(conf->thread);
multipath_shrink_mpbh(conf);
- multipath_shrink_bh(conf, conf->freebh_cnt);
kfree(conf);
mddev->private = NULL;
MOD_DEC_USE_COUNT;
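
The substantive change in this patch is the removal of the per-request buffer_head pool (multipath_alloc_bh and friends) in favour of a single reserved pool of NR_RESERVED_BUFS pre-allocated multipath_bh structures: allocation first takes from the free list, falls back to kmalloc, and only sleeps when both fail, waking once the pool has refilled past half of the reserve. What follows is a minimal user-space sketch of that pattern, not the kernel code itself; the names mp_buf, mp_alloc, mp_free and mp_grow are invented for illustration, pthread locking stands in for the md spinlock and wait queue, and the patch's freer1_blocked hand-off flag is omitted.

	#include <pthread.h>
	#include <stdlib.h>
	#include <string.h>

	#define NR_RESERVED_BUFS 32

	struct mp_buf {
		struct mp_buf *next;
		int prealloc;			/* came from the reserved pool */
	};

	static struct mp_buf *free_list;
	static int free_cnt;
	static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t pool_refilled = PTHREAD_COND_INITIALIZER;

	static struct mp_buf *mp_alloc(void)
	{
		struct mp_buf *b = NULL;

		for (;;) {
			pthread_mutex_lock(&pool_lock);
			if (free_list) {		/* fast path: reserved pool */
				b = free_list;
				free_list = b->next;
				free_cnt--;
			}
			pthread_mutex_unlock(&pool_lock);
			if (b)
				return b;

			b = malloc(sizeof(*b));		/* fall back to the heap */
			if (b) {
				memset(b, 0, sizeof(*b));
				return b;
			}

			/* Neither worked: wait until the pool refills past half. */
			pthread_mutex_lock(&pool_lock);
			while (free_cnt <= NR_RESERVED_BUFS / 2)
				pthread_cond_wait(&pool_refilled, &pool_lock);
			pthread_mutex_unlock(&pool_lock);
		}
	}

	static void mp_free(struct mp_buf *b)
	{
		if (!b->prealloc) {		/* heap-allocated: just free it */
			free(b);
			return;
		}
		pthread_mutex_lock(&pool_lock);
		b->next = free_list;		/* return it to the reserved pool */
		free_list = b;
		free_cnt++;
		pthread_mutex_unlock(&pool_lock);
		pthread_cond_broadcast(&pool_refilled);
	}

	static int mp_grow(int cnt)
	{
		int i;

		for (i = 0; i < cnt; i++) {	/* mirrors multipath_grow_mpbh() */
			struct mp_buf *b = malloc(sizeof(*b));
			if (!b)
				break;
			memset(b, 0, sizeof(*b));
			b->prealloc = 1;
			mp_free(b);		/* lands on the reserved pool */
		}
		return i;
	}

Because pre-allocated buffers are flagged (MPBH_PreAlloc in the patch) and returned to the pool on free, the driver can keep at least NR_RESERVED_BUFS requests in flight even when kmalloc starts failing, which is the guarantee the comment near multipath_grow_mpbh() spells out.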