patch-2.4.10 linux/drivers/md/multipath.c
- Lines: 1262
- Date: Fri Sep 14 14:22:18 2001
- Orig file: v2.4.9/linux/drivers/md/multipath.c
- Orig date: Wed Dec 31 16:00:00 1969
diff -u --recursive --new-file v2.4.9/linux/drivers/md/multipath.c linux/drivers/md/multipath.c
@@ -0,0 +1,1261 @@
+/*
+ * multipath.c : Multiple Devices driver for Linux
+ *
+ * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
+ *
+ * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
+ *
+ * MULTIPATH management functions.
+ *
+ * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
+ *
+ * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
+ * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/raid/multipath.h>
+#include <asm/atomic.h>
+
+#define MAJOR_NR MD_MAJOR
+#define MD_DRIVER
+#define MD_PERSONALITY
+
+#define MAX_WORK_PER_DISK 128
+
+/*
+ * The following can be used to debug the driver
+ */
+#define MULTIPATH_DEBUG 0
+
+#if MULTIPATH_DEBUG
+#define PRINTK(x...) printk(x)
+#define inline
+#define __inline__
+#else
+#define PRINTK(x...) do { } while (0)
+#endif
+
+
+static mdk_personality_t multipath_personality;
+static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
+struct multipath_bh *multipath_retry_list = NULL, **multipath_retry_tail;
+
+static int multipath_diskop(mddev_t *mddev, mdp_disk_t **d, int state);
+
+struct buffer_head *multipath_alloc_bh(multipath_conf_t *conf, int cnt)
+{
+ /* return a linked list of "cnt" struct buffer_heads.
+ * don't take any off the free list unless we know we can
+ * get all we need, otherwise we could deadlock
+ */
+ struct buffer_head *bh=NULL;
+
+ while(cnt) {
+ struct buffer_head *t;
+ md_spin_lock_irq(&conf->device_lock);
+ if (conf->freebh_cnt >= cnt)
+ while (cnt) {
+ t = conf->freebh;
+ conf->freebh = t->b_next;
+ t->b_next = bh;
+ bh = t;
+ t->b_state = 0;
+ conf->freebh_cnt--;
+ cnt--;
+ }
+ md_spin_unlock_irq(&conf->device_lock);
+ if (cnt == 0)
+ break;
+ t = (struct buffer_head *)kmalloc(sizeof(struct buffer_head), GFP_NOIO);
+ if (t) {
+ memset(t, 0, sizeof(*t));
+ t->b_next = bh;
+ bh = t;
+ cnt--;
+ } else {
+ PRINTK("waiting for %d bh\n", cnt);
+ wait_event(conf->wait_buffer, conf->freebh_cnt >= cnt);
+ }
+ }
+ return bh;
+}
+
+static inline void multipath_free_bh(multipath_conf_t *conf, struct buffer_head *bh)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
+ while (bh) {
+ struct buffer_head *t = bh;
+ bh=bh->b_next;
+ if (t->b_pprev == NULL)
+ kfree(t);
+ else {
+ t->b_next= conf->freebh;
+ conf->freebh = t;
+ conf->freebh_cnt++;
+ }
+ }
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ wake_up(&conf->wait_buffer);
+}
+
+static int multipath_grow_bh(multipath_conf_t *conf, int cnt)
+{
+ /* allocate cnt buffer_heads, possibly fewer if kmalloc fails */
+ int i = 0;
+
+ while (i < cnt) {
+ struct buffer_head *bh;
+ bh = kmalloc(sizeof(*bh), GFP_KERNEL);
+ if (!bh) break;
+ memset(bh, 0, sizeof(*bh));
+
+ md_spin_lock_irq(&conf->device_lock);
+ bh->b_pprev = &conf->freebh;
+ bh->b_next = conf->freebh;
+ conf->freebh = bh;
+ conf->freebh_cnt++;
+ md_spin_unlock_irq(&conf->device_lock);
+
+ i++;
+ }
+ return i;
+}
+
+static int multipath_shrink_bh(multipath_conf_t *conf, int cnt)
+{
+ /* discard cnt buffer_heads, if we can find them */
+ int i = 0;
+
+ md_spin_lock_irq(&conf->device_lock);
+ while ((i < cnt) && conf->freebh) {
+ struct buffer_head *bh = conf->freebh;
+ conf->freebh = bh->b_next;
+ kfree(bh);
+ i++;
+ conf->freebh_cnt--;
+ }
+ md_spin_unlock_irq(&conf->device_lock);
+ return i;
+}
+
+
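+/*
+ * Grab a multipath_bh from the pre-allocated pool if one is available,
+ * otherwise kmalloc a fresh one; if that fails too, sleep until the
+ * pool is replenished by multipath_free_mpbh().
+ */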
+static struct multipath_bh *multipath_alloc_mpbh(multipath_conf_t *conf)
+{
+ struct multipath_bh *r1_bh = NULL;
+
+ do {
+ md_spin_lock_irq(&conf->device_lock);
+ if (conf->freer1) {
+ r1_bh = conf->freer1;
+ conf->freer1 = r1_bh->next_r1;
+ r1_bh->next_r1 = NULL;
+ r1_bh->state = 0;
+ r1_bh->bh_req.b_state = 0;
+ }
+ md_spin_unlock_irq(&conf->device_lock);
+ if (r1_bh)
+ return r1_bh;
+ r1_bh = (struct multipath_bh *) kmalloc(sizeof(struct multipath_bh),
+ GFP_NOIO);
+ if (r1_bh) {
+ memset(r1_bh, 0, sizeof(*r1_bh));
+ return r1_bh;
+ }
+ wait_event(conf->wait_buffer, conf->freer1);
+ } while (1);
+}
+
+static inline void multipath_free_mpbh(struct multipath_bh *r1_bh)
+{
+ struct buffer_head *bh = r1_bh->multipath_bh_list;
+ multipath_conf_t *conf = mddev_to_conf(r1_bh->mddev);
+
+ r1_bh->multipath_bh_list = NULL;
+
+ if (test_bit(MPBH_PreAlloc, &r1_bh->state)) {
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
+ r1_bh->next_r1 = conf->freer1;
+ conf->freer1 = r1_bh;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ } else {
+ kfree(r1_bh);
+ }
+ multipath_free_bh(conf, bh);
+}
+
+static int multipath_grow_mpbh (multipath_conf_t *conf, int cnt)
+{
+ int i = 0;
+
+ while (i < cnt) {
+ struct multipath_bh *r1_bh;
+ r1_bh = (struct multipath_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL);
+ if (!r1_bh)
+ break;
+ memset(r1_bh, 0, sizeof(*r1_bh));
+
+ md_spin_lock_irq(&conf->device_lock);
+ set_bit(MPBH_PreAlloc, &r1_bh->state);
+ r1_bh->next_r1 = conf->freer1;
+ conf->freer1 = r1_bh;
+ md_spin_unlock_irq(&conf->device_lock);
+
+ i++;
+ }
+ return i;
+}
+
+static void multipath_shrink_mpbh(multipath_conf_t *conf)
+{
+ md_spin_lock_irq(&conf->device_lock);
+ while (conf->freer1) {
+ struct multipath_bh *r1_bh = conf->freer1;
+ conf->freer1 = r1_bh->next_r1;
+ kfree(r1_bh);
+ }
+ md_spin_unlock_irq(&conf->device_lock);
+}
+
+
+
+static inline void multipath_free_buf(struct multipath_bh *r1_bh)
+{
+ unsigned long flags;
+ struct buffer_head *bh = r1_bh->multipath_bh_list;
+ multipath_conf_t *conf = mddev_to_conf(r1_bh->mddev);
+ r1_bh->multipath_bh_list = NULL;
+
+ spin_lock_irqsave(&conf->device_lock, flags);
+ r1_bh->next_r1 = conf->freebuf;
+ conf->freebuf = r1_bh;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ multipath_free_bh(conf, bh);
+}
+
+static int multipath_map (mddev_t *mddev, kdev_t *rdev)
+{
+ multipath_conf_t *conf = mddev_to_conf(mddev);
+ int i, disks = MD_SB_DISKS;
+
+ /*
+ * Later we will do read balancing on the read side;
+ * for now we use the first operational disk.
+ */
+
+ for (i = 0; i < disks; i++) {
+ if (conf->multipaths[i].operational) {
+ *rdev = conf->multipaths[i].dev;
+ return (0);
+ }
+ }
+
+ printk (KERN_ERR "multipath_map(): no more operational IO paths?\n");
+ return (-1);
+}
+
+static void multipath_reschedule_retry (struct multipath_bh *r1_bh)
+{
+ unsigned long flags;
+ mddev_t *mddev = r1_bh->mddev;
+ multipath_conf_t *conf = mddev_to_conf(mddev);
+
+ md_spin_lock_irqsave(&retry_list_lock, flags);
+ if (multipath_retry_list == NULL)
+ multipath_retry_tail = &multipath_retry_list;
+ *multipath_retry_tail = r1_bh;
+ multipath_retry_tail = &r1_bh->next_r1;
+ r1_bh->next_r1 = NULL;
+ md_spin_unlock_irqrestore(&retry_list_lock, flags);
+ md_wakeup_thread(conf->thread);
+}
+
+
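+/*
+ * Retire a normal request from the resync-window accounting: requests
+ * below the active window count against cnt_done, requests beyond it
+ * (in the matching phase) against cnt_future, and anything else against
+ * cnt_pending, waking wait_ready once the pending count drains to zero.
+ */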
+static void inline io_request_done(unsigned long sector, multipath_conf_t *conf, int phase)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&conf->segment_lock, flags);
+ if (sector < conf->start_active)
+ conf->cnt_done--;
+ else if (sector >= conf->start_future && conf->phase == phase)
+ conf->cnt_future--;
+ else if (!--conf->cnt_pending)
+ wake_up(&conf->wait_ready);
+
+ spin_unlock_irqrestore(&conf->segment_lock, flags);
+}
+
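+/*
+ * Retire a resync request: requests at or above start_ready count
+ * against cnt_ready, those in the active segment against cnt_active;
+ * when the active segment drains, advance start_active and wake
+ * wait_done.
+ */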
+static void inline sync_request_done (unsigned long sector, multipath_conf_t *conf)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&conf->segment_lock, flags);
+ if (sector >= conf->start_ready)
+ --conf->cnt_ready;
+ else if (sector >= conf->start_active) {
+ if (!--conf->cnt_active) {
+ conf->start_active = conf->start_ready;
+ wake_up(&conf->wait_done);
+ }
+ }
+ spin_unlock_irqrestore(&conf->segment_lock, flags);
+}
+
+/*
+ * multipath_end_bh_io() is called when we have finished servicing a multipathed
+ * operation and are ready to return a success/failure code to the buffer
+ * cache layer.
+ */
+static void multipath_end_bh_io (struct multipath_bh *r1_bh, int uptodate)
+{
+ struct buffer_head *bh = r1_bh->master_bh;
+
+ io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev),
+ test_bit(MPBH_SyncPhase, &r1_bh->state));
+
+ bh->b_end_io(bh, uptodate);
+ multipath_free_mpbh(r1_bh);
+}
+
+void multipath_end_request (struct buffer_head *bh, int uptodate)
+{
+ struct multipath_bh * r1_bh = (struct multipath_bh *)(bh->b_private);
+
+ /*
+ * this branch is our 'one multipath IO has finished' event handler:
+ */
+ if (!uptodate)
+ md_error (r1_bh->mddev, bh->b_dev);
+ else
+ /*
+ * Set MPBH_Uptodate in our master buffer_head, so that
+ * we will return a good error code to the higher
+ * levels even if IO on some other multipathed buffer fails.
+ *
+ * The 'master' represents the complex operation to
+ * user-side. So if something waits for IO, then it will
+ * wait for the 'master' buffer_head.
+ */
+ set_bit (MPBH_Uptodate, &r1_bh->state);
+
+
+ if (uptodate) {
+ multipath_end_bh_io(r1_bh, uptodate);
+ return;
+ }
+ /*
+ * oops, IO error:
+ */
+ printk(KERN_ERR "multipath: %s: rescheduling block %lu\n",
+ partition_name(bh->b_dev), bh->b_blocknr);
+ multipath_reschedule_retry(r1_bh);
+ return;
+}
+
+/*
+ * This routine returns the disk from which the requested read should
+ * be done. It keeps track of the last read position for every disk
+ * in the array, and when a new read request comes in, the disk whose
+ * last position is nearest to the request is chosen. (The current
+ * implementation below simply returns the first operational path.)
+ *
+ * TODO: if there are two multipaths over the same two devices, performance
+ * degrades dramatically because the position is tracked per multipath,
+ * not per device. This should be changed to be device based. Also,
+ * atomic sequential reads should be somehow balanced.
+ */
+
+static int multipath_read_balance (multipath_conf_t *conf)
+{
+ int disk;
+
+ for (disk = 0; disk < conf->raid_disks; disk++)
+ if (conf->multipaths[disk].operational)
+ return disk;
+ BUG();
+ return 0;
+}
+
+static int multipath_make_request (mddev_t *mddev, int rw,
+ struct buffer_head * bh)
+{
+ multipath_conf_t *conf = mddev_to_conf(mddev);
+ struct buffer_head *bh_req;
+ struct multipath_bh * r1_bh;
+ struct multipath_info *multipath;
+
+ if (!buffer_locked(bh))
+ BUG();
+
+/*
+ * make_request() can abort the operation when READA is being
+ * used and no empty request is available.
+ *
+ * Currently, just replace the command with READ/WRITE.
+ */
+ if (rw == READA)
+ rw = READ;
+
+ r1_bh = multipath_alloc_mpbh (conf);
+
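+ /*
+ * Stall new requests while they fall inside the currently active
+ * resync window, then account them as 'done' (below the window) or
+ * 'future' (at or above start_future), tagging the latter with the
+ * current sync phase.
+ */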
+ spin_lock_irq(&conf->segment_lock);
+ wait_event_lock_irq(conf->wait_done,
+ bh->b_rsector < conf->start_active ||
+ bh->b_rsector >= conf->start_future,
+ conf->segment_lock);
+ if (bh->b_rsector < conf->start_active)
+ conf->cnt_done++;
+ else {
+ conf->cnt_future++;
+ if (conf->phase)
+ set_bit(MPBH_SyncPhase, &r1_bh->state);
+ }
+ spin_unlock_irq(&conf->segment_lock);
+
+ /*
+ * I think the read and write branch should be separated completely,
+ * since we want to do read balancing on the read side for example.
+ * Alternative implementations? :) --mingo
+ */
+
+ r1_bh->master_bh = bh;
+ r1_bh->mddev = mddev;
+ r1_bh->cmd = rw;
+
+ /*
+ * read balancing logic:
+ */
+ multipath = conf->multipaths + multipath_read_balance(conf);
+
+ bh_req = &r1_bh->bh_req;
+ memcpy(bh_req, bh, sizeof(*bh));
+ bh_req->b_blocknr = bh->b_rsector;
+ bh_req->b_dev = multipath->dev;
+ bh_req->b_rdev = multipath->dev;
+/* bh_req->b_rsector = bh->n_rsector; */
+ bh_req->b_end_io = multipath_end_request;
+ bh_req->b_private = r1_bh;
+ generic_make_request (rw, bh_req);
+ return 0;
+}
+
+static int multipath_status (char *page, mddev_t *mddev)
+{
+ multipath_conf_t *conf = mddev_to_conf(mddev);
+ int sz = 0, i;
+
+ sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks,
+ conf->working_disks);
+ for (i = 0; i < conf->raid_disks; i++)
+ sz += sprintf (page+sz, "%s",
+ conf->multipaths[i].operational ? "U" : "_");
+ sz += sprintf (page+sz, "]");
+ return sz;
+}
+
+#define LAST_DISK KERN_ALERT \
+"multipath: only one IO path left and IO error.\n"
+
+#define NO_SPARE_DISK KERN_ALERT \
+"multipath: no spare IO path left!\n"
+
+#define DISK_FAILED KERN_ALERT \
+"multipath: IO failure on %s, disabling IO path. \n" \
+" Operation continuing on %d IO paths.\n"
+
+static void mark_disk_bad (mddev_t *mddev, int failed)
+{
+ multipath_conf_t *conf = mddev_to_conf(mddev);
+ struct multipath_info *multipath = conf->multipaths+failed;
+ mdp_super_t *sb = mddev->sb;
+
+ multipath->operational = 0;
+ mark_disk_faulty(sb->disks+multipath->number);
+ mark_disk_nonsync(sb->disks+multipath->number);
+ mark_disk_inactive(sb->disks+multipath->number);
+ sb->active_disks--;
+ sb->working_disks--;
+ sb->failed_disks++;
+ mddev->sb_dirty = 1;
+ md_wakeup_thread(conf->thread);
+ conf->working_disks--;
+ printk (DISK_FAILED, partition_name (multipath->dev),
+ conf->working_disks);
+}
+
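+/*
+ * Handle an IO error on 'dev': if other operational or spare paths
+ * remain, mark the failing path bad and, if that was the last working
+ * path, try to bring a spare online via multipath_diskop(); if no
+ * other path exists at all, just report that we are down to the last
+ * path.
+ */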
+/*
+ * Careful, this can execute in IRQ contexts as well!
+ */
+static int multipath_error (mddev_t *mddev, kdev_t dev)
+{
+ multipath_conf_t *conf = mddev_to_conf(mddev);
+ struct multipath_info * multipaths = conf->multipaths;
+ int disks = MD_SB_DISKS;
+ int other_paths = 1;
+ int i;
+
+ if (conf->working_disks == 1) {
+ other_paths = 0;
+ for (i = 0; i < disks; i++) {
+ if (multipaths[i].spare) {
+ other_paths = 1;
+ break;
+ }
+ }
+ }
+
+ if (!other_paths) {
+ /*
+ * Uh oh, we can do nothing if this is our last path, but
+ * first check if this is a queued request for a device
+ * which has just failed.
+ */
+ for (i = 0; i < disks; i++) {
+ if (multipaths[i].dev==dev && !multipaths[i].operational)
+ return 0;
+ }
+ printk (LAST_DISK);
+ } else {
+ /*
+ * Mark disk as unusable
+ */
+ for (i = 0; i < disks; i++) {
+ if (multipaths[i].dev==dev && multipaths[i].operational) {
+ mark_disk_bad(mddev, i);
+ break;
+ }
+ }
+ if (!conf->working_disks) {
+ int err = 1;
+ mdp_disk_t *spare;
+ mdp_super_t *sb = mddev->sb;
+
+// MD_BUG();
+ spare = get_spare(mddev);
+ if (spare) {
+ err = multipath_diskop(mddev, &spare, DISKOP_SPARE_WRITE);
+ printk("got DISKOP_SPARE_WRITE err: %d. (spare_faulty(): %d)\n", err, disk_faulty(spare));
+// MD_BUG();
+ }
+ if (!err && !disk_faulty(spare)) {
+ multipath_diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
+ mark_disk_sync(spare);
+ mark_disk_active(spare);
+ sb->active_disks++;
+ sb->spare_disks--;
+// MD_BUG();
+ }
+ }
+ }
+ return 0;
+}
+
+#undef LAST_DISK
+#undef NO_SPARE_DISK
+#undef DISK_FAILED
+
+
+static void print_multipath_conf (multipath_conf_t *conf)
+{
+ int i;
+ struct multipath_info *tmp;
+
+ printk("MULTIPATH conf printout:\n");
+ if (!conf) {
+ printk("(conf==NULL)\n");
+ return;
+ }
+ printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
+ conf->raid_disks, conf->nr_disks);
+
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ tmp = conf->multipaths + i;
+ if (tmp->spare || tmp->operational || tmp->number ||
+ tmp->raid_disk || tmp->used_slot)
+ printk(" disk%d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
+ i, tmp->spare,tmp->operational,
+ tmp->number,tmp->raid_disk,tmp->used_slot,
+ partition_name(tmp->dev));
+ }
+}
+
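+/*
+ * Perform a disk operation (spare activate/write/inactive, hot-add,
+ * hot-remove) on the multipath configuration. The relevant slot is
+ * located first, then the conf->multipaths[] and superblock
+ * descriptors are updated under conf->device_lock.
+ */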
+static int multipath_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
+{
+ int err = 0;
+ int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
+ multipath_conf_t *conf = mddev->private;
+ struct multipath_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
+ mdp_super_t *sb = mddev->sb;
+ mdp_disk_t *failed_desc, *spare_desc, *added_desc;
+ mdk_rdev_t *spare_rdev, *failed_rdev;
+
+ print_multipath_conf(conf);
+ md_spin_lock_irq(&conf->device_lock);
+ /*
+ * find the disk ...
+ */
+ switch (state) {
+
+ case DISKOP_SPARE_ACTIVE:
+
+ /*
+ * Find the failed disk within the MULTIPATH configuration ...
+ * (this can only be in the first conf->working_disks part)
+ */
+ for (i = 0; i < conf->raid_disks; i++) {
+ tmp = conf->multipaths + i;
+ if ((!tmp->operational && !tmp->spare) ||
+ !tmp->used_slot) {
+ failed_disk = i;
+ break;
+ }
+ }
+ /*
+ * When we activate a spare disk we _must_ have a disk in
+ * the lower (active) part of the array to replace.
+ */
+ if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+ /* fall through */
+
+ case DISKOP_SPARE_WRITE:
+ case DISKOP_SPARE_INACTIVE:
+
+ /*
+ * Find the spare disk ... (can only be in the 'high'
+ * area of the array)
+ */
+ for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
+ tmp = conf->multipaths + i;
+ if (tmp->spare && tmp->number == (*d)->number) {
+ spare_disk = i;
+ break;
+ }
+ }
+ if (spare_disk == -1) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+ break;
+
+ case DISKOP_HOT_REMOVE_DISK:
+
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ tmp = conf->multipaths + i;
+ if (tmp->used_slot && (tmp->number == (*d)->number)) {
+ if (tmp->operational) {
+ printk(KERN_ERR "hot-remove-disk, slot %d is identified to be the requested disk (number %d), but is still operational!\n", i, (*d)->number);
+ err = -EBUSY;
+ goto abort;
+ }
+ removed_disk = i;
+ break;
+ }
+ }
+ if (removed_disk == -1) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+ break;
+
+ case DISKOP_HOT_ADD_DISK:
+
+ for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
+ tmp = conf->multipaths + i;
+ if (!tmp->used_slot) {
+ added_disk = i;
+ break;
+ }
+ }
+ if (added_disk == -1) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+ break;
+ }
+
+ switch (state) {
+ /*
+ * Switch the spare disk to write-only mode:
+ */
+ case DISKOP_SPARE_WRITE:
+ sdisk = conf->multipaths + spare_disk;
+ sdisk->operational = 1;
+ sdisk->write_only = 1;
+ break;
+ /*
+ * Deactivate a spare disk:
+ */
+ case DISKOP_SPARE_INACTIVE:
+ sdisk = conf->multipaths + spare_disk;
+ sdisk->operational = 0;
+ sdisk->write_only = 0;
+ break;
+ /*
+ * Activate (mark read-write) the (now sync) spare disk,
+ * which means we switch its 'raid position' (->raid_disk)
+ * with the failed disk. (only the first 'conf->nr_disks'
+ * slots are used for 'real' disks and we must preserve this
+ * property)
+ */
+ case DISKOP_SPARE_ACTIVE:
+ sdisk = conf->multipaths + spare_disk;
+ fdisk = conf->multipaths + failed_disk;
+
+ spare_desc = &sb->disks[sdisk->number];
+ failed_desc = &sb->disks[fdisk->number];
+
+ if (spare_desc != *d) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+
+ if (spare_desc->raid_disk != sdisk->raid_disk) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+
+ if (sdisk->raid_disk != spare_disk) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+
+ if (failed_desc->raid_disk != fdisk->raid_disk) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+
+ if (fdisk->raid_disk != failed_disk) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+
+ /*
+ * do the switch finally
+ */
+ spare_rdev = find_rdev_nr(mddev, spare_desc->number);
+ failed_rdev = find_rdev_nr(mddev, failed_desc->number);
+ xchg_values(spare_rdev->desc_nr, failed_rdev->desc_nr);
+// if (failed_rdev->alias_device)
+// MD_BUG();
+// if (!spare_rdev->alias_device)
+// MD_BUG();
+ spare_rdev->alias_device = 0;
+ failed_rdev->alias_device = 1;
+
+ xchg_values(*spare_desc, *failed_desc);
+ xchg_values(*fdisk, *sdisk);
+
+ /*
+ * (careful, 'failed' and 'spare' are switched from now on)
+ *
+ * we want to preserve linear numbering and we want to
+ * give the proper raid_disk number to the now activated
+ * disk. (this means we switch back these values)
+ */
+
+ xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
+ xchg_values(sdisk->raid_disk, fdisk->raid_disk);
+ xchg_values(spare_desc->number, failed_desc->number);
+ xchg_values(sdisk->number, fdisk->number);
+
+ *d = failed_desc;
+
+ if (sdisk->dev == MKDEV(0,0))
+ sdisk->used_slot = 0;
+ /*
+ * this really activates the spare.
+ */
+ fdisk->spare = 0;
+ fdisk->write_only = 0;
+
+ /*
+ * if we activate a spare, we definitely replace a
+ * non-operational disk slot in the 'low' area of
+ * the disk array.
+ */
+
+ conf->working_disks++;
+
+ break;
+
+ case DISKOP_HOT_REMOVE_DISK:
+ rdisk = conf->multipaths + removed_disk;
+
+ if (rdisk->spare && (removed_disk < conf->raid_disks)) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+ rdisk->dev = MKDEV(0,0);
+ rdisk->used_slot = 0;
+ conf->nr_disks--;
+ break;
+
+ case DISKOP_HOT_ADD_DISK:
+ adisk = conf->multipaths + added_disk;
+ added_desc = *d;
+
+ if (added_disk != added_desc->number) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+
+ adisk->number = added_desc->number;
+ adisk->raid_disk = added_desc->raid_disk;
+ adisk->dev = MKDEV(added_desc->major,added_desc->minor);
+
+ adisk->operational = 0;
+ adisk->write_only = 0;
+ adisk->spare = 1;
+ adisk->used_slot = 1;
+ adisk->head_position = 0;
+ conf->nr_disks++;
+
+ break;
+
+ default:
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+abort:
+ md_spin_unlock_irq(&conf->device_lock);
+
+ print_multipath_conf(conf);
+ return err;
+}
+
+
+#define IO_ERROR KERN_ALERT \
+"multipath: %s: unrecoverable IO read error for block %lu\n"
+
+#define REDIRECT_SECTOR KERN_ERR \
+"multipath: %s: redirecting sector %lu to another IO path\n"
+
+/*
+ * This is a kernel thread which:
+ *
+ * 1. Retries failed read operations on working multipaths.
+ * 2. Updates the raid superblock when problems are encountered.
+ * 3. Performs writes following reads for array synchronising.
+ */
+
+static void multipathd (void *data)
+{
+ struct multipath_bh *r1_bh;
+ struct buffer_head *bh;
+ unsigned long flags;
+ mddev_t *mddev;
+ kdev_t dev;
+
+
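+ /*
+ * Drain the global retry list: for each queued multipath_bh, write out
+ * a dirty superblock if needed, then ask multipath_map() for a working
+ * path; if it differs from the failed one, redirect and resubmit the
+ * request, otherwise complete it with an error.
+ */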
+ for (;;) {
+ md_spin_lock_irqsave(&retry_list_lock, flags);
+ r1_bh = multipath_retry_list;
+ if (!r1_bh)
+ break;
+ multipath_retry_list = r1_bh->next_r1;
+ md_spin_unlock_irqrestore(&retry_list_lock, flags);
+
+ mddev = r1_bh->mddev;
+ if (mddev->sb_dirty) {
+ printk(KERN_INFO "dirty sb detected, updating.\n");
+ mddev->sb_dirty = 0;
+ md_update_sb(mddev);
+ }
+ bh = &r1_bh->bh_req;
+ dev = bh->b_dev;
+
+ multipath_map (mddev, &bh->b_dev);
+ if (bh->b_dev == dev) {
+ printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
+ multipath_end_bh_io(r1_bh, 0);
+ } else {
+ printk (REDIRECT_SECTOR,
+ partition_name(bh->b_dev), bh->b_blocknr);
+ bh->b_rdev = bh->b_dev;
+ bh->b_rsector = bh->b_blocknr;
+ generic_make_request (r1_bh->cmd, bh);
+ }
+ }
+ md_spin_unlock_irqrestore(&retry_list_lock, flags);
+}
+#undef IO_ERROR
+#undef REDIRECT_SECTOR
+
+/*
+ * This will catch the scenario in which one of the multipaths was
+ * mounted as a normal device rather than as a part of a raid set.
+ *
+ * check_consistency is very personality-dependent; e.g. RAID5 cannot
+ * do this check, it uses another method.
+ */
+static int __check_consistency (mddev_t *mddev, int row)
+{
+ multipath_conf_t *conf = mddev_to_conf(mddev);
+ int disks = MD_SB_DISKS;
+ kdev_t dev;
+ struct buffer_head *bh = NULL;
+ int i, rc = 0;
+ char *buffer = NULL;
+
+ for (i = 0; i < disks; i++) {
+ if (!conf->multipaths[i].operational)
+ continue;
+ printk("(checking disk %d)\n",i);
+ dev = conf->multipaths[i].dev;
+ set_blocksize(dev, 4096);
+ if ((bh = bread(dev, row / 4, 4096)) == NULL)
+ break;
+ if (!buffer) {
+ buffer = (char *) __get_free_page(GFP_KERNEL);
+ if (!buffer)
+ break;
+ memcpy(buffer, bh->b_data, 4096);
+ } else if (memcmp(buffer, bh->b_data, 4096)) {
+ rc = 1;
+ break;
+ }
+ bforget(bh);
+ fsync_dev(dev);
+ invalidate_buffers(dev);
+ bh = NULL;
+ }
+ if (buffer)
+ free_page((unsigned long) buffer);
+ if (bh) {
+ dev = bh->b_dev;
+ bforget(bh);
+ fsync_dev(dev);
+ invalidate_buffers(dev);
+ }
+ return rc;
+}
+
+static int check_consistency (mddev_t *mddev)
+{
+ if (__check_consistency(mddev, 0))
+/*
+ * we do not do this currently, as it's perfectly possible to
+ * have an inconsistent array when it's freshly created. Only
+ * newly written data has to be consistent.
+ */
+ return 0;
+
+ return 0;
+}
+
+#define INVALID_LEVEL KERN_WARNING \
+"multipath: md%d: raid level not set to multipath IO (%d)\n"
+
+#define NO_SB KERN_ERR \
+"multipath: disabled IO path %s (couldn't access raid superblock)\n"
+
+#define ERRORS KERN_ERR \
+"multipath: disabled IO path %s (errors detected)\n"
+
+#define NOT_IN_SYNC KERN_ERR \
+"multipath: making IO path %s a spare path (not in sync)\n"
+
+#define INCONSISTENT KERN_ERR \
+"multipath: disabled IO path %s (inconsistent descriptor)\n"
+
+#define ALREADY_RUNNING KERN_ERR \
+"multipath: disabled IO path %s (multipath %d already operational)\n"
+
+#define OPERATIONAL KERN_INFO \
+"multipath: device %s operational as IO path %d\n"
+
+#define MEM_ERROR KERN_ERR \
+"multipath: couldn't allocate memory for md%d\n"
+
+#define SPARE KERN_INFO \
+"multipath: spare IO path %s\n"
+
+#define NONE_OPERATIONAL KERN_ERR \
+"multipath: no operational IO paths for md%d\n"
+
+#define SB_DIFFERENCES KERN_ERR \
+"multipath: detected IO path differences!\n"
+
+#define ARRAY_IS_ACTIVE KERN_INFO \
+"multipath: array md%d active with %d out of %d IO paths (%d spare IO paths)\n"
+
+#define THREAD_ERROR KERN_ERR \
+"multipath: couldn't allocate thread for md%d\n"
+
+static int multipath_run (mddev_t *mddev)
+{
+ multipath_conf_t *conf;
+ int i, j, disk_idx;
+ struct multipath_info *disk, *disk2;
+ mdp_super_t *sb = mddev->sb;
+ mdp_disk_t *desc, *desc2;
+ mdk_rdev_t *rdev, *def_rdev = NULL;
+ struct md_list_head *tmp;
+ int start_recovery = 0, num_rdevs = 0;
+
+ MOD_INC_USE_COUNT;
+
+ if (sb->level != -4) {
+ printk(INVALID_LEVEL, mdidx(mddev), sb->level);
+ goto out;
+ }
+ /*
+ * copy the already verified devices into our private MULTIPATH
+ * bookkeeping area. [whatever we allocate in multipath_run()
+ * should be freed in multipath_stop()]
+ */
+
+ conf = kmalloc(sizeof(multipath_conf_t), GFP_KERNEL);
+ mddev->private = conf;
+ if (!conf) {
+ printk(MEM_ERROR, mdidx(mddev));
+ goto out;
+ }
+ memset(conf, 0, sizeof(*conf));
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty) {
+ /* this is a "should never happen" case and if it */
+ /* ever does happen, a continue; won't help */
+ printk(ERRORS, partition_name(rdev->dev));
+ continue;
+ } else {
+ /* this is a "should never happen" case and if it */
+ /* ever does happen, a continue; won't help */
+ if (!rdev->sb) {
+ MD_BUG();
+ continue;
+ }
+ }
+ if (rdev->desc_nr == -1) {
+ MD_BUG();
+ continue;
+ }
+
+ desc = &sb->disks[rdev->desc_nr];
+ disk_idx = desc->raid_disk;
+ disk = conf->multipaths + disk_idx;
+
+ if (!disk_sync(desc))
+ printk(NOT_IN_SYNC, partition_name(rdev->dev));
+
+ /*
+ * Mark all disks as spare to start with, then pick our
+ * active disk. If we have a disk that is marked active
+ * in the sb, then use it, else use the first rdev.
+ */
+ disk->number = desc->number;
+ disk->raid_disk = desc->raid_disk;
+ disk->dev = rdev->dev;
+ disk->sect_limit = MAX_WORK_PER_DISK;
+ disk->operational = 0;
+ disk->write_only = 0;
+ disk->spare = 1;
+ disk->used_slot = 1;
+ disk->head_position = 0;
+ mark_disk_sync(desc);
+
+ if (disk_active(desc)) {
+ if(!conf->working_disks) {
+ printk(OPERATIONAL, partition_name(rdev->dev),
+ desc->raid_disk);
+ disk->operational = 1;
+ disk->spare = 0;
+ conf->working_disks++;
+ def_rdev = rdev;
+ } else {
+ mark_disk_spare(desc);
+ }
+ } else
+ mark_disk_spare(desc);
+
+ if(!num_rdevs++) def_rdev = rdev;
+ }
+ if(!conf->working_disks && num_rdevs) {
+ desc = &sb->disks[def_rdev->desc_nr];
+ disk = conf->multipaths + desc->raid_disk;
+ printk(OPERATIONAL, partition_name(def_rdev->dev),
+ disk->raid_disk);
+ disk->operational = 1;
+ disk->spare = 0;
+ conf->working_disks++;
+ mark_disk_active(desc);
+ }
+ /*
+ * Make sure our active path is in desc spot 0
+ */
+ if(def_rdev->desc_nr != 0) {
+ rdev = find_rdev_nr(mddev, 0);
+ desc = &sb->disks[def_rdev->desc_nr];
+ desc2 = sb->disks;
+ disk = conf->multipaths + desc->raid_disk;
+ disk2 = conf->multipaths + desc2->raid_disk;
+ xchg_values(*desc2,*desc);
+ xchg_values(*disk2,*disk);
+ xchg_values(desc2->number, desc->number);
+ xchg_values(disk2->number, disk->number);
+ xchg_values(desc2->raid_disk, desc->raid_disk);
+ xchg_values(disk2->raid_disk, disk->raid_disk);
+ if(rdev) {
+ xchg_values(def_rdev->desc_nr,rdev->desc_nr);
+ } else {
+ def_rdev->desc_nr = 0;
+ }
+ }
+ conf->raid_disks = sb->raid_disks = sb->active_disks = 1;
+ conf->nr_disks = sb->nr_disks = sb->working_disks = num_rdevs;
+ sb->failed_disks = 0;
+ sb->spare_disks = num_rdevs - 1;
+ mddev->sb_dirty = 1;
+ conf->mddev = mddev;
+ conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
+
+ conf->segment_lock = MD_SPIN_LOCK_UNLOCKED;
+ init_waitqueue_head(&conf->wait_buffer);
+ init_waitqueue_head(&conf->wait_done);
+ init_waitqueue_head(&conf->wait_ready);
+
+ if (!conf->working_disks) {
+ printk(NONE_OPERATIONAL, mdidx(mddev));
+ goto out_free_conf;
+ }
+
+
+ /* pre-allocate some buffer_head structures.
+ * As a minimum, 1 mpbh and raid_disks buffer_heads
+ * would probably get us by in tight memory situations,
+ * but a few more is probably a good idea.
+ * For now, try 16 mpbh and 16*raid_disks buffer_heads.
+ * This will allow at least 16 concurrent reads or writes
+ * even if kmalloc starts failing.
+ */
+ if (multipath_grow_mpbh(conf, 16) < 16 ||
+ multipath_grow_bh(conf, 16*conf->raid_disks)< 16*conf->raid_disks) {
+ printk(MEM_ERROR, mdidx(mddev));
+ goto out_free_conf;
+ }
+
+ if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN))) {
+ /*
+ * we do sanity checks even if the device says
+ * it's clean ...
+ */
+ if (check_consistency(mddev)) {
+ printk(SB_DIFFERENCES);
+ sb->state &= ~(1 << MD_SB_CLEAN);
+ }
+ }
+
+ {
+ const char * name = "multipathd";
+
+ conf->thread = md_register_thread(multipathd, conf, name);
+ if (!conf->thread) {
+ printk(THREAD_ERROR, mdidx(mddev));
+ goto out_free_conf;
+ }
+ }
+
+ /*
+ * Regenerate the "device is in sync with the raid set" bit for
+ * each device.
+ */
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ mark_disk_nonsync(sb->disks+i);
+ for (j = 0; j < sb->raid_disks; j++) {
+ if (sb->disks[i].number == conf->multipaths[j].number)
+ mark_disk_sync(sb->disks+i);
+ }
+ }
+
+ printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks,
+ sb->raid_disks, sb->spare_disks);
+ /*
+ * Ok, everything is just fine now
+ */
+ return 0;
+
+out_free_conf:
+ multipath_shrink_mpbh(conf);
+ multipath_shrink_bh(conf, conf->freebh_cnt);
+ kfree(conf);
+ mddev->private = NULL;
+out:
+ MOD_DEC_USE_COUNT;
+ return -EIO;
+}
+
+#undef INVALID_LEVEL
+#undef NO_SB
+#undef ERRORS
+#undef NOT_IN_SYNC
+#undef INCONSISTENT
+#undef ALREADY_RUNNING
+#undef OPERATIONAL
+#undef SPARE
+#undef NONE_OPERATIONAL
+#undef SB_DIFFERENCES
+#undef ARRAY_IS_ACTIVE
+
+static int multipath_stop (mddev_t *mddev)
+{
+ multipath_conf_t *conf = mddev_to_conf(mddev);
+
+ md_unregister_thread(conf->thread);
+ multipath_shrink_mpbh(conf);
+ multipath_shrink_bh(conf, conf->freebh_cnt);
+ kfree(conf);
+ mddev->private = NULL;
+ MOD_DEC_USE_COUNT;
+ return 0;
+}
+
+static mdk_personality_t multipath_personality=
+{
+ name: "multipath",
+ make_request: multipath_make_request,
+ run: multipath_run,
+ stop: multipath_stop,
+ status: multipath_status,
+ error_handler: multipath_error,
+ diskop: multipath_diskop,
+};
+
+static int md__init multipath_init (void)
+{
+ return register_md_personality (MULTIPATH, &multipath_personality);
+}
+
+static void multipath_exit (void)
+{
+ unregister_md_personality (MULTIPATH);
+}
+
+module_init(multipath_init);
+module_exit(multipath_exit);
+