patch-2.1.63 linux/drivers/block/md.c


diff -u --recursive --new-file v2.1.62/linux/drivers/block/md.c linux/drivers/block/md.c
@@ -9,6 +9,9 @@
 
    kerneld support by Boris Tobotras <boris@xtalk.msk.su>
 
+   RAID-1/RAID-5 extensions by:
+        Ingo Molnar, Miguel de Icaza, Gadi Oxman
+   
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2, or (at your option)
@@ -19,6 +22,13 @@
    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  
 */
 
+/*
+ * Current RAID-1,4,5 parallel reconstruction speed limit is 1024 KB/sec, so
+ * the extra system load does not show up that much. Increase it if your
+ * system can take more.
+ */
+#define SPEED_LIMIT 1024
+
 #include <linux/config.h>
 #include <linux/module.h>
 #include <linux/version.h>
@@ -31,20 +41,31 @@
 #include <linux/proc_fs.h>
 #include <linux/blkdev.h>
 #include <linux/genhd.h>
+#include <linux/smp_lock.h>
 #ifdef CONFIG_KERNELD
 #include <linux/kerneld.h>
 #endif
 #include <linux/errno.h>
 #include <linux/init.h>
 
+#define __KERNEL_SYSCALLS__
+#include <linux/unistd.h>
+
 #define MAJOR_NR MD_MAJOR
 #define MD_DRIVER
 
 #include <linux/blk.h>
 #include <asm/uaccess.h>
+#include <asm/bitops.h>
+#include <asm/atomic.h>
 
 static struct hd_struct md_hd_struct[MAX_MD_DEV];
 static int md_blocksizes[MAX_MD_DEV];
+int md_maxreadahead[MAX_MD_DEV];
+static struct md_thread md_threads[MAX_MD_THREADS];
+#if SUPPORT_RECONSTRUCTION
+static struct md_thread *md_sync_thread = NULL;
+#endif /* SUPPORT_RECONSTRUCTION */
 
 int md_size[MAX_MD_DEV]={0, };
 
@@ -66,7 +87,6 @@
 };
 
 static struct md_personality *pers[MAX_PERSONALITY]={NULL, };
-
 struct md_dev md_dev[MAX_MD_DEV];
 
 static struct gendisk *find_gendisk (kdev_t dev)
@@ -84,7 +104,6 @@
   return (NULL);
 }
 
-
 char *partition_name (kdev_t dev)
 {
   static char name[40];		/* This should be long
@@ -93,49 +112,318 @@
 
   if (!hd)
   {
-    printk ("No gendisk entry for dev %s\n", kdevname(dev));
-    sprintf (name, "dev %s", kdevname(dev));
+    sprintf (name, "[dev %s]", kdevname(dev));
     return (name);
   }
 
   return disk_name (hd, MINOR(dev), name);  /* routine in genhd.c */
 }
 
+static int legacy_raid_sb (int minor, int pnum)
+{
+	int i, factor;
+
+	factor = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor)));
+
+	/*
+	 * Do the size and offset calculations.
+	 */
+	for (i=0; i<md_dev[minor].nb_dev; i++) {
+		md_dev[minor].devices[i].size &= ~(factor - 1);
+		md_size[minor] += md_dev[minor].devices[i].size;
+		md_dev[minor].devices[i].offset=i ? (md_dev[minor].devices[i-1].offset + 
+							md_dev[minor].devices[i-1].size) : 0;
+	}
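+	/*
+	 * RAID-0 spreads reads across all member disks, so scale the
+	 * default readahead by the number of devices:
+	 */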
+	if (pnum == RAID0 >> PERSONALITY_SHIFT)
+		md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * md_dev[minor].nb_dev;
+	return 0;
+}
 
-static void set_ra (void)
+static void free_sb (struct md_dev *mddev)
 {
-  int i, j, minra=INT_MAX;
+	int i;
+	struct real_dev *realdev;
 
-  for (i=0; i<MAX_MD_DEV; i++)
-  {
-    if (!md_dev[i].pers)
-      continue;
-    
-    for (j=0; j<md_dev[i].nb_dev; j++)
-      if (read_ahead[MAJOR(md_dev[i].devices[j].dev)]<minra)
-	minra=read_ahead[MAJOR(md_dev[i].devices[j].dev)];
-  }
-  
-  read_ahead[MD_MAJOR]=minra;
+	if (mddev->sb) {
+		free_page((unsigned long) mddev->sb);
+		mddev->sb = NULL;
+	}
+	for (i = 0; i <mddev->nb_dev; i++) {
+		realdev = mddev->devices + i;
+		if (realdev->sb) {
+			free_page((unsigned long) realdev->sb);
+			realdev->sb = NULL;
+		}
+	}
 }
 
+/*
+ * Check one RAID superblock for generic plausibility
+ */
+
+#define BAD_MAGIC KERN_ERR \
+"md: %s: invalid raid superblock magic (%x) on block %u\n"
+
+#define OUT_OF_MEM KERN_ALERT \
+"md: out of memory.\n"
+
+#define NO_DEVICE KERN_ERR \
+"md: disabled device %s\n"
+
+#define SUCCESS 0
+#define FAILURE -1
+
+static int analyze_one_sb (struct real_dev * rdev)
+{
+	int ret = FAILURE;
+	struct buffer_head *bh;
+	kdev_t dev = rdev->dev;
+	md_superblock_t *sb;
+
+	/*
+	 * Read the superblock, it's at the end of the disk
+	 */
+	rdev->sb_offset = MD_NEW_SIZE_BLOCKS (blk_size[MAJOR(dev)][MINOR(dev)]);
+	set_blocksize (dev, MD_SB_BYTES);
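+	/*
+	 * sb_offset is in 1K blocks, while bread() now works in
+	 * MD_SB_BYTES-sized blocks, hence the division by MD_SB_BLOCKS:
+	 */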
+	bh = bread (dev, rdev->sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
+
+	if (bh) {
+		sb = (md_superblock_t *) bh->b_data;
+		if (sb->md_magic != MD_SB_MAGIC) {
+			printk (BAD_MAGIC, kdevname(dev),
+					 sb->md_magic, rdev->sb_offset);
+			goto abort;
+		}
+		rdev->sb = (md_superblock_t *) __get_free_page(GFP_KERNEL);
+		if (!rdev->sb) {
+			printk (OUT_OF_MEM);
+			goto abort;
+		}
+		memcpy (rdev->sb, bh->b_data, MD_SB_BYTES);
+
+		rdev->size = sb->size;
+	} else
+		printk (NO_DEVICE,kdevname(rdev->dev));
+	ret = SUCCESS;
+abort:
+	if (bh)
+		brelse (bh);
+	return ret;
+}
+
+#undef SUCCESS
+#undef FAILURE
+
+#undef BAD_MAGIC
+#undef OUT_OF_MEM
+#undef NO_DEVICE
+
+/*
+ * Check a full RAID array for plausibility
+ */
+
+#define INCONSISTENT KERN_ERR \
+"md: superblock inconsistency -- run ckraid\n"
+
+#define OUT_OF_DATE KERN_ERR \
+"md: superblock update time inconsistenty -- using the most recent one\n"
+
+#define OLD_VERSION KERN_ALERT \
+"md: %s: unsupported raid array version %d.%d.%d\n"
+
+#define NOT_CLEAN KERN_ERR \
+"md: %s: raid array is not clean -- run ckraid\n"
+
+#define NOT_CLEAN_IGNORE KERN_ERR \
+"md: %s: raid array is not clean -- reconstructing parity\n"
+
+#define UNKNOWN_LEVEL KERN_ERR \
+"md: %s: unsupported raid level %d\n"
+
+static int analyze_sbs (int minor, int pnum)
+{
+	struct md_dev *mddev = md_dev + minor;
+	int i, N = mddev->nb_dev, out_of_date = 0;
+	struct real_dev * disks = mddev->devices;
+	md_superblock_t *sb, *freshest = NULL;
+
+	/*
+	 * RAID-0 and linear don't use a RAID superblock
+	 */
+	if (pnum == RAID0 >> PERSONALITY_SHIFT ||
+		pnum == LINEAR >> PERSONALITY_SHIFT)
+			return legacy_raid_sb (minor, pnum);
+
+	/*
+	 * Verify the RAID superblock on each real device
+	 */
+	for (i = 0; i < N; i++)
+		if (analyze_one_sb(disks+i))
+			goto abort;
+
+	/*
+	 * The superblock constant part has to be the same
+	 * for all disks in the array.
+	 */
+	sb = NULL;
+	for (i = 0; i < N; i++) {
+		if (!disks[i].sb)
+			continue;
+		if (!sb) {
+			sb = disks[i].sb;
+			continue;
+		}
+		if (memcmp(sb,
+			   disks[i].sb, MD_SB_GENERIC_CONSTANT_WORDS * 4)) {
+			printk (INCONSISTENT);
+			goto abort;
+		}
+	}
+
+	/*
+	 * OK, we have all disks and the array is ready to run. Let's
+	 * find the freshest superblock; it will be the one that
+	 * represents the whole array.
+	 */
+	if ((sb = mddev->sb = (md_superblock_t *) __get_free_page (GFP_KERNEL)) == NULL)
+		goto abort;
+	freshest = NULL;
+	for (i = 0; i < N; i++) {
+		if (!disks[i].sb)
+			continue;
+		if (!freshest) {
+			freshest = disks[i].sb;
+			continue;
+		}
+		/*
+		 * Find the newest superblock version
+		 */
+		if (disks[i].sb->utime != freshest->utime) {
+			out_of_date = 1;
+			if (disks[i].sb->utime > freshest->utime)
+				freshest = disks[i].sb;
+		}
+	}
+	if (out_of_date)
+		printk(OUT_OF_DATE);
+	memcpy (sb, freshest, sizeof(*freshest));
+
+	/*
+	 * Check if we can support this RAID array
+	 */
+	if (sb->major_version != MD_MAJOR_VERSION ||
+			sb->minor_version > MD_MINOR_VERSION) {
+
+		printk (OLD_VERSION, kdevname(MKDEV(MD_MAJOR, minor)),
+				sb->major_version, sb->minor_version,
+				sb->patch_version);
+		goto abort;
+	}
+
+	/*
+	 * FIXME: whether to run a not-clean array should become a
+	 * superblock option.
+	 */
+#if SUPPORT_RECONSTRUCTION
+	if (sb->state != (1 << MD_SB_CLEAN)) {
+		if (sb->level == 1) {
+			printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor)));
+			goto abort;
+		} else
+			printk (NOT_CLEAN_IGNORE, kdevname(MKDEV(MD_MAJOR, minor)));
+	}
+#else
+	if (sb->state != (1 << MD_SB_CLEAN)) {
+		printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor)));
+		goto abort;
+	}
+#endif /* SUPPORT_RECONSTRUCTION */
+
+	switch (sb->level) {
+		case 1:
+			md_size[minor] = sb->size;
+			md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD;
+			break;
+		case 4:
+		case 5:
+			md_size[minor] = sb->size * (sb->raid_disks - 1);
+			md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * (sb->raid_disks - 1);
+			break;
+		default:
+			printk (UNKNOWN_LEVEL, kdevname(MKDEV(MD_MAJOR, minor)),
+					sb->level);
+			goto abort;
+	}
+	return 0;
+abort:
+	free_sb(mddev);
+	return 1;
+}
+
+#undef INCONSISTENT
+#undef OUT_OF_DATE
+#undef OLD_VERSION
+#undef NOT_CLEAN
+#undef NOT_CLEAN_IGNORE
+#undef UNKNOWN_LEVEL
+
+int md_update_sb(int minor)
+{
+	struct md_dev *mddev = md_dev + minor;
+	struct buffer_head *bh;
+	md_superblock_t *sb = mddev->sb;
+	struct real_dev *realdev;
+	kdev_t dev;
+	int i;
+	u32 sb_offset;
+
+	sb->utime = CURRENT_TIME;
+	for (i = 0; i < mddev->nb_dev; i++) {
+		realdev = mddev->devices + i;
+		if (!realdev->sb)
+			continue;
+		dev = realdev->dev;
+		sb_offset = realdev->sb_offset;
+		set_blocksize(dev, MD_SB_BYTES);
+		printk("md: updating raid superblock on device %s, sb_offset == %u\n",
+		       kdevname(dev), sb_offset);
+		bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
+		if (bh) {
+			sb = (md_superblock_t *) bh->b_data;
+			memcpy(sb, mddev->sb, MD_SB_BYTES);
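+			/*
+			 * Each device carries its own descriptor in the
+			 * superblock's 'descriptor' field, copied from the
+			 * global disk table:
+			 */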
+			memcpy(&sb->descriptor, sb->disks + realdev->sb->descriptor.number, MD_SB_DESCRIPTOR_WORDS * 4);
+			mark_buffer_uptodate(bh, 1);
+			mark_buffer_dirty(bh, 1);
+			ll_rw_block(WRITE, 1, &bh);
+			wait_on_buffer(bh);
+			bforget(bh);
+			fsync_dev(dev);
+			invalidate_buffers(dev);
+		} else
+			printk(KERN_ERR "md: getblk failed for device %s\n", kdevname(dev));
+	}
+	return 0;
+}
 
 static int do_md_run (int minor, int repart)
 {
-  int pnum, i, min, current_ra, err;
-  
+  int pnum, i, min, factor, err;
+
   if (!md_dev[minor].nb_dev)
     return -EINVAL;
   
   if (md_dev[minor].pers)
     return -EBUSY;
-  
+
   md_dev[minor].repartition=repart;
   
-  if ((pnum=PERSONALITY(md_dev+minor) >> (PERSONALITY_SHIFT))
+  if ((pnum=PERSONALITY(&md_dev[minor]) >> (PERSONALITY_SHIFT))
       >= MAX_PERSONALITY)
     return -EINVAL;
-  
+
+  /* Only RAID-1 and RAID-5 can have MD devices as underlying devices */
+  if (pnum != (RAID1 >> PERSONALITY_SHIFT) && pnum != (RAID5 >> PERSONALITY_SHIFT)){
+	  for (i = 0; i < md_dev [minor].nb_dev; i++)
+		  if (MAJOR (md_dev [minor].devices [i].dev) == MD_MAJOR)
+			  return -EINVAL;
+  }
   if (!pers[pnum])
   {
 #ifdef CONFIG_KERNELD
@@ -147,7 +435,7 @@
       return -EINVAL;
   }
   
-  min=1 << FACTOR_SHIFT(FACTOR((md_dev+minor)));
+  factor = min = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor)));
   
   for (i=0; i<md_dev[minor].nb_dev; i++)
     if (md_dev[minor].devices[i].size<min)
@@ -156,118 +444,167 @@
 	      partition_name (md_dev[minor].devices[i].dev), min);
       return -EINVAL;
     }
+
+  for (i=0; i<md_dev[minor].nb_dev; i++) {
+    fsync_dev(md_dev[minor].devices[i].dev);
+    invalidate_buffers(md_dev[minor].devices[i].dev);
+  }
   
   /* Resize devices according to the factor. It is used to align
      partitions size on a given chunk size. */
   md_size[minor]=0;
-  
-  for (i=0; i<md_dev[minor].nb_dev; i++)
-  {
-    md_dev[minor].devices[i].size &= ~(min - 1);
-    md_size[minor] += md_dev[minor].devices[i].size;
-    md_dev[minor].devices[i].offset=i ? (md_dev[minor].devices[i-1].offset + md_dev[minor].devices[i-1].size) : 0;
-  }
+
+  /*
+   * Analyze the raid superblock
+   */ 
+  if (analyze_sbs(minor, pnum))
+    return -EINVAL;
 
   md_dev[minor].pers=pers[pnum];
   
   if ((err=md_dev[minor].pers->run (minor, md_dev+minor)))
   {
     md_dev[minor].pers=NULL;
+    free_sb(md_dev + minor);
     return (err);
   }
-  
+
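+  /*
+   * Mark the array not-clean while it is running; do_md_stop() sets
+   * the clean bit again on a graceful shutdown.
+   */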
+  if (pnum != RAID0 >> PERSONALITY_SHIFT && pnum != LINEAR >> PERSONALITY_SHIFT)
+  {
+    md_dev[minor].sb->state &= ~(1 << MD_SB_CLEAN);
+    md_update_sb(minor);
+  }
+
   /* FIXME : We assume here we have blocks
      that are twice as large as sectors.
      THIS MAY NOT BE TRUE !!! */
   md_hd_struct[minor].start_sect=0;
   md_hd_struct[minor].nr_sects=md_size[minor]<<1;
   
-  /* It would be better to have a per-md-dev read_ahead. Currently,
-     we only use the smallest read_ahead among md-attached devices */
-  
-  current_ra=read_ahead[MD_MAJOR];
-  
-  for (i=0; i<md_dev[minor].nb_dev; i++)
-    if (current_ra>read_ahead[MAJOR(md_dev[minor].devices[i].dev)])
-      current_ra=read_ahead[MAJOR(md_dev[minor].devices[i].dev)];
-  
-  read_ahead[MD_MAJOR]=current_ra;
-  
-  printk ("START_DEV md%x %s\n", minor, md_dev[minor].pers->name);
+  read_ahead[MD_MAJOR] = 128;
   return (0);
 }
 
-
 static int do_md_stop (int minor, struct inode *inode)
 {
-  int i;
+	int i;
   
-  if (inode->i_count > 1 || md_dev[minor].busy>1) /* ioctl : one open channel */
-  {
-    printk ("STOP_MD md%x failed : i_count=%d, busy=%d\n", minor,
-	    inode->i_count, md_dev[minor].busy);
-    return -EBUSY;
-  }
-  
-  if (md_dev[minor].pers)
-  {
-    /*  The device won't exist anymore -> flush it now */
-    fsync_dev (inode->i_rdev);
-    invalidate_buffers (inode->i_rdev);
-    md_dev[minor].pers->stop (minor, md_dev+minor);
-  }
-  
-  /* Remove locks. */
-  for (i=0; i<md_dev[minor].nb_dev; i++)
-    clear_inode (md_dev[minor].devices[i].inode);
+	if (inode->i_count>1 || md_dev[minor].busy>1) {
+		/*
+		 * ioctl : one open channel
+		 */
+		printk ("STOP_MD md%x failed : i_count=%d, busy=%d\n",
+				minor, inode->i_count, md_dev[minor].busy);
+		return -EBUSY;
+	}
+  
+	if (md_dev[minor].pers) {
+		/*
+		 * It is safe to call stop here, it only frees private
+		 * data. Also, it tells us if a device is unstoppable
+		 * (eg. resyncing is in progress)
+		 */
+		if (md_dev[minor].pers->stop (minor, md_dev+minor))
+			return -EBUSY;
+		/*
+		 *  The device won't exist anymore -> flush it now
+		 */
+		fsync_dev (inode->i_rdev);
+		invalidate_buffers (inode->i_rdev);
+		if (md_dev[minor].sb) {
+			md_dev[minor].sb->state |= 1 << MD_SB_CLEAN;
+			md_update_sb(minor);
+		}
+	}
+  
+	/* Remove locks. */
+	if (md_dev[minor].sb)
+		free_sb(md_dev + minor);
+	for (i=0; i<md_dev[minor].nb_dev; i++)
+		clear_inode (md_dev[minor].devices[i].inode);
+
+	md_dev[minor].nb_dev=md_size[minor]=0;
+	md_hd_struct[minor].nr_sects=0;
+	md_dev[minor].pers=NULL;
   
-  md_dev[minor].nb_dev=md_size[minor]=0;
-  md_hd_struct[minor].nr_sects=0;
-  md_dev[minor].pers=NULL;
+	read_ahead[MD_MAJOR] = 128;
   
-  set_ra ();			/* calculate new read_ahead */
-  
-  printk ("STOP_DEV md%x\n", minor);
-  return (0);
+	return (0);
 }
 
-
 static int do_md_add (int minor, kdev_t dev)
 {
-  struct gendisk *gen_real;
-  int i;
-  
-  if (MAJOR(dev)==MD_MAJOR || md_dev[minor].nb_dev==MAX_REAL)
-    return -EINVAL;
-  
-  if (!fs_may_mount (dev) || md_dev[minor].pers)
-    return -EBUSY;
-  
-  if (!(gen_real=find_gendisk (dev)))
-    return -ENOENT;
-  
-  i=md_dev[minor].nb_dev++;
-  md_dev[minor].devices[i].dev=dev;
-  
-  /* Lock the device by inserting a dummy inode. This doesn't
-     smell very good, but I need to be consistent with the
-     mount stuff, specially with fs_may_mount. If someone have
-     a better idea, please help ! */
-  
-  md_dev[minor].devices[i].inode=get_empty_inode ();
-  md_dev[minor].devices[i].inode->i_dev=dev; /* don't care about
-						other fields */
-  insert_inode_hash (md_dev[minor].devices[i].inode);
-  
-  /* Sizes are now rounded at run time */
-  
-  md_dev[minor].devices[i].size=gen_real->sizes[MINOR(dev)];
+	int i;
+	int hot_add=0;
+	struct real_dev *realdev;
+
+	if (md_dev[minor].nb_dev==MAX_REAL)
+		return -EINVAL;
+
+	if (blk_size[MAJOR(dev)] == NULL || blk_size[MAJOR(dev)][MINOR(dev)] == 0) {
+		printk("md_add(): device has zero size, bailing out.\n");
+		return -EINVAL;
+	}
+
+	if (md_dev[minor].pers) {
+		/*
+		 * The array is already running, hot-add the drive, or
+		 * bail out:
+		 */
+		if (!md_dev[minor].pers->hot_add_disk)
+			return -EBUSY;
+		else
+			hot_add=1;
+	}
+
+	/*
+	 * Careful. We cannot increase nb_dev for a running array.
+	 */
+	i=md_dev[minor].nb_dev;
+	realdev = &md_dev[minor].devices[i];
+	realdev->dev=dev;
+  
+	/* Lock the device by inserting a dummy inode. This doesn't
+	   smell very good, but I need to be consistent with the
+	   mount stuff, especially with fs_may_mount. If someone has
+	   a better idea, please help! */
+  
+	realdev->inode=get_empty_inode ();
+	realdev->inode->i_dev=dev; 	/* don't care about other fields */
+	insert_inode_hash (realdev->inode);
+  
+	/* Sizes are now rounded at run time */
+  
+/*  md_dev[minor].devices[i].size=gen_real->sizes[MINOR(dev)]; HACKHACK*/
+
+	realdev->size=blk_size[MAJOR(dev)][MINOR(dev)];
+
+	if (hot_add) {
+		/*
+		 * Check the superblock for consistency.
+		 * The personality itself has to check whether the disk is
+		 * being added with the proper flags ... the personality
+		 * type has to be checked too ;)
+		 */
+		if (analyze_one_sb (realdev))
+			return -EINVAL;
+		/*
+		 * hot_add has to bump up nb_dev itself
+		 */
+		if (md_dev[minor].pers->hot_add_disk (&md_dev[minor], dev)) {
+			/*
+			 * FIXME: here we should free up the inode and stuff
+			 */
+			printk ("FIXME\n");
+			return -EINVAL;
+		}
+	} else
+		md_dev[minor].nb_dev++;
 
-  printk ("REGISTER_DEV %s to md%x done\n", partition_name(dev), minor);
-  return (0);
+	printk ("REGISTER_DEV %s to md%x done\n", partition_name(dev), minor);
+	return (0);
 }
 
-
 static int md_ioctl (struct inode *inode, struct file *file,
                      unsigned int cmd, unsigned long arg)
 {
@@ -354,7 +691,6 @@
   return (0);
 }
 
-
 static int md_open (struct inode *inode, struct file *file)
 {
   int minor=MINOR(inode->i_rdev);
@@ -427,6 +763,30 @@
   return (md_dev[minor].pers->map(md_dev+minor, rdev, rsector, size));
 }
   
+int md_make_request (int minor, int rw, struct buffer_head * bh)
+{
+	if (md_dev [minor].pers->make_request) {
+		if (buffer_locked(bh))
+			return 0;
+		set_bit(BH_Lock, &bh->b_state);
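+		/*
+		 * Writes of clean buffers and reads of already up-to-date
+		 * buffers need no I/O: complete them via b_end_io right
+		 * away.
+		 */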
+		if (rw == WRITE || rw == WRITEA) {
+			if (!buffer_dirty(bh)) {
+				bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
+				return 0;
+			}
+		}
+		if (rw == READ || rw == READA) {
+			if (buffer_uptodate(bh)) {
+				bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
+				return 0;
+			}
+		}
+		return (md_dev[minor].pers->make_request(md_dev+minor, rw, bh));
+	} else {
+		make_request (MAJOR(bh->b_rdev), rw, bh);
+		return 0;
+	}
+}
 
 static void do_md_request (void)
 {
@@ -434,10 +794,51 @@
   return;
 }  
 
+/*
+ * We run MAX_MD_THREADS from md_init() and arbitrate them in run time.
+ * This is not so elegant, but how can we use kernel_thread() from within
+ * loadable modules?
+ */
+struct md_thread *md_register_thread (void (*run) (void *), void *data)
+{
+	int i;
+	for (i = 0; i < MAX_MD_THREADS; i++) {
+		if (md_threads[i].run == NULL) {
+			md_threads[i].run = run;
+			md_threads[i].data = data;
+			return md_threads + i;
+		}
+	}
+	return NULL;
+}
+
+void md_unregister_thread (struct md_thread *thread)
+{
+	thread->run = NULL;
+	thread->data = NULL;
+	thread->flags = 0;
+}
+
+void md_wakeup_thread(struct md_thread *thread)
+{
+	set_bit(THREAD_WAKEUP, &thread->flags);
+	wake_up(&thread->wqueue);
+}
+
+
 EXPORT_SYMBOL(md_size);
+EXPORT_SYMBOL(md_maxreadahead);
 EXPORT_SYMBOL(register_md_personality);
 EXPORT_SYMBOL(unregister_md_personality);
 EXPORT_SYMBOL(partition_name);
+EXPORT_SYMBOL(md_dev);
+EXPORT_SYMBOL(md_error);
+EXPORT_SYMBOL(md_register_thread);
+EXPORT_SYMBOL(md_unregister_thread);
+EXPORT_SYMBOL(md_update_sb);
+EXPORT_SYMBOL(md_map);
+EXPORT_SYMBOL(md_wakeup_thread);
+EXPORT_SYMBOL(md_do_sync);
 
 static struct proc_dir_entry proc_md = {
 	PROC_MD, 6, "mdstat",
@@ -451,16 +852,36 @@
   for(i=0;i<MAX_MD_DEV;i++)
   {
     md_blocksizes[i] = 1024;
+    md_maxreadahead[i] = MD_DEFAULT_DISK_READAHEAD;
     md_gendisk.part[i].start_sect=-1; /* avoid partition check */
     md_gendisk.part[i].nr_sects=0;
     md_dev[i].pers=NULL;
   }
 
-  blksize_size[MAJOR_NR] = md_blocksizes;
+  blksize_size[MD_MAJOR] = md_blocksizes;
+  max_readahead[MD_MAJOR] = md_maxreadahead;
 
   proc_register(&proc_root, &proc_md);
 }
 
+int md_error (kdev_t mddev, kdev_t rdev)
+{
+    unsigned int minor = MINOR (mddev);
+    int rc;
+
+    if (MAJOR(mddev) != MD_MAJOR || minor >= MAX_MD_DEV)
+	panic ("md_error called for an unknown device\n");
+    if (!md_dev [minor].pers)
+	panic ("md_error called for a device that is not running\n");
+    if (md_dev [minor].pers->error_handler) {
+	rc = md_dev [minor].pers->error_handler (md_dev+minor, rdev);
+#if SUPPORT_RECONSTRUCTION
+	md_wakeup_thread(md_sync_thread);
+#endif /* SUPPORT_RECONSTRUCTION */
+	return rc;
+    }
+    return 0;
+}
 
 int get_md_status (char *page)
 {
@@ -493,9 +914,13 @@
 		   partition_name(md_dev[i].devices[j].dev));
       size+=md_dev[i].devices[j].size;
     }
-    
-    if (md_dev[i].nb_dev)
-      sz+=sprintf (page+sz, " %d blocks", size);
+
+    if (md_dev[i].nb_dev) {
+      if (md_dev[i].pers)
+        sz+=sprintf (page+sz, " %d blocks", md_size[i]);
+      else
+        sz+=sprintf (page+sz, " %d blocks", size);
+    }
 
     if (!md_dev[i].pers)
     {
@@ -506,11 +931,8 @@
     if (md_dev[i].pers->max_invalid_dev)
       sz+=sprintf (page+sz, " maxfault=%ld", MAX_FAULT(md_dev+i));
 
-    sz+=sprintf (page+sz, " %dk %s\n", 1<<FACTOR_SHIFT(FACTOR(md_dev+i)),
-		 md_dev[i].pers == pers[LINEAR>>PERSONALITY_SHIFT] ?
-		 "rounding" : "chunks");
-
     sz+=md_dev[i].pers->status (page+sz, i, md_dev+i);
+    sz+=sprintf (page+sz, "\n");
   }
 
   return (sz);
@@ -543,6 +965,198 @@
   return 0;
 } 
 
+int md_thread(void * arg)
+{
+	struct md_thread *thread = arg;
+
+	current->session = 1;
+	current->pgrp = 1;
+	sprintf(current->comm, "md_thread");
+
+	lock_kernel();
+	for (;;) {
+		sti();
+		clear_bit(THREAD_WAKEUP, &thread->flags);
+		if (thread->run) {
+			thread->run(thread->data);
+			run_task_queue(&tq_disk);
+		}
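+		/*
+		 * Disable interrupts around the THREAD_WAKEUP test so a
+		 * wakeup arriving between the test and the sleep is not
+		 * lost.
+		 */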
+		cli();
+		if (!test_bit(THREAD_WAKEUP, &thread->flags)) {
+			do {
+				current->signal = 0;
+				interruptible_sleep_on(&thread->wqueue);
+			} while (current->signal);
+		}
+	}
+}
+
+static md_descriptor_t *get_spare(struct md_dev *mddev)
+{
+	int i;
+	md_superblock_t *sb = mddev->sb;
+	md_descriptor_t *descriptor;
+	struct real_dev *realdev;
+	
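+	/*
+	 * A spare is any disk with a superblock whose descriptor is
+	 * marked neither faulty nor active.
+	 */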
+  	for (i = 0; i < mddev->nb_dev; i++) {
+  		realdev = &mddev->devices[i];
+		if (!realdev->sb)
+			continue;
+		descriptor = &sb->disks[realdev->sb->descriptor.number];
+		if (descriptor->state & (1 << MD_FAULTY_DEVICE))
+			continue;
+		if (descriptor->state & (1 << MD_ACTIVE_DEVICE))
+			continue;
+		return descriptor;
+	}
+	return NULL;
+}
+
+/*
+ * Parallel resyncing thread.
+ *
+ * FIXME: - make it abort with a dirty array on mdstop; now it just blocks
+ *        - fix read error handling
+ */
+
+int md_do_sync(struct md_dev *mddev)
+{
+	struct buffer_head *bh;
+	int max_blocks, blocksize, curr_bsize, percent=1, j;
+	kdev_t read_disk = MKDEV(MD_MAJOR, mddev - md_dev);
+	int major = MAJOR(read_disk), minor = MINOR(read_disk);
+	unsigned long starttime;
+
+	blocksize = blksize_size[major][minor];
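+	/*
+	 * blk_size[] is in 1K units and (blocksize >> 10) is the block
+	 * size in KB, so the quotient is the device size in blocks.
+	 */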
+	max_blocks = blk_size[major][minor] / (blocksize >> 10);
+
+	printk("... resync log\n");
+	printk(" ....   mddev->nb_dev: %d\n", mddev->nb_dev);
+	printk(" ....   raid array: %s\n", kdevname(read_disk));
+	printk(" ....   max_blocks: %d blocksize: %d\n", max_blocks, blocksize);
+	printk("md: syncing RAID array %s\n", kdevname(read_disk));
+
+	mddev->busy++;
+
+	starttime=jiffies;
+	for (j = 0; j < max_blocks; j++) {
+
+		/*
+		 * Be careful: when someone mounts a filesystem with a
+		 * different block size, the blocksize changes right under
+		 * us. Deal with it transparently by recalculating
+		 * 'blocksize', 'j' and 'max_blocks':
+		 */
+		curr_bsize = blksize_size[major][minor];
+		if (curr_bsize != blocksize) {
+diff_blocksize:
+			if (curr_bsize > blocksize)
+				/*
+				 * this is safe, rounds downwards.
+				 */
+				j /= curr_bsize/blocksize;
+			else
+				j *= blocksize/curr_bsize;
+
+			blocksize = curr_bsize;
+			max_blocks = blk_size[major][minor] / (blocksize >> 10);
+		}
+        	if ((bh = breada (read_disk, j, blocksize, j * blocksize,
+					max_blocks * blocksize)) != NULL) {
+			mark_buffer_dirty(bh, 1);
+			brelse(bh);
+		} else {
+			/*
+			 * FIXME: Ugly, but set_blocksize() isn't safe ...
+			 */
+			curr_bsize = blksize_size[major][minor];
+			if (curr_bsize != blocksize)
+				goto diff_blocksize;
+
+			/*
+			 * It's a real read problem. FIXME, handle this
+			 * a better way.
+			 */
+			printk ( KERN_ALERT
+				 "read error, stopping reconstruction.\n");
+			mddev->busy--;
+			return 1;
+		}
+
+		/*
+		 * Let's sleep a bit if we are running faster than our
+		 * speed limit:
+		 */
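+		/*
+		 * blocksize*j bytes were read in (jiffies-starttime)
+		 * ticks; scaling by HZ gives bytes/sec, and dividing by
+		 * 1024 gives the resync rate in KB/sec.
+		 */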
+		while (blocksize*j/(jiffies-starttime+1)*HZ/1024 > SPEED_LIMIT)
+		{
+			current->state = TASK_INTERRUPTIBLE;
+			current->timeout = jiffies+1;
+			schedule();
+		}
+
+		/*
+		 * FIXME: put this status bar thing into /proc
+		 */
+		if (!(j%(max_blocks/100))) {
+			if (!(percent%10))
+				printk (" %03d%% done.\n",percent);
+			else
+				printk (".");
+			percent++;
+		}
+	}
+	fsync_dev(read_disk);
+	printk("md: %s: sync done.\n", kdevname(read_disk));
+	mddev->busy--;
+	return 0;
+}
+
+/*
+ * This is a kernel thread which syncs a spare disk with the active array.
+ *
+ * The amount of foolproofing might seem to be a tad excessive, but an
+ * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
+ * of my root partition with the first 0.5 gigs of my /home partition ... so
+ * I'm a bit nervous ;)
+ */
+void mdsyncd (void *data)
+{
+	int i;
+	struct md_dev *mddev;
+	md_superblock_t *sb;
+	md_descriptor_t *spare;
+	unsigned long flags;
+
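+	/*
+	 * For every array that is short of active disks but has spares:
+	 * mark a spare write-active, resync it, and on success promote
+	 * it to a full member and update the superblock.
+	 */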
+	for (i = 0, mddev = md_dev; i < MAX_MD_DEV; i++, mddev++) {
+		if ((sb = mddev->sb) == NULL)
+			continue;
+		if (sb->active_disks == sb->raid_disks)
+			continue;
+		if (!sb->spare_disks)
+			continue;
+		if ((spare = get_spare(mddev)) == NULL)
+			continue;
+		if (!mddev->pers->mark_spare)
+			continue;
+		if (mddev->pers->mark_spare(mddev, spare, SPARE_WRITE))
+			continue;
+		if (md_do_sync(mddev) || (spare->state & (1 << MD_FAULTY_DEVICE))) {
+			mddev->pers->mark_spare(mddev, spare, SPARE_INACTIVE);
+			continue;
+		}
+		save_flags(flags);
+		cli();
+		mddev->pers->mark_spare(mddev, spare, SPARE_ACTIVE);
+		spare->state |= (1 << MD_SYNC_DEVICE);
+		spare->state |= (1 << MD_ACTIVE_DEVICE);
+		sb->spare_disks--;
+		sb->active_disks++;
+		mddev->sb_dirty = 1;
+		md_update_sb(mddev - md_dev);
+		restore_flags(flags);
+	}
+}
+
 void linear_init (void);
 void raid0_init (void);
 void raid1_init (void);
@@ -550,7 +1164,11 @@
 
 __initfunc(int md_init (void))
 {
-  printk ("md driver %s MAX_MD_DEV=%d, MAX_REAL=%d\n", MD_VERSION, MAX_MD_DEV, MAX_REAL);
+  int i;
+
+  printk ("md driver %d.%d.%d MAX_MD_DEV=%d, MAX_REAL=%d\n",
+    MD_MAJOR_VERSION, MD_MINOR_VERSION, MD_PATCHLEVEL_VERSION,
+    MAX_MD_DEV, MAX_REAL);
 
   if (register_blkdev (MD_MAJOR, "md", &md_fops))
   {
@@ -558,18 +1176,39 @@
     return (-1);
   }
 
+  memset(md_threads, 0, MAX_MD_THREADS * sizeof(struct md_thread));
+  printk("md: starting %d kernel threads\n", MAX_MD_THREADS);
+  for (i = 0; i < MAX_MD_THREADS; i++) {
+    md_threads[i].run = NULL;
+    init_waitqueue(&md_threads[i].wqueue);
+    md_threads[i].flags = 0;
+    kernel_thread (md_thread, md_threads + i, 0);
+  }
+
   blk_dev[MD_MAJOR].request_fn=DEVICE_REQUEST;
   blk_dev[MD_MAJOR].current_request=NULL;
   read_ahead[MD_MAJOR]=INT_MAX;
+  memset(md_dev, 0, MAX_MD_DEV * sizeof (struct md_dev));
   md_gendisk.next=gendisk_head;
 
   gendisk_head=&md_gendisk;
 
+#if SUPPORT_RECONSTRUCTION
+  if ((md_sync_thread = md_register_thread(mdsyncd, NULL)) == NULL)
+    printk("md: bug: md_sync_thread == NULL\n");
+#endif /* SUPPORT_RECONSTRUCTION */
+
 #ifdef CONFIG_MD_LINEAR
   linear_init ();
 #endif
 #ifdef CONFIG_MD_STRIPED
   raid0_init ();
+#endif
+#ifdef CONFIG_MD_MIRRORING
+  raid1_init ();
+#endif
+#ifdef CONFIG_MD_RAID5
+  raid5_init ();
 #endif
   
   return (0);
