From: Andrey Borzenkov <arvidjaar@mail.ru>

Oops is due to concurrent d_instantiate on the same dentry; the bug was
unfortunately quite ugly to fix inside devfs itself.

The attached patch makes sure d_revalidate is always called under parent
i_sem allowing it to drop and reacquire semaphore before going to wait.  It
provides both mutual exclusion with devfs_lookup and between d_revalidate,
fixing

- this bug; unfortunately I do not know how to reproduce it on purpose. 
  It apparently needs at least true SMP that I do not have.  We need two
  d_revalidate's against the same dentry running concurrently

- old devfs_lookup/d_revalidate deadlock (which has been fixed a bit 

- theoretically possible problem when dentry->d_op is changed after
  d_op->d_revalidate has been tested resulting in NULL pointer
  dereferencing (if (dentry->d_op->d_revalidate)
  dentry->d_op->d_revalidate).  I am not even sure if it is really
  possible.

Pavel, you have been lucky in cathing devfs bugs, could you please test this 
if it works for you?

I appreciate comments about fs/namei.c changes.  I tried to make them as
non-intrusive as possible.  Believe me - it is the most simple way to close
devfs races.

With this resolved we can start cleaning devfs; my final goal is to
autoremove unneeded path components and get rid of devfs_name alltogether. 
Now when every driver has kernel name it is enough to register using this
one letting devfsd to do the rest.



 fs/devfs/base.c    |   62 ++++++++++++++++++++++++-----------------------------
 fs/namei.c         |   21 +++++++++++++++--
 include/linux/fs.h |    1 
 3 files changed, 48 insertions(+), 36 deletions(-)

diff -puN fs/devfs/base.c~devfs-d_revalidate-oops-fix fs/devfs/base.c
--- 25/fs/devfs/base.c~devfs-d_revalidate-oops-fix	2004-01-04 13:43:13.000000000 -0800
+++ 25-akpm/fs/devfs/base.c	2004-01-04 13:43:13.000000000 -0800
@@ -2171,17 +2171,9 @@ static void devfs_d_iput (struct dentry 
 }   /*  End Function devfs_d_iput  */
 
 static int devfs_d_delete (struct dentry *dentry);
-
-static struct dentry_operations devfs_dops =
-{
-    .d_delete     = devfs_d_delete,
-    .d_release    = devfs_d_release,
-    .d_iput       = devfs_d_iput,
-};
-
 static int devfs_d_revalidate_wait (struct dentry *dentry, struct nameidata *);
 
-static struct dentry_operations devfs_wait_dops =
+static struct dentry_operations devfs_dops =
 {
     .d_delete     = devfs_d_delete,
     .d_release    = devfs_d_release,
@@ -2198,8 +2190,8 @@ static int devfs_d_delete (struct dentry
 {
     struct inode *inode = dentry->d_inode;
 
-    if (dentry->d_op == &devfs_wait_dops) dentry->d_op = &devfs_dops;
     /*  Unhash dentry if negative (has no inode)  */
+    /* FIXME should we check for d_fsdata? */
     if (inode == NULL)
     {
 	DPRINTK (DEBUG_D_DELETE, "(%p): dropping negative dentry\n", dentry);
@@ -2216,6 +2208,11 @@ struct devfs_lookup_struct
 
 /* XXX: this doesn't handle the case where we got a negative dentry
         but a devfs entry has been registered in the meanwhile */
+/*
+ * This relies on the fact that d_revalidate is called under parent i_sem
+ * providing mutual exclusion with devfs_lookup and protection for
+ * dentry->d_fsdata
+ */
 static int devfs_d_revalidate_wait (struct dentry *dentry, struct nameidata *nd)
 {
     struct inode *dir = dentry->d_parent->d_inode;
@@ -2224,6 +2221,10 @@ static int devfs_d_revalidate_wait (stru
     struct devfs_lookup_struct *lookup_info = dentry->d_fsdata;
     DECLARE_WAITQUEUE (wait, current);
 
+    /* Ordinary case - nothing to revalidate */
+    if (lookup_info == NULL) return 1;  /*  Early termination  */
+
+    /* Called from devfsd action - cannot block. */
     if ( is_devfsd_or_child (fs_info) )
     {
 	devfs_handle_t de = lookup_info->de;
@@ -2252,25 +2253,22 @@ static int devfs_d_revalidate_wait (stru
 	d_instantiate (dentry, inode);
 	return 1;
     }
-    if (lookup_info == NULL) return 1;  /*  Early termination  */
-    read_lock (&parent->u.dir.lock);
-    if (dentry->d_fsdata)
-    {
-	set_current_state (TASK_UNINTERRUPTIBLE);
-	add_wait_queue (&lookup_info->wait_queue, &wait);
-	read_unlock (&parent->u.dir.lock);
-	schedule ();
-	/*
-	 * This does not need nor should remove wait from wait_queue.
-	 * Wait queue head is never reused - nothing is ever added to it
-	 * after all waiters have been waked up and head itself disappears
-	 * very soon after it. Moreover it is local variable on stack that
-	 * is likely to have already disappeared so any reference to it
-	 * at this point is buggy.
-	 */
 
-    }
-    else read_unlock (&parent->u.dir.lock);
+    /* Not devfsd - must wait for devfsd to return */
+    set_current_state (TASK_UNINTERRUPTIBLE);
+    add_wait_queue (&lookup_info->wait_queue, &wait);
+    up(&dir->i_sem);
+    schedule ();
+    down(&dir->i_sem);
+    /*
+     * This does not need nor should remove wait from wait_queue.
+     * Wait queue head is never reused - nothing is ever added to it
+     * after all waiters have been waked up and head itself disappears
+     * very soon after it. Moreover it is local variable on stack that
+     * is likely to have already disappeared so any reference to it
+     * at this point is buggy.
+     */
+
     return 1;
 }   /*  End Function devfs_d_revalidate_wait  */
 
@@ -2309,17 +2307,18 @@ static struct dentry *devfs_lookup (stru
 	if (try_modload (parent, fs_info,
 			 dentry->d_name.name, dentry->d_name.len, &tmp) < 0)
 	{   /*  Lookup event was not queued to devfsd  */
+	    dentry->d_fsdata = NULL;
 	    d_add (dentry, NULL);
 	    return NULL;
 	}
     }
-    dentry->d_op = &devfs_wait_dops;
     d_add (dentry, NULL);  /*  Open the floodgates  */
     /*  Unlock directory semaphore, which will release any waiters. They
 	will get the hashed dentry, and may be forced to wait for
 	revalidation  */
     up (&dir->i_sem);
     wait_for_devfsd_finished (fs_info);  /*  If I'm not devfsd, must wait  */
+    down (&dir->i_sem);      /*  Grab it again because them's the rules  */
     de = lookup_info.de;
     /*  If someone else has been so kind as to make the inode, we go home
 	early  */
@@ -2344,12 +2343,8 @@ static struct dentry *devfs_lookup (stru
 	     de->name, de->inode.ino, inode, de, current->comm);
     d_instantiate (dentry, inode);
 out:
-    write_lock (&parent->u.dir.lock);
-    dentry->d_op = &devfs_dops;
     dentry->d_fsdata = NULL;
     wake_up (&lookup_info.wait_queue);
-    write_unlock (&parent->u.dir.lock);
-    down (&dir->i_sem);      /*  Grab it again because them's the rules  */
     devfs_put (de);
     return retval;
 }   /*  End Function devfs_lookup  */
@@ -2592,6 +2587,7 @@ static struct file_system_type devfs_fs_
     .name	= DEVFS_NAME,
     .get_sb	= devfs_get_sb,
     .kill_sb	= kill_anon_super,
+    .fs_flags	= FS_ODD_REVALIDATE,
 };
 
 /*  File operations for devfsd follow  */
diff -puN fs/namei.c~devfs-d_revalidate-oops-fix fs/namei.c
--- 25/fs/namei.c~devfs-d_revalidate-oops-fix	2004-01-04 13:43:13.000000000 -0800
+++ 25-akpm/fs/namei.c	2004-01-04 13:43:13.000000000 -0800
@@ -274,6 +274,9 @@ void path_release(struct nameidata *nd)
  * Internal lookup() using the new generic dcache.
  * SMP-safe
  */
+/*
+ * This seems to always be called under parent->i_sem locked
+ */
 static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd)
 {
 	struct dentry * dentry = __d_lookup(parent, name);
@@ -342,6 +345,7 @@ static struct dentry * real_lookup(struc
 {
 	struct dentry * result;
 	struct inode *dir = parent->d_inode;
+	int needlock = dir->i_sb->s_type->fs_flags & FS_ODD_REVALIDATE;
 
 	down(&dir->i_sem);
 	/*
@@ -377,13 +381,16 @@ static struct dentry * real_lookup(struc
 	 * Uhhuh! Nasty case: the cache was re-populated while
 	 * we waited on the semaphore. Need to revalidate.
 	 */
-	up(&dir->i_sem);
+	if (!needlock)
+		up(&dir->i_sem);
 	if (result->d_op && result->d_op->d_revalidate) {
 		if (!result->d_op->d_revalidate(result, nd) && !d_invalidate(result)) {
 			dput(result);
 			result = ERR_PTR(-ENOENT);
 		}
 	}
+	if (needlock)
+		up(&dir->i_sem);
 	return result;
 }
 
@@ -528,11 +535,17 @@ static int do_lookup(struct nameidata *n
 {
 	struct vfsmount *mnt = nd->mnt;
 	struct dentry *dentry = __d_lookup(nd->dentry, name);
+	int needlock = mnt->mnt_sb->s_type->fs_flags & FS_ODD_REVALIDATE;
 
 	if (!dentry)
 		goto need_lookup;
+	if (needlock)
+		down(&nd->dentry->d_inode->i_sem);
 	if (dentry->d_op && dentry->d_op->d_revalidate)
 		goto need_revalidate;
+unlock:
+	if (needlock)
+		up(&nd->dentry->d_inode->i_sem);
 done:
 	path->mnt = mnt;
 	path->dentry = dentry;
@@ -546,9 +559,11 @@ need_lookup:
 
 need_revalidate:
 	if (dentry->d_op->d_revalidate(dentry, nd))
-		goto done;
+		goto unlock;
 	if (d_invalidate(dentry))
-		goto done;
+		goto unlock;
+	if (needlock)
+		up(&nd->dentry->d_inode->i_sem);
 	dput(dentry);
 	goto need_lookup;
 
diff -puN include/linux/fs.h~devfs-d_revalidate-oops-fix include/linux/fs.h
--- 25/include/linux/fs.h~devfs-d_revalidate-oops-fix	2004-01-04 13:43:13.000000000 -0800
+++ 25-akpm/include/linux/fs.h	2004-01-04 13:43:13.000000000 -0800
@@ -93,6 +93,7 @@ extern int leases_enable, dir_notify_ena
 #define FS_ODD_RENAME	32768	/* Temporary stuff; will go away as soon
 				  * as nfs_rename() will be cleaned up
 				  */
+#define FS_ODD_REVALIDATE	(1<<16) /* d_revalidate needs lock on i_sem */
 /*
  * These are the fs-independent mount-flags: up to 32 flags are supported
  */

_