patch-2.1.36 linux/fs/inode.c

Next file: linux/fs/isofs/dir.c
Previous file: linux/fs/hpfs/hpfs_fs.c
Back to the patch index
Back to the overall index

diff -u --recursive --new-file v2.1.35/linux/fs/inode.c linux/fs/inode.c
@@ -1,231 +1,234 @@
 /*
- *  linux/fs/inode.c
+ * linux/fs/inode.c: Keeping track of inodes.
  *
- *  Copyright (C) 1991, 1992  Linus Torvalds
+ * Copyright (C) 1991, 1992  Linus Torvalds
+ * Copyright (C) 1997 David S. Miller
  */
 
-#include <linux/stat.h>
-#include <linux/sched.h>
 #include <linux/kernel.h>
+#include <linux/sched.h>
 #include <linux/mm.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 
-#include <asm/system.h>
-
-#define NR_IHASH 512
-
-/*
- * Be VERY careful when you access the inode hash table. There
- * are some rather scary race conditions you need to take care of:
- *  - P1 tries to open file "xx", calls "iget()" with the proper
- *    inode number, but blocks because it's not on the list.
- *  - P2 deletes file "xx", gets the inode (which P1 has just read,
- *    but P1 hasn't woken up to the fact yet)
- *  - P2 iput()'s the inode, which now has i_nlink = 0
- *  - P1 wakes up and has the inode, but now P2 has made that
- *    inode invalid (but P1 has no way of knowing that).
- *
- * The "updating" counter makes sure that when P1 blocks on the
- * iget(), P2 can't delete the inode from under it because P2
- * will wait until P1 has been able to update the inode usage
- * count so that the inode will stay in use until everybody has
- * closed it..
- */
-static struct inode_hash_entry {
-	struct inode * inode;
-	int updating;
-} hash_table[NR_IHASH];
-
-static struct inode * first_inode;
-static struct wait_queue * inode_wait = NULL;
-/* Keep these next two contiguous in memory for sysctl.c */
 int nr_inodes = 0, nr_free_inodes = 0;
 int max_inodes = NR_INODE;
 
-static inline int const hashfn(kdev_t dev, unsigned int i)
-{
-	return (HASHDEV(dev) ^ i) % NR_IHASH;
-}
+#define INODE_HASHSZ	1024
 
-static inline struct inode_hash_entry * const hash(kdev_t dev, int i)
-{
-	return hash_table + hashfn(dev, i);
-}
+static struct inode *inode_hash[INODE_HASHSZ];
 
-static inline void insert_inode_free(struct inode *inode)
-{
-	struct inode * prev, * next = first_inode;
+/* All the details of hashing and lookup. */
+#define hashfn(dev, i) ((HASHDEV(dev) + ((i) ^ ((i) >> 10))) & (INODE_HASHSZ - 1))
 
-	first_inode = inode;
-	prev = next->i_prev;
-	inode->i_next = next;
-	inode->i_prev = prev;
-	prev->i_next = inode;
-	next->i_prev = inode;
+__inline__ void insert_inode_hash(struct inode *inode)
+{
+	struct inode **htable = &inode_hash[hashfn(inode->i_dev, inode->i_ino)];
+	if((inode->i_hash_next = *htable) != NULL)
+		(*htable)->i_hash_pprev = &inode->i_hash_next;
+	*htable = inode;
+	inode->i_hash_pprev = htable;
 }
 
-static inline void remove_inode_free(struct inode *inode)
+#define hash_inode(inode) insert_inode_hash(inode)
+
+static inline void unhash_inode(struct inode *inode)
 {
-	if (first_inode == inode)
-		first_inode = first_inode->i_next;
-	if (inode->i_next)
-		inode->i_next->i_prev = inode->i_prev;
-	if (inode->i_prev)
-		inode->i_prev->i_next = inode->i_next;
-	inode->i_next = inode->i_prev = NULL;
+	if(inode->i_hash_pprev) {
+		if(inode->i_hash_next)
+			inode->i_hash_next->i_hash_pprev = inode->i_hash_pprev;
+		*(inode->i_hash_pprev) = inode->i_hash_next;
+		inode->i_hash_pprev = NULL;
+	}
 }
 
-void insert_inode_hash(struct inode *inode)
+static inline struct inode *find_inode(unsigned int hashent,
+				       kdev_t dev, unsigned long ino)
 {
-	struct inode_hash_entry *h;
-	h = hash(inode->i_dev, inode->i_ino);
+	struct inode *inode;
 
-	inode->i_hash_next = h->inode;
-	inode->i_hash_prev = NULL;
-	if (inode->i_hash_next)
-		inode->i_hash_next->i_hash_prev = inode;
-	h->inode = inode;
+	for(inode = inode_hash[hashent]; inode; inode = inode->i_hash_next)
+		if(inode->i_dev == dev && inode->i_ino == ino)
+			break;
+	return inode;
 }
 
-static inline void remove_inode_hash(struct inode *inode)
-{
-	struct inode_hash_entry *h;
-	h = hash(inode->i_dev, inode->i_ino);
-
-	if (h->inode == inode)
-		h->inode = inode->i_hash_next;
-	if (inode->i_hash_next)
-		inode->i_hash_next->i_hash_prev = inode->i_hash_prev;
-	if (inode->i_hash_prev)
-		inode->i_hash_prev->i_hash_next = inode->i_hash_next;
-	inode->i_hash_prev = inode->i_hash_next = NULL;
+/* Free list queue and management. */
+static struct free_inode_queue {
+	struct inode *head;
+	struct inode **last;
+} free_inodes = { NULL, &free_inodes.head };
+
+static inline void put_inode_head(struct inode *inode)
+{
+	if((inode->i_next = free_inodes.head) != NULL)
+		free_inodes.head->i_pprev = &inode->i_next;
+	else
+		free_inodes.last = &inode->i_next;
+	free_inodes.head = inode;
+	inode->i_pprev = &free_inodes.head;
+	nr_free_inodes++;
 }
 
-static inline void put_last_free(struct inode *inode)
+static inline void put_inode_last(struct inode *inode)
 {
-	remove_inode_free(inode);
-	inode->i_prev = first_inode->i_prev;
-	inode->i_prev->i_next = inode;
-	inode->i_next = first_inode;
-	inode->i_next->i_prev = inode;
+	inode->i_next = NULL;
+	inode->i_pprev = free_inodes.last;
+	*free_inodes.last = inode;
+	free_inodes.last = &inode->i_next;
+	nr_free_inodes++;
 }
 
-int grow_inodes(void)
+static inline void remove_free_inode(struct inode *inode)
 {
-	struct inode * inode;
-	int i;
-
-	if (!(inode = (struct inode*) get_free_page(GFP_KERNEL)))
-		return -ENOMEM;
-
-	i=PAGE_SIZE / sizeof(struct inode);
-	nr_inodes += i;
-	nr_free_inodes += i;
+	if(inode->i_pprev) {
+		if(inode->i_next)
+			inode->i_next->i_pprev = inode->i_pprev;
+		else
+			free_inodes.last = inode->i_pprev;
+		*inode->i_pprev = inode->i_next;
+		inode->i_pprev = NULL;
+		nr_free_inodes--;
+	}
+}
 
-	if (!first_inode)
-		inode->i_next = inode->i_prev = first_inode = inode++, i--;
+/* This is the in-use queue, if i_count > 0 (as far as we can tell)
+ * the sucker is here.
+ */
+static struct inode *inuse_list = NULL;
 
-	for ( ; i ; i-- )
-		insert_inode_free(inode++);
-	return 0;
+static inline void put_inuse(struct inode *inode)
+{
+	if((inode->i_next = inuse_list) != NULL)
+		inuse_list->i_pprev = &inode->i_next;
+	inuse_list = inode;
+	inode->i_pprev = &inuse_list;
 }
 
-unsigned long inode_init(unsigned long start, unsigned long end)
+static inline void remove_inuse(struct inode *inode)
 {
-	memset(hash_table, 0, sizeof(hash_table));
-	first_inode = NULL;
-	return start;
+	if(inode->i_pprev) {
+		if(inode->i_next)
+			inode->i_next->i_pprev = inode->i_pprev;
+		*inode->i_pprev = inode->i_next;
+		inode->i_pprev = NULL;
+	}
 }
 
+/* Locking and unlocking inodes, plus waiting for locks to clear. */
 static void __wait_on_inode(struct inode *);
 
-static inline void wait_on_inode(struct inode * inode)
+static inline void wait_on_inode(struct inode *inode)
 {
-	if (inode->i_lock)
+	if(inode->i_lock)
 		__wait_on_inode(inode);
 }
 
-static inline void lock_inode(struct inode * inode)
+static inline void lock_inode(struct inode *inode)
 {
-	wait_on_inode(inode);
+	if(inode->i_lock)
+		__wait_on_inode(inode);
 	inode->i_lock = 1;
 }
 
-static inline void unlock_inode(struct inode * inode)
+static inline void unlock_inode(struct inode *inode)
 {
 	inode->i_lock = 0;
 	wake_up(&inode->i_wait);
 }
 
-/*
- * Note that we don't want to disturb any wait-queues when we discard
- * an inode.
- *
- * Argghh. Got bitten by a gcc problem with inlining: no way to tell
- * the compiler that the inline asm function 'memset' changes 'inode'.
- * I've been searching for the bug for days, and was getting desperate.
- * Finally looked at the assembler output... Grrr.
- *
- * The solution is the weird use of 'volatile'. Ho humm. Have to report
- * it to the gcc lists, and hope we can do this more cleanly some day..
- */
-void clear_inode(struct inode * inode)
+static void __wait_on_inode(struct inode * inode)
+{
+	struct wait_queue wait = { current, NULL };
+
+	add_wait_queue(&inode->i_wait, &wait);
+repeat:
+	current->state = TASK_UNINTERRUPTIBLE;
+	if (inode->i_lock) {
+		schedule();
+		goto repeat;
+	}
+	remove_wait_queue(&inode->i_wait, &wait);
+	current->state = TASK_RUNNING;
+}
+
+/* Clear an inode of all it's identity, this is exported to the world. */
+void clear_inode(struct inode *inode)
 {
-	struct wait_queue * wait;
+	struct wait_queue *wait;
 
+	/* So we don't disappear. */
 	inode->i_count++;
+
 	truncate_inode_pages(inode, 0);
 	wait_on_inode(inode);
-	if (IS_WRITABLE(inode)) {
-		if (inode->i_sb && inode->i_sb->dq_op)
-			inode->i_sb->dq_op->drop(inode);
-	}
-	remove_inode_hash(inode);
-	remove_inode_free(inode);
-	wait = ((volatile struct inode *) inode)->i_wait;
-	inode->i_count--;
-	if (inode->i_count)
-		nr_free_inodes++;
-	memset(inode,0,sizeof(*inode));
-	((volatile struct inode *) inode)->i_wait = wait;
-	insert_inode_free(inode);
-}
+	if(IS_WRITABLE(inode) && inode->i_sb && inode->i_sb->dq_op)
+		inode->i_sb->dq_op->drop(inode);
 
+	if(--inode->i_count > 0)
+		remove_inuse(inode);
+	else
+		remove_free_inode(inode);
+	unhash_inode(inode);
+	wait = inode->i_wait;
+	memset(inode, 0, sizeof(*inode)); barrier();
+	inode->i_wait = wait;
+	put_inode_head(inode);	/* Pages zapped, put at the front. */
+}
+
+/* These check the validity of a mount/umount type operation, we essentially
+ * check if there are any inodes hanging around which prevent this operation
+ * from occurring.  We also clear out clean inodes referencing this device.
+ */
 int fs_may_mount(kdev_t dev)
 {
-	struct inode * inode, * next;
-	int i;
+	struct inode *inode;
+	int pass = 0;
 
-	next = first_inode;
-	for (i = nr_inodes ; i > 0 ; i--) {
-		inode = next;
-		next = inode->i_next;	/* clear_inode() changes the queues.. */
-		if (inode->i_dev != dev)
-			continue;
-		if (inode->i_count || inode->i_dirt || inode->i_lock)
+	inode = free_inodes.head;
+repeat:
+	while(inode) {
+		struct inode *next = inode->i_next;
+		if(inode->i_dev != dev)
+			goto next;
+		if(inode->i_count || inode->i_dirt || inode->i_lock)
 			return 0;
 		clear_inode(inode);
+	next:
+		inode = next;
+	}
+	if(pass == 0) {
+		inode = inuse_list;
+		pass = 1;
+		goto repeat;
 	}
-	return 1;
+	return 1; /* Tis' cool bro. */
 }
 
-int fs_may_umount(kdev_t dev, struct inode * mount_root)
+int fs_may_umount(kdev_t dev, struct inode *iroot)
 {
-	struct inode * inode;
-	int i;
+	struct inode *inode;
+	int pass = 0;
 
-	inode = first_inode;
-	for (i=0 ; i < nr_inodes ; i++, inode = inode->i_next) {
-		if (inode->i_dev != dev || !inode->i_count)
+	inode = free_inodes.head;
+repeat:
+	for(; inode; inode = inode->i_next) {
+		if(inode->i_dev != dev || !inode->i_count)
 			continue;
-		if (inode == mount_root && inode->i_count ==
-		    (inode->i_mount != inode ? 1 : 2))
+		if(inode == iroot &&
+		   (inode->i_count == (inode->i_mount == inode ? 2 : 1)))
 			continue;
 		return 0;
 	}
-	return 1;
+	if(pass == 0) {
+		inode = inuse_list;
+		pass = 1;
+		goto repeat;
+	}
+	return 1; /* Tis' cool bro. */
 }
 
+/* This belongs in file_table.c, not here... */
 int fs_may_remount_ro(kdev_t dev)
 {
 	struct file * file;
@@ -239,79 +242,70 @@
 		if (S_ISREG(file->f_inode->i_mode) && (file->f_mode & 2))
 			return 0;
 	}
-	return 1;
+	return 1; /* Tis' cool bro. */
 }
 
-static void write_inode(struct inode * inode)
+/* Reading/writing inodes. */
+static void write_inode(struct inode *inode)
 {
-	if (!inode->i_dirt)
-		return;
-	wait_on_inode(inode);
-	if (!inode->i_dirt)
-		return;
-	if (!inode->i_sb || !inode->i_sb->s_op || !inode->i_sb->s_op->write_inode) {
-		inode->i_dirt = 0;
-		return;
+	if(inode->i_dirt) {
+		wait_on_inode(inode);
+		if(inode->i_dirt) {
+			if(inode->i_sb		&&
+			   inode->i_sb->s_op	&&
+			   inode->i_sb->s_op->write_inode) {
+				inode->i_lock = 1;
+				inode->i_sb->s_op->write_inode(inode);
+				unlock_inode(inode);
+			} else {
+				inode->i_dirt = 0;
+			}
+		}
 	}
-	inode->i_lock = 1;	
-	inode->i_sb->s_op->write_inode(inode);
-	unlock_inode(inode);
 }
 
-static inline void read_inode(struct inode * inode)
+static inline void read_inode(struct inode *inode)
 {
-	lock_inode(inode);
-	if (inode->i_sb && inode->i_sb->s_op && inode->i_sb->s_op->read_inode)
+	if(inode->i_sb		&&
+	   inode->i_sb->s_op	&&
+	   inode->i_sb->s_op->read_inode) {
+		lock_inode(inode);
 		inode->i_sb->s_op->read_inode(inode);
-	unlock_inode(inode);
+		unlock_inode(inode);
+	}
 }
 
-/* POSIX UID/GID verification for setting inode attributes */
 int inode_change_ok(struct inode *inode, struct iattr *attr)
 {
-	/*
-	 *	If force is set do it anyway.
-	 */
-	 
-	if (attr->ia_valid & ATTR_FORCE)
-		return 0;
-
-	/* Make sure a caller can chown */
-	if ((attr->ia_valid & ATTR_UID) &&
-	    (current->fsuid != inode->i_uid ||
-	     attr->ia_uid != inode->i_uid) && !fsuser())
-		return -EPERM;
-
-	/* Make sure caller can chgrp */
-	if ((attr->ia_valid & ATTR_GID) &&
-	    (!in_group_p(attr->ia_gid) && attr->ia_gid != inode->i_gid) &&
-	    !fsuser())
-		return -EPERM;
+	if(!(attr->ia_valid & ATTR_FORCE)) {
+		unsigned short fsuid = current->fsuid;
+		uid_t iuid = inode->i_uid;
+		int not_fsuser = !fsuser();
+
+		if(((attr->ia_valid & ATTR_UID) &&
+		    ((fsuid != iuid) ||
+		     (attr->ia_uid != iuid)) && not_fsuser)	||
+
+		   ((attr->ia_valid & ATTR_GID) &&
+		    (!in_group_p(attr->ia_gid) &&
+		     (attr->ia_gid != inode->i_gid)))		||
 
-	/* Make sure a caller can chmod */
-	if (attr->ia_valid & ATTR_MODE) {
-		if ((current->fsuid != inode->i_uid) && !fsuser())
+		   ((attr->ia_valid & (ATTR_ATIME_SET | ATTR_MTIME_SET)) &&
+		    (fsuid != iuid) && not_fsuser))
 			return -EPERM;
-		/* Also check the setgid bit! */
-		if (!fsuser() && !in_group_p((attr->ia_valid & ATTR_GID) ? attr->ia_gid :
-					     inode->i_gid))
-			attr->ia_mode &= ~S_ISGID;
-	}
 
-	/* Check for setting the inode time */
-	if ((attr->ia_valid & ATTR_ATIME_SET) &&
-	    ((current->fsuid != inode->i_uid) && !fsuser()))
-		return -EPERM;
-	if ((attr->ia_valid & ATTR_MTIME_SET) &&
-	    ((current->fsuid != inode->i_uid) && !fsuser()))
-		return -EPERM;
+		if(attr->ia_valid & ATTR_MODE) {
+			gid_t grp;
+			if(fsuid != iuid && not_fsuser)
+				return -EPERM;
+			grp = attr->ia_valid & ATTR_GID ? attr->ia_gid : inode->i_gid;
+			if(not_fsuser && !in_group_p(grp))
+				attr->ia_mode &= ~S_ISGID;
+		}
+	}
 	return 0;
 }
 
-/*
- * Set the appropriate attributes from an attribute structure into
- * the inode structure.
- */
 void inode_setattr(struct inode *inode, struct iattr *attr)
 {
 	if (attr->ia_valid & ATTR_UID)
@@ -334,17 +328,8 @@
 	inode->i_dirt = 1;
 }
 
-/*
- * notify_change is called for inode-changing operations such as
- * chown, chmod, utime, and truncate.  It is guaranteed (unlike
- * write_inode) to be called from the context of the user requesting
- * the change.
- */
-
-int notify_change(struct inode * inode, struct iattr *attr)
+int notify_change(struct inode *inode, struct iattr *attr)
 {
-	int retval;
-
 	attr->ia_ctime = CURRENT_TIME;
 	if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME)) {
 		if (!(attr->ia_valid & ATTR_ATIME_SET))
@@ -353,303 +338,320 @@
 			attr->ia_mtime = attr->ia_ctime;
 	}
 
-	if (inode->i_sb && inode->i_sb->s_op  &&
+	if (inode->i_sb		&&
+	    inode->i_sb->s_op	&&
 	    inode->i_sb->s_op->notify_change) 
 		return inode->i_sb->s_op->notify_change(inode, attr);
 
-	if ((retval = inode_change_ok(inode, attr)) != 0)
-		return retval;
+	if(inode_change_ok(inode, attr) != 0)
+		return -EPERM;
 
 	inode_setattr(inode, attr);
 	return 0;
 }
 
-/*
- * bmap is needed for demand-loading and paging: if this function
- * doesn't exist for a filesystem, then those things are impossible:
- * executables cannot be run from the filesystem etc...
- *
- * This isn't as bad as it sounds: the read-routines might still work,
- * so the filesystem would be otherwise ok (for example, you might have
- * a DOS filesystem, which doesn't lend itself to bmap very well, but
- * you could still transfer files to/from the filesystem)
- */
-int bmap(struct inode * inode, int block)
+int bmap(struct inode *inode, int block)
 {
-	if (inode->i_op && inode->i_op->bmap)
-		return inode->i_op->bmap(inode,block);
+	if(inode->i_op && inode->i_op->bmap)
+		return inode->i_op->bmap(inode, block);
 	return 0;
 }
 
 void invalidate_inodes(kdev_t dev)
 {
-	struct inode * inode, * next;
-	int i;
+	struct inode *inode;
+	int pass = 0;
 
-	next = first_inode;
-	for(i = nr_inodes ; i > 0 ; i--) {
-		inode = next;
-		next = inode->i_next;		/* clear_inode() changes the queues.. */
-		if (inode->i_dev != dev)
-			continue;
-		if (inode->i_count || inode->i_dirt || inode->i_lock) {
-			printk("VFS: inode busy on removed device %s\n",
-			       kdevname(dev));
-			continue;
-		}
+	inode = free_inodes.head;
+repeat:
+	while(inode) {
+		struct inode *next = inode->i_next;
+		if(inode->i_dev != dev)
+			goto next;
 		clear_inode(inode);
+	next:
+		inode = next;
+	}
+	if(pass == 0) {
+		inode = inuse_list;
+		pass = 1;
+		goto repeat;
 	}
 }
 
 void sync_inodes(kdev_t dev)
 {
-	int i;
-	struct inode * inode;
+	struct inode *inode;
+	int pass = 0;
 
-	inode = first_inode;
-	for(i = 0; i < nr_inodes*2; i++, inode = inode->i_next) {
-		if (dev && inode->i_dev != dev)
-			continue;
+	inode = free_inodes.head;
+repeat:
+	while(inode) {
+		struct inode *next = inode->i_next;
+		if(dev && inode->i_dev != dev)
+			goto next;
 		wait_on_inode(inode);
-		if (inode->i_dirt)
-			write_inode(inode);
+		write_inode(inode);
+	next:
+		inode = next;
+	}
+	if(pass == 0) {
+		inode = inuse_list;
+		pass = 1;
+		goto repeat;
 	}
 }
 
-void iput(struct inode * inode)
+static struct wait_queue *inode_wait, *update_wait;
+
+void iput(struct inode *inode)
 {
-	if (!inode)
+	if(!inode)
 		return;
 	wait_on_inode(inode);
-	if (!inode->i_count) {
-		printk("VFS: iput: trying to free free inode\n");
-		printk("VFS: device %s, inode %lu, mode=0%07o\n",
-			kdevname(inode->i_rdev), inode->i_ino, inode->i_mode);
+	if(!inode->i_count) {
+		printk("VFS: Freeing free inode, tell DaveM\n");
 		return;
 	}
-	if (inode->i_pipe)
+	if(inode->i_pipe)
 		wake_up_interruptible(&PIPE_WAIT(*inode));
-repeat:
-	if (inode->i_count>1) {
+we_slept:
+	if(inode->i_count > 1) {
 		inode->i_count--;
-		return;
+	} else {
+		wake_up(&inode_wait);
+		if(inode->i_pipe) {
+			free_page((unsigned long)PIPE_BASE(*inode));
+			PIPE_BASE(*inode) = NULL;
+		}
+		if(inode->i_sb		&&
+		   inode->i_sb->s_op	&&
+		   inode->i_sb->s_op->put_inode) {
+			inode->i_sb->s_op->put_inode(inode);
+			if(!inode->i_nlink)
+				return;
+		}
+		if(inode->i_dirt) {
+			write_inode(inode);
+			wait_on_inode(inode);
+			goto we_slept;
+		}
+		if(IS_WRITABLE(inode)		&&
+		   inode->i_sb			&&
+		   inode->i_sb->dq_op) {
+			inode->i_lock = 1;
+			inode->i_sb->dq_op->drop(inode);
+			unlock_inode(inode);
+			goto we_slept;
+		}
+		/* There is a serious race leading to here, watch out. */
+		if(--inode->i_count == 0) {
+			remove_inuse(inode);
+			put_inode_last(inode);	/* Place at end of LRU free queue */
+		}
 	}
+}
 
-	wake_up(&inode_wait);
-	if (inode->i_pipe) {
-		unsigned long page = (unsigned long) PIPE_BASE(*inode);
-		PIPE_BASE(*inode) = NULL;
-		free_page(page);
-	}
+static kmem_cache_t *inode_cachep;
 
-	if (inode->i_sb && inode->i_sb->s_op && inode->i_sb->s_op->put_inode) {
-		inode->i_sb->s_op->put_inode(inode);
-		if (!inode->i_nlink)
+static void grow_inodes(void)
+{
+	int i = 16;
+
+	while(i--) {
+		struct inode *inode;
+		
+		inode = kmem_cache_alloc(inode_cachep, SLAB_KERNEL);
+		if(!inode)
 			return;
+		memset(inode, 0, sizeof(*inode));
+		put_inode_head(inode);
+		nr_inodes++;
 	}
+}
 
-	if (inode->i_dirt) {
-		write_inode(inode);	/* we can sleep - so do again */
-		wait_on_inode(inode);
-		goto repeat;
-	}
+/* We have to be really careful, it's really easy to run yourself into
+ * inefficient sequences of events.  The first problem is that when you
+ * steal a non-referenced inode you run the risk of zaping a considerable
+ * number of page cache entries, which might get refernced once again.
+ * But if you are growing the inode set to quickly, you suck up ram
+ * and cause other problems.
+ *
+ * We approach the problem in the following way, we take two things into
+ * consideration.  Firstly we take a look at how much we have "committed"
+ * to this inode already (i_nrpages), this accounts for the cost of getting
+ * those pages back if someone should reference that inode soon.  We also
+ * attempt to factor in i_blocks, which says "how much of a problem could
+ * this potentially be".  It still needs some tuning though.  -DaveM
+ */
+#define BLOCK_FACTOR_SHIFT	5	/* It is not factored in as much. */
+static struct inode *find_best_candidate_weighted(struct inode *inode)
+{
+	struct inode *best = NULL;
 
-	if (IS_WRITABLE(inode)) {
-		if (inode->i_sb && inode->i_sb->dq_op) {
-			/* Here we can sleep also. Let's do it again
-			 * Dmitry Gorodchanin 02/11/96 
-			 */
-			inode->i_lock = 1;
-			inode->i_sb->dq_op->drop(inode);
-			unlock_inode(inode);
-			goto repeat;
-		}
+	if(inode) {
+		unsigned long bestscore = 1000;
+		int limit = nr_free_inodes >> 2;
+		do {
+			if(!(inode->i_lock | inode->i_dirt)) {
+				int myscore = inode->i_nrpages;
+
+				myscore += (inode->i_blocks >> BLOCK_FACTOR_SHIFT);
+				if(myscore < bestscore) {
+					bestscore = myscore;
+					best = inode;
+				}
+			}
+			inode = inode->i_next;
+		} while(inode && --limit);
 	}
-	
-	inode->i_count--;
+	return best;
+}
 
-	if (inode->i_mmap) {
-		printk("iput: inode %lu on device %s still has mappings.\n",
-			inode->i_ino, kdevname(inode->i_dev));
-		inode->i_mmap = NULL;
+static inline struct inode *find_best_free(struct inode *inode)
+{
+	if(inode) {
+		int limit = nr_free_inodes >> 5;
+		do {
+			if(!inode->i_nrpages)
+				return inode;
+			inode = inode->i_next;
+		} while(inode && --limit);
 	}
-
-	nr_free_inodes++;
-	return;
+	return NULL;
 }
 
-struct inode * get_empty_inode(void)
+struct inode *get_empty_inode(void)
 {
 	static int ino = 0;
-	struct inode * inode, * best;
-	unsigned long badness;
-	int i;
+	struct inode *inode;
 
-	if (nr_inodes < max_inodes && nr_free_inodes < (nr_inodes >> 1))
-		grow_inodes();
 repeat:
-	inode = first_inode;
-	best = NULL;
-	badness = 1000;
-	for (i = nr_inodes/2; i > 0; i--,inode = inode->i_next) {
-		if (!inode->i_count) {
-			unsigned long i = 999;
-			if (!(inode->i_lock | inode->i_dirt))
-				i = inode->i_nrpages;
-			if (i < badness) {
-				best = inode;
-				if (!i)
-					goto found_good;
-				badness = i;
-			}
-		}
-	}
-	if (nr_inodes < max_inodes) {
-		if (grow_inodes() == 0)
-			goto repeat;
-		best = NULL;
-	}
-	if (!best) {
-		printk("VFS: No free inodes - contact Linus\n");
-		sleep_on(&inode_wait);
+	inode = find_best_free(free_inodes.head);
+	if(!inode)
+		goto pressure;
+got_it:
+	inode->i_count++;
+	truncate_inode_pages(inode, 0);
+	wait_on_inode(inode);
+	if(IS_WRITABLE(inode) && inode->i_sb && inode->i_sb->dq_op)
+		inode->i_sb->dq_op->drop(inode);
+	unhash_inode(inode);
+	remove_free_inode(inode);
+
+	memset(inode, 0, sizeof(*inode));
+	inode->i_count = 1;
+	inode->i_nlink = 1;
+	inode->i_version = ++event;
+	sema_init(&inode->i_sem, 1);
+	inode->i_ino = ++ino;
+	inode->i_dev = 0;
+	put_inuse(inode);
+	return inode;
+pressure:
+	if(nr_inodes < max_inodes) {
+		grow_inodes();
 		goto repeat;
 	}
-	if (best->i_lock) {
-		wait_on_inode(best);
+	inode = find_best_candidate_weighted(free_inodes.head);
+	if(!inode) {
+		printk("VFS: No free inodes, contact DaveM\n");
+		sleep_on(&inode_wait);
 		goto repeat;
 	}
-	if (best->i_dirt) {
-		write_inode(best);
+	if(inode->i_lock) {
+		wait_on_inode(inode);
 		goto repeat;
-	}
-	if (best->i_count)
+	} else if(inode->i_dirt) {
+		write_inode(inode);
 		goto repeat;
-found_good:
-	clear_inode(best);
-	best->i_count = 1;
-	best->i_nlink = 1;
-	best->i_version = ++event;
-	sema_init(&best->i_sem, 1);
-	best->i_ino = ++ino;
-	best->i_dev = 0;
-	nr_free_inodes--;
-	if (nr_free_inodes < 0) {
-		printk ("VFS: get_empty_inode: bad free inode count.\n");
-		nr_free_inodes = 0;
 	}
-	return best;
+	goto got_it;
 }
 
-struct inode * get_pipe_inode(void)
+struct inode *get_pipe_inode(void)
 {
-	struct inode * inode;
 	extern struct inode_operations pipe_inode_operations;
+	struct inode *inode = get_empty_inode();
 
-	if (!(inode = get_empty_inode()))
-		return NULL;
-	if (!(PIPE_BASE(*inode) = (char*) __get_free_page(GFP_USER))) {
-		iput(inode);
-		return NULL;
-	}
-	inode->i_op = &pipe_inode_operations;
-	inode->i_count = 2;	/* sum of readers/writers */
-	PIPE_WAIT(*inode) = NULL;
-	PIPE_START(*inode) = PIPE_LEN(*inode) = 0;
-	PIPE_RD_OPENERS(*inode) = PIPE_WR_OPENERS(*inode) = 0;
-	PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 1;
-	PIPE_LOCK(*inode) = 0;
-	inode->i_pipe = 1;
-	inode->i_mode |= S_IFIFO | S_IRUSR | S_IWUSR;
-	inode->i_uid = current->fsuid;
-	inode->i_gid = current->fsgid;
-	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-	inode->i_blksize = PAGE_SIZE;
+	if(inode) {
+		unsigned long page = __get_free_page(GFP_USER);
+		if(!page) {
+			iput(inode);
+			inode = NULL;
+		} else {
+			PIPE_BASE(*inode) = (char *) page;
+			inode->i_op = &pipe_inode_operations;
+			inode->i_count = 2;
+			PIPE_WAIT(*inode) = NULL;
+			PIPE_START(*inode) = PIPE_LEN(*inode) = 0;
+			PIPE_RD_OPENERS(*inode) = PIPE_WR_OPENERS(*inode) = 0;
+			PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 1;
+			PIPE_LOCK(*inode) = 0;
+			inode->i_pipe = 1;
+			inode->i_mode |= S_IFIFO | S_IRUSR | S_IWUSR;
+			inode->i_uid = current->fsuid;
+			inode->i_gid = current->fsgid;
+			inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+			inode->i_blksize = PAGE_SIZE;
+		}
+	}
 	return inode;
 }
 
-struct inode *__iget(struct super_block * sb, int nr, int crossmntp)
+static int inode_updating[INODE_HASHSZ];
+
+struct inode *__iget(struct super_block *sb, int nr, int crossmntp)
 {
-	static struct wait_queue * update_wait = NULL;
-	struct inode_hash_entry * h;
-	struct inode * inode;
-	struct inode * empty = NULL;
-
-	if (!sb)
-		panic("VFS: iget with sb==NULL");
-	h = hash(sb->s_dev, nr);
-repeat:
-	for (inode = h->inode; inode ; inode = inode->i_hash_next)
-		if (inode->i_dev == sb->s_dev && inode->i_ino == nr)
-			goto found_it;
-	if (!empty) {
-		/*
-		 * If we sleep here before we have found an inode
-		 * we need to make sure nobody does anything bad
-		 * to the inode while we sleep, because otherwise
-		 * we may return an inode that is not valid any
-		 * more when we wake up..
-		 */
-		h->updating++;
-		empty = get_empty_inode();
-		if (!--h->updating)
-			wake_up(&update_wait);
-		if (empty)
-			goto repeat;
-		return (NULL);
-	}
-	inode = empty;
-	inode->i_sb = sb;
-	inode->i_dev = sb->s_dev;
-	inode->i_ino = nr;
-	inode->i_flags = sb->s_flags;
-	put_last_free(inode);
-	insert_inode_hash(inode);
-	read_inode(inode);
-	goto return_it;
+	unsigned int hashent = hashfn(sb->s_dev, nr);
+	struct inode *inode, *empty = NULL;
 
-found_it:
-	if (!inode->i_count)
-		nr_free_inodes--;
-	inode->i_count++;
-	wait_on_inode(inode);
-	if (inode->i_dev != sb->s_dev || inode->i_ino != nr) {
-		printk("Whee.. inode changed from under us. Tell Linus\n");
-		iput(inode);
-		goto repeat;
-	}
-	if (crossmntp && inode->i_mount) {
-		struct inode * tmp = inode->i_mount;
-		tmp->i_count++;
-		iput(inode);
-		inode = tmp;
+we_slept:
+	if((inode = find_inode(hashent, sb->s_dev, nr)) == NULL) {
+		if(empty == NULL) {
+			inode_updating[hashent]++;
+			empty = get_empty_inode();
+			if(!--inode_updating[hashent])
+				wake_up(&update_wait);
+			goto we_slept;
+		}
+		inode = empty;
+		inode->i_sb = sb;
+		inode->i_dev = sb->s_dev;
+		inode->i_ino = nr;
+		inode->i_flags = sb->s_flags;
+		hash_inode(inode);
+		read_inode(inode);
+	} else {
+		if(!inode->i_count++) {
+			remove_free_inode(inode);
+			put_inuse(inode);
+		}
 		wait_on_inode(inode);
+		if(crossmntp && inode->i_mount) {
+			struct inode *mp = inode->i_mount;
+			mp->i_count++;
+			iput(inode);
+			wait_on_inode(inode = mp);
+		}
+		if(empty)
+			iput(empty);
 	}
-	if (empty)
-		iput(empty);
-
-return_it:
-	while (h->updating)
+	while(inode_updating[hashent])
 		sleep_on(&update_wait);
 	return inode;
 }
 
-/*
- * The "new" scheduling primitives (new as of 0.97 or so) allow this to
- * be done without disabling interrupts (other than in the actual queue
- * updating things: only a couple of 386 instructions). This should be
- * much better for interrupt latency.
- */
-static void __wait_on_inode(struct inode * inode)
+void inode_init(void)
 {
-	struct wait_queue wait = { current, NULL };
+	int i;
 
-	add_wait_queue(&inode->i_wait, &wait);
-repeat:
-	current->state = TASK_UNINTERRUPTIBLE;
-	if (inode->i_lock) {
-		schedule();
-		goto repeat;
-	}
-	remove_wait_queue(&inode->i_wait, &wait);
-	current->state = TASK_RUNNING;
+	inode_cachep = kmem_cache_create("inode", sizeof(struct inode),
+					 sizeof(unsigned long) * 4,
+					 SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if(!inode_cachep)
+		panic("Cannot create inode SLAB cache\n");
+
+	for(i = 0; i < INODE_HASHSZ; i++)
+		inode_hash[i] = NULL;
 }

FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen, slshen@lbl.gov