patch-2.1.43 linux/fs/inode.c

Next file: linux/fs/isofs/dir.c
Previous file: linux/fs/hpfs/hpfs_fs.c
Back to the patch index
Back to the overall index

diff -u --recursive --new-file v2.1.42/linux/fs/inode.c linux/fs/inode.c
@@ -1,657 +1,708 @@
 /*
- * linux/fs/inode.c: Keeping track of inodes.
+ * fs/inode.c
  *
- * Copyright (C) 1991, 1992  Linus Torvalds
- * Copyright (C) 1997 David S. Miller
+ * Complete reimplementation
+ * (C) 1997 Thomas Schoebel-Theuer
  */
 
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
+/* Everything here is intended to be MP-safe. However, other parts
+ * of the kernel are not yet MP-safe, in particular the inode->i_count++
+ * operations that are spread everywhere. These should be replaced by
+ * iinc() as soon as possible. Since I have no MP machine, I could
+ * not test it.
+ */
+#include <linux/config.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
 #include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/dlists.h>
+#include <linux/dalloc.h>
+#include <linux/omirr.h>
+
+/* #define DEBUG */
+
+#define HASH_SIZE 1024 /* must be a power of 2 */
+#define NR_LEVELS 4
+
+#define ST_AGED      1
+#define ST_HASHED    2
+#define ST_EMPTY     4
+#define ST_TO_READ   8
+#define ST_TO_WRITE 16
+#define ST_TO_PUT   32
+#define ST_TO_DROP  64
+#define ST_IO       (ST_TO_READ|ST_TO_WRITE|ST_TO_PUT|ST_TO_DROP)
+#define ST_WAITING 128
+#define ST_FREEING 256
+#define ST_IBASKET 512
+
+/* The idea is to keep empty inodes in a separate list, so no search
+ * is required as long as empty inodes exist.
+ * All reusable inodes occurring in the hash table with i_count==0
+ * are also registered in the ringlist aged_i[level], but in LRU order.
+ * Used inodes with i_count>0 are kept solely in the hashtable and in
+ * all_i, but in no other list.
+ * The level is used for multilevel aging to avoid thrashing; each
+ * time i_count decreases to 0, the inode is inserted into the next level
+ * ringlist. Cache reusage is simply by taking the _last_ element from the
+ * lowest-level ringlist that contains inodes.
+ * In contrast to the old code, there isn't any O(n) search overhead now
+ * in iget/iput (if you make HASH_SIZE large enough).
+ */
+static struct inode * hashtable[HASH_SIZE];/* linked with i_hash_{next,prev} */
+static struct inode * all_i = NULL;        /* linked with i_{next,prev} */
+static struct inode * empty_i = NULL;      /* linked with i_{next,prev} */
+static struct inode * aged_i[NR_LEVELS+1]; /* linked with i_lru_{next,prev} */
+static int aged_reused[NR_LEVELS+1];       /* # removals from aged_i[level] */
+static int age_table[NR_LEVELS+1] = { /* You may tune this. */
+	1, 4, 10, 100, 1000
+}; /* after which # of uses to increase to the next level */
+
+/* This is for kernel/sysctl.c */
+
+/* Just aligning plain ints and arrays thereof doesn't work reliably.. */
+struct {
+	int nr_inodes;
+	int nr_free_inodes;
+	int aged_count[NR_LEVELS+1];        /* # in each level */
+} inodes_stat;
 
-int nr_inodes = 0, nr_free_inodes = 0;
 int max_inodes = NR_INODE;
+unsigned long last_inode = 0;
 
-#define INODE_HASHSZ	1024
-
-static struct inode *inode_hash[INODE_HASHSZ];
-
-/* All the details of hashing and lookup. */
-#define hashfn(dev, i) ((HASHDEV(dev) + ((i) ^ ((i) >> 10))) & (INODE_HASHSZ - 1))
-
-__inline__ void insert_inode_hash(struct inode *inode)
-{
-	struct inode **htable = &inode_hash[hashfn(inode->i_dev, inode->i_ino)];
-	if((inode->i_hash_next = *htable) != NULL)
-		(*htable)->i_hash_pprev = &inode->i_hash_next;
-	*htable = inode;
-	inode->i_hash_pprev = htable;
-}
-
-#define hash_inode(inode) insert_inode_hash(inode)
-
-static inline void unhash_inode(struct inode *inode)
-{
-	if(inode->i_hash_pprev) {
-		if(inode->i_hash_next)
-			inode->i_hash_next->i_hash_pprev = inode->i_hash_pprev;
-		*(inode->i_hash_pprev) = inode->i_hash_next;
-		inode->i_hash_pprev = NULL;
-	}
-}
-
-static inline struct inode *find_inode(unsigned int hashent,
-				       kdev_t dev, unsigned long ino)
+void inode_init(void)
 {
-	struct inode *inode;
-
-	for(inode = inode_hash[hashent]; inode; inode = inode->i_hash_next)
-		if(inode->i_dev == dev && inode->i_ino == ino)
-			break;
-	return inode;
+	memset(hashtable, 0, sizeof(hashtable));
+	memset(aged_i, 0, sizeof(aged_i));
+	memset(aged_reused, 0, sizeof(aged_reused));
+	memset(&inodes_stat, 0, sizeof(inodes_stat));
 }
 
-/* Free list queue and management. */
-static struct free_inode_queue {
-	struct inode *head;
-	struct inode **last;
-} free_inodes = { NULL, &free_inodes.head };
-
-static inline void put_inode_head(struct inode *inode)
-{
-	if((inode->i_next = free_inodes.head) != NULL)
-		free_inodes.head->i_pprev = &inode->i_next;
-	else
-		free_inodes.last = &inode->i_next;
-	free_inodes.head = inode;
-	inode->i_pprev = &free_inodes.head;
-	nr_free_inodes++;
-}
+/* Intended for short locks of the above global data structures.
+ * Could be replaced with spinlocks completely, since there is
+ * no blocking during manipulation of the static data; however the
+ * lock in invalidate_inodes() may last relatively long.
+ */
+#ifdef __SMP__
+struct semaphore vfs_sem = MUTEX;
+#endif
+
+DEF_INSERT(all,struct inode,i_next,i_prev)
+DEF_REMOVE(all,struct inode,i_next,i_prev)
+	
+DEF_INSERT(lru,struct inode,i_lru_next,i_lru_prev)
+DEF_REMOVE(lru,struct inode,i_lru_next,i_lru_prev)
+
+DEF_INSERT(hash,struct inode,i_hash_next,i_hash_prev)
+DEF_REMOVE(hash,struct inode,i_hash_next,i_hash_prev)
+
+DEF_INSERT(ibasket,struct inode,i_basket_next,i_basket_prev)
+DEF_REMOVE(ibasket,struct inode,i_basket_next,i_basket_prev)
+
+#ifdef DEBUG
+extern void printpath(struct dentry * entry);
+struct inode * xtst[15000];
+int xcnt = 0;
 
-static inline void put_inode_last(struct inode *inode)
+void xcheck(char * txt, struct inode * p)
 {
-	inode->i_next = NULL;
-	inode->i_pprev = free_inodes.last;
-	*free_inodes.last = inode;
-	free_inodes.last = &inode->i_next;
-	nr_free_inodes++;
+	int i;
+	for(i=xcnt-1; i>=0; i--)
+		if(xtst[i] == p)
+			return;
+	printk("Bogus inode %p in %s\n", p, txt);
 }
+#else
+#define xcheck(t,p) /*nothing*/
+#endif
 
-static inline void remove_free_inode(struct inode *inode)
+static inline struct inode * grow_inodes(void)
 {
-	if(inode->i_pprev) {
-		if(inode->i_next)
-			inode->i_next->i_pprev = inode->i_pprev;
-		else
-			free_inodes.last = inode->i_pprev;
-		*inode->i_pprev = inode->i_next;
-		inode->i_pprev = NULL;
-		nr_free_inodes--;
+	struct inode * res;
+	struct inode * inode = res = (struct inode*)__get_free_page(GFP_KERNEL);
+	int size = PAGE_SIZE;
+	if(!inode)
+		return NULL;
+	
+	size -= sizeof(struct inode);
+	inode++;
+	inodes_stat.nr_inodes++;
+#ifdef DEBUG
+xtst[xcnt++]=res;
+#endif
+	while(size >= sizeof(struct inode)) {
+#ifdef DEBUG
+xtst[xcnt++]=inode;
+#endif
+		inodes_stat.nr_inodes++;
+		inodes_stat.nr_free_inodes++;
+		insert_all(&empty_i, inode);
+		inode->i_status = ST_EMPTY;
+		inode++;
+		size -= sizeof(struct inode);
 	}
+	return res;
 }
 
-/* This is the in-use queue, if i_count > 0 (as far as we can tell)
- * the sucker is here.
- */
-static struct inode *inuse_list = NULL;
-
-static inline void put_inuse(struct inode *inode)
+static inline int hash(dev_t i_dev, unsigned long i_ino)
 {
-	if((inode->i_next = inuse_list) != NULL)
-		inuse_list->i_pprev = &inode->i_next;
-	inuse_list = inode;
-	inode->i_pprev = &inuse_list;
+	return ((int)i_ino ^ ((int)i_dev << 6)) & (HASH_SIZE-1);
 }
 
-static inline void remove_inuse(struct inode *inode)
+static inline blocking void wait_io(struct inode * inode, unsigned short flags)
 {
-	if(inode->i_pprev) {
-		if(inode->i_next)
-			inode->i_next->i_pprev = inode->i_pprev;
-		*inode->i_pprev = inode->i_next;
-		inode->i_pprev = NULL;
+	while(inode->i_status & flags) {
+		struct wait_queue wait = {current, NULL};
+		inode->i_status |= ST_WAITING;
+		vfs_unlock();
+		add_wait_queue(&inode->i_wait, &wait);
+		sleep_on(&inode->i_wait);
+		remove_wait_queue(&inode->i_wait, &wait);
+		vfs_lock();
 	}
 }
 
-/* Locking and unlocking inodes, plus waiting for locks to clear. */
-static void __wait_on_inode(struct inode *);
-
-static inline void wait_on_inode(struct inode *inode)
+static inline blocking void set_io(struct inode * inode,
+				   unsigned short waitflags,
+				   unsigned short setflags)
 {
-	if(inode->i_lock)
-		__wait_on_inode(inode);
+	wait_io(inode, waitflags);
+	inode->i_status |= setflags;
+	vfs_unlock();
 }
 
-static inline void lock_inode(struct inode *inode)
+static inline blocking int release_io(struct inode * inode, unsigned short flags)
 {
-	if(inode->i_lock)
-		__wait_on_inode(inode);
-	inode->i_lock = 1;
-}
-
-static inline void unlock_inode(struct inode *inode)
-{
-	inode->i_lock = 0;
-	wake_up(&inode->i_wait);
+	int res = 0; 
+	vfs_lock();
+	inode->i_status &= ~flags;
+	if(inode->i_status & ST_WAITING) {
+		inode->i_status &= ~ST_WAITING;
+		vfs_unlock();
+		wake_up(&inode->i_wait);
+		res = 1;
+	}
+	return res;
 }
 
-static void __wait_on_inode(struct inode * inode)
+static inline blocking void _io(void (*op)(struct inode*), struct inode * inode,
+				unsigned short waitflags, unsigned short setflags)
 {
-	struct wait_queue wait = { current, NULL };
-
-	add_wait_queue(&inode->i_wait, &wait);
-repeat:
-	current->state = TASK_UNINTERRUPTIBLE;
-	if (inode->i_lock) {
-		schedule();
-		goto repeat;
+	/* Do nothing if the same op is already in progress. */
+	if(op && !(inode->i_status & setflags)) {
+		set_io(inode, waitflags, setflags);
+		op(inode);
+		if(release_io(inode, setflags)) {
+			/* Somebody grabbed my inode from under me. */
+#ifdef DEBUG
+			printk("_io grab!\n");
+#endif
+                        vfs_lock();
+		}
 	}
-	remove_wait_queue(&inode->i_wait, &wait);
-	current->state = TASK_RUNNING;
 }
 
-/* Clear an inode of all it's identity, this is exported to the world. */
-void clear_inode(struct inode *inode)
+blocking int _free_ibasket(struct super_block * sb)
 {
-	struct wait_queue *wait;
-
-	/* So we don't disappear. */
-	inode->i_count++;
-
-	truncate_inode_pages(inode, 0);
-	wait_on_inode(inode);
-	if(IS_WRITABLE(inode) && inode->i_sb && inode->i_sb->dq_op)
-		inode->i_sb->dq_op->drop(inode);
-
-	if(--inode->i_count > 0)
-		remove_inuse(inode);
-	else
-		remove_free_inode(inode);
-	unhash_inode(inode);
-	wait = inode->i_wait;
-	memset(inode, 0, sizeof(*inode)); barrier();
-	inode->i_wait = wait;
-	put_inode_head(inode);	/* Pages zapped, put at the front. */
+	if(sb->s_ibasket) {
+		struct inode * delinquish = sb->s_ibasket->i_basket_prev;
+#if 0
+printpath(delinquish->i_dentry);
+printk(" delinquish\n");
+#endif
+		_clear_inode(delinquish, 0, 1);
+		return 1;
+	}
+	return 0;
 }
 
-/* These check the validity of a mount/umount type operation, we essentially
- * check if there are any inodes hanging around which prevent this operation
- * from occurring.  We also clear out clean inodes referencing this device.
- */
-int fs_may_mount(kdev_t dev)
+static /*inline*/ void _put_ibasket(struct inode * inode)
 {
-	struct inode *inode;
-	int pass = 0;
-
-	inode = free_inodes.head;
-repeat:
-	while(inode) {
-		struct inode *next = inode->i_next;
-		if(inode->i_dev != dev)
-			goto next;
-		if(inode->i_count || inode->i_dirt || inode->i_lock)
-			return 0;
-		clear_inode(inode);
-	next:
-		inode = next;
+	struct super_block * sb = inode->i_sb;
+	if(!(inode->i_status & ST_IBASKET)) {
+		inode->i_status |= ST_IBASKET;
+		insert_ibasket(&sb->s_ibasket, inode);
+		sb->s_ibasket_count++;
+		if(sb->s_ibasket_count > sb->s_ibasket_max)
+			(void)_free_ibasket(sb);
 	}
-	if(pass == 0) {
-		inode = inuse_list;
-		pass = 1;
-		goto repeat;
-	}
-	return 1; /* Tis' cool bro. */
 }
 
-int fs_may_umount(kdev_t dev, struct inode *iroot)
-{
-	struct inode *inode;
-	int pass = 0;
-
-	inode = free_inodes.head;
-repeat:
-	for(; inode; inode = inode->i_next) {
-		if(inode->i_dev != dev || !inode->i_count)
-			continue;
-		if(inode == iroot &&
-		   (inode->i_count == (inode->i_mount == inode ? 2 : 1)))
-			continue;
-		return 0;
-	}
-	if(pass == 0) {
-		inode = inuse_list;
-		pass = 1;
-		goto repeat;
+blocking void _clear_inode(struct inode * inode, int external, int verbose)
+{
+xcheck("_clear_inode",inode);
+	if(inode->i_status & ST_IBASKET) {
+		struct super_block * sb = inode->i_sb;
+		remove_ibasket(&sb->s_ibasket, inode);
+		sb->s_ibasket_count--;
+		inode->i_status &= ~ST_IBASKET;
+#if 0
+printpath(inode->i_dentry);
+printk(" put_inode\n");
+#endif
+		_io(sb->s_op->put_inode, inode, ST_TO_PUT|ST_TO_WRITE, ST_TO_PUT);
+		if(inode->i_status & ST_EMPTY)
+			return;
 	}
-	return 1; /* Tis' cool bro. */
+	if(inode->i_status & ST_HASHED)
+		remove_hash(&hashtable[hash(inode->i_dev, inode->i_ino)], inode);
+	if(inode->i_status & ST_AGED) {
+		/* "cannot happen" when called from an fs because at least
+		 * the caller must use it. Can happen when called from
+		 * invalidate_inodes(). */
+		if(verbose)
+			printk("VFS: clearing aged inode\n");
+		if(atomic_read(&inode->i_count))
+			printk("VFS: aged inode is in use\n");
+		remove_lru(&aged_i[inode->i_level], inode);
+		inodes_stat.aged_count[inode->i_level]--;
+	}
+	if(!external && inode->i_status & ST_IO) {
+		printk("VFS: clearing inode during IO operation\n");
+	}
+	if(!(inode->i_status & ST_EMPTY)) {
+		remove_all(&all_i, inode);
+		inode->i_status = ST_EMPTY;
+		while(inode->i_dentry) {
+			d_del(inode->i_dentry, D_NO_CLEAR_INODE);
+		}
+		if(inode->i_pages) {
+			vfs_unlock(); /* may block, can that be revised? */
+			truncate_inode_pages(inode, 0);
+			vfs_lock();
+		}
+		insert_all(&empty_i, inode);
+		inodes_stat.nr_free_inodes++;
+	} else if(external)
+		printk("VFS: empty inode is unnecessarily cleared multiple "
+		       "times by an fs\n");
+        else
+		printk("VFS: clearing empty inode\n");
+	inode->i_status = ST_EMPTY;
+	/* The inode is not really cleared any more here, but only once
+	 * when taken from empty_i. This saves instructions and processor
+	 * cache pollution.
+	 */
+}
+
+void insert_inode_hash(struct inode * inode)
+{
+xcheck("insert_inode_hash",inode);
+	vfs_lock();
+	if(!(inode->i_status & ST_HASHED)) {
+		insert_hash(&hashtable[hash(inode->i_dev, inode->i_ino)], inode);
+		inode->i_status |= ST_HASHED;
+	} else
+		printk("VFS: trying to hash an inode again\n");
+	vfs_unlock();
 }
 
-/* This belongs in file_table.c, not here... */
-int fs_may_remount_ro(kdev_t dev)
+blocking struct inode * _get_empty_inode(void)
 {
-	struct file * file;
+	struct inode * inode;
+	int retry = 0;
 
-	/* Check that no files are currently opened for writing. */
-	for (file = inuse_filps; file; file = file->f_next) {
-		if (!file->f_inode || file->f_inode->i_dev != dev)
-			continue;
-		if (S_ISREG(file->f_inode->i_mode) && (file->f_mode & 2))
-			return 0;
-	}
-	return 1; /* Tis' cool bro. */
-}
-
-/* Reading/writing inodes. */
-static void write_inode(struct inode *inode)
-{
-	if(inode->i_dirt) {
-		wait_on_inode(inode);
-		if(inode->i_dirt) {
-			if(inode->i_sb		&&
-			   inode->i_sb->s_op	&&
-			   inode->i_sb->s_op->write_inode) {
-				inode->i_lock = 1;
-				inode->i_sb->s_op->write_inode(inode);
-				unlock_inode(inode);
-			} else {
-				inode->i_dirt = 0;
+retry:
+	inode = empty_i;
+	if(inode) {
+		remove_all(&empty_i, inode);
+		inodes_stat.nr_free_inodes--;
+	} else if(inodes_stat.nr_inodes < max_inodes || retry > 2) {
+		inode = grow_inodes();
+	}
+	if(!inode) {
+		int level;
+		int usable = 0;
+		for(level = 0; level <= NR_LEVELS; level++)
+			if(aged_i[level]) {
+				inode = aged_i[level]->i_lru_prev;
+				/* Here is the picking strategy, tune this */
+				if(aged_reused[level] < (usable++ ?
+							 inodes_stat.aged_count[level] :
+							 2))
+					break;
+				aged_reused[level] = 0;
 			}
+		if(inode) {
+			if(!(inode->i_status & ST_AGED))
+				printk("VFS: inode aging inconsistency\n");
+			if(atomic_read(&inode->i_count) + inode->i_ddir_count)
+				printk("VFS: i_count of aged inode is not zero\n");
+			if(inode->i_dirt)
+				printk("VFS: Hey, somebody made my aged inode dirty\n");
+			_clear_inode(inode, 0, 0);
+			goto retry;
 		}
 	}
+	if(!inode) {
+		vfs_unlock();
+		schedule();
+		if(retry > 10)
+			panic("VFS: cannot repair inode shortage");
+		if(retry > 2)
+			printk("VFS: no free inodes\n");
+		retry++;
+		vfs_lock();
+		goto retry;
+	}
+xcheck("get_empty_inode",inode);
+	memset(inode, 0, sizeof(struct inode));
+	atomic_set(&inode->i_count, 1);
+	inode->i_nlink = 1;
+	sema_init(&inode->i_sem, 1);
+	inode->i_ino = ++last_inode;
+	inode->i_version = ++event;
+	insert_all(&all_i, inode);
+	return inode;
 }
 
-static inline void read_inode(struct inode *inode)
+static inline blocking struct inode * _get_empty_inode_hashed(dev_t i_dev,
+							      unsigned long i_ino)
 {
-	if(inode->i_sb		&&
-	   inode->i_sb->s_op	&&
-	   inode->i_sb->s_op->read_inode) {
-		lock_inode(inode);
-		inode->i_sb->s_op->read_inode(inode);
-		unlock_inode(inode);
-	}
-}
-
-int inode_change_ok(struct inode *inode, struct iattr *attr)
-{
-	if(!(attr->ia_valid & ATTR_FORCE)) {
-		unsigned short fsuid = current->fsuid;
-		uid_t iuid = inode->i_uid;
-		int not_fsuser = !fsuser();
-
-		if(((attr->ia_valid & ATTR_UID) &&
-		    ((fsuid != iuid) ||
-		     (attr->ia_uid != iuid)) && not_fsuser)	||
-
-		   ((attr->ia_valid & ATTR_GID) &&
-		    (!in_group_p(attr->ia_gid) &&
-		     (attr->ia_gid != inode->i_gid)) && not_fsuser)	||
-
-		   ((attr->ia_valid & (ATTR_ATIME_SET | ATTR_MTIME_SET)) &&
-		    (fsuid != iuid) && not_fsuser))
-			return -EPERM;
-
-		if(attr->ia_valid & ATTR_MODE) {
-			gid_t grp;
-			if(fsuid != iuid && not_fsuser)
-				return -EPERM;
-			grp = attr->ia_valid & ATTR_GID ? attr->ia_gid : inode->i_gid;
-			if(not_fsuser && !in_group_p(grp))
-				attr->ia_mode &= ~S_ISGID;
-		}
-	}
-	return 0;
+	struct inode ** base = &hashtable[hash(i_dev, i_ino)];
+	struct inode * inode = *base;
+	if(inode) do {
+		if(inode->i_ino == i_ino && inode->i_dev == i_dev) {
+			atomic_inc(&inode->i_count);
+			printk("VFS: inode %lx is already in use\n", i_ino);
+			return inode;
+		}
+		inode = inode->i_hash_next;
+	} while(inode != *base);
+	inode = _get_empty_inode();
+	inode->i_dev = i_dev;
+	inode->i_ino = i_ino;
+	insert_hash(base, inode);
+	inode->i_status |= ST_HASHED;
+	return inode;
 }
 
-void inode_setattr(struct inode *inode, struct iattr *attr)
+blocking struct inode * get_empty_inode_hashed(dev_t i_dev, unsigned long i_ino)
 {
-	if (attr->ia_valid & ATTR_UID)
-		inode->i_uid = attr->ia_uid;
-	if (attr->ia_valid & ATTR_GID)
-		inode->i_gid = attr->ia_gid;
-	if (attr->ia_valid & ATTR_SIZE)
-		inode->i_size = attr->ia_size;
-	if (attr->ia_valid & ATTR_ATIME)
-		inode->i_atime = attr->ia_atime;
-	if (attr->ia_valid & ATTR_MTIME)
-		inode->i_mtime = attr->ia_mtime;
-	if (attr->ia_valid & ATTR_CTIME)
-		inode->i_ctime = attr->ia_ctime;
-	if (attr->ia_valid & ATTR_MODE) {
-		inode->i_mode = attr->ia_mode;
-		if (!fsuser() && !in_group_p(inode->i_gid))
-			inode->i_mode &= ~S_ISGID;
-	}
-	if (attr->ia_valid & ATTR_ATTR_FLAG)
-		inode->i_attr_flags = attr->ia_attr_flags;
-	inode->i_dirt = 1;
-}
-
-int notify_change(struct inode *inode, struct iattr *attr)
-{
-	attr->ia_ctime = CURRENT_TIME;
-	if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME)) {
-		if (!(attr->ia_valid & ATTR_ATIME_SET))
-			attr->ia_atime = attr->ia_ctime;
-		if (!(attr->ia_valid & ATTR_MTIME_SET))
-			attr->ia_mtime = attr->ia_ctime;
-	}
-
-	if (inode->i_sb		&&
-	    inode->i_sb->s_op	&&
-	    inode->i_sb->s_op->notify_change) 
-		return inode->i_sb->s_op->notify_change(inode, attr);
+	struct inode * inode;
 
-	if(inode_change_ok(inode, attr) != 0)
-		return -EPERM;
-
-	inode_setattr(inode, attr);
-	return 0;
-}
-
-int bmap(struct inode *inode, int block)
-{
-	if(inode->i_op && inode->i_op->bmap)
-		return inode->i_op->bmap(inode, block);
-	return 0;
+	vfs_lock();
+	inode = _get_empty_inode_hashed(i_dev, i_ino);
+	vfs_unlock();
+	return inode;
 }
 
-void invalidate_inodes(kdev_t dev)
+void _get_inode(struct inode * inode)
 {
-	struct inode *inode;
-	int pass = 0;
-
-	inode = free_inodes.head;
-repeat:
-	while(inode) {
-		struct inode *next = inode->i_next;
-		if(inode->i_dev != dev)
-			goto next;
-		clear_inode(inode);
-	next:
-		inode = next;
-	}
-	if(pass == 0) {
-		inode = inuse_list;
-		pass = 1;
-		goto repeat;
-	}
-}
-
-void sync_inodes(kdev_t dev)
-{
-	struct inode *inode;
-	int pass = 0;
-
-	inode = free_inodes.head;
-repeat:
-	while(inode) {
-		struct inode *next = inode->i_next;
-		if(dev && inode->i_dev != dev)
-			goto next;
-		wait_on_inode(inode);
-		write_inode(inode);
-	next:
-		inode = next;
-	}
-	if(pass == 0) {
-		inode = inuse_list;
-		pass = 1;
-		goto repeat;
+	if(inode->i_status & ST_IBASKET) {
+		inode->i_status &= ~ST_IBASKET;
+		remove_ibasket(&inode->i_sb->s_ibasket, inode);
+		inode->i_sb->s_ibasket_count--;
+	}
+	if(inode->i_status & ST_AGED) {
+		inode->i_status &= ~ST_AGED;
+		remove_lru(&aged_i[inode->i_level], inode);
+		inodes_stat.aged_count[inode->i_level]--;
+		aged_reused[inode->i_level]++;
+		if(S_ISDIR(inode->i_mode))
+			/* make dirs less thrashable */
+			inode->i_level = NR_LEVELS-1;
+		else if(inode->i_nlink > 1)
+			/* keep hardlinks totally separate */
+			inode->i_level = NR_LEVELS;
+		else if(++inode->i_reuse_count >= age_table[inode->i_level]
+			&& inode->i_level < NR_LEVELS-1)
+			inode->i_level++;
+		if(atomic_read(&inode->i_count) != 1)
+			printk("VFS: inode count was not zero\n");
+	} else if(inode->i_status & ST_EMPTY)
+		printk("VFS: invalid reuse of empty inode\n");
+}
+
+blocking struct inode * __iget(struct super_block * sb,
+			       unsigned long i_ino,
+			       int crossmntp)
+{
+	struct inode ** base;
+	struct inode * inode;
+	dev_t i_dev;
+	
+	if(!sb)
+		panic("VFS: iget with sb == NULL");
+	i_dev = sb->s_dev;
+	if(!i_dev)
+		panic("VFS: sb->s_dev is NULL\n");
+	base = &hashtable[hash(i_dev, i_ino)];
+	vfs_lock();
+	inode = *base;
+	if(inode) do {
+		if(inode->i_ino == i_ino && inode->i_dev == i_dev) {
+			atomic_inc(&inode->i_count);
+			_get_inode(inode);
+
+			 /* Allow concurrent writes/puts. This is in particular
+			  * useful e.g. when syncing large chunks.
+			  * I hope the i_dirty flag is everywhere set as soon
+			  * as _any_ modifcation is made and _before_
+			  * giving up control, so no harm should occur if data
+			  * is modified during writes, because it will be
+			  * rewritten again (does a short inconsistency on the
+			  * disk harm?)
+			  */
+			wait_io(inode, ST_TO_READ);
+			vfs_unlock();
+			goto done;
+		}
+		inode = inode->i_hash_next;
+	} while(inode != *base);
+	inode = _get_empty_inode_hashed(i_dev, i_ino);
+	inode->i_sb = sb;
+	inode->i_flags = sb->s_flags;
+	if(sb->s_op && sb->s_op->read_inode) {
+		set_io(inode, 0, ST_TO_READ); /* do not wait at all */
+		sb->s_op->read_inode(inode);
+		if(release_io(inode, ST_TO_READ))
+			goto done;
+	}
+	vfs_unlock();
+done:
+	while(crossmntp && inode->i_mount) {
+		struct inode * tmp = inode->i_mount;
+		iinc(tmp);
+		iput(inode);
+		inode = tmp;
 	}
+xcheck("_iget",inode);
+	return inode;
 }
 
-static struct wait_queue *inode_wait, *update_wait;
-
-void iput(struct inode *inode)
+blocking void __iput(struct inode * inode)
 {
-	if(!inode)
-		return;
-	wait_on_inode(inode);
-	if(!inode->i_count) {
-		printk("VFS: Freeing free inode, tell DaveM\n");
+	struct super_block * sb;
+xcheck("_iput",inode);
+	if(atomic_read(&inode->i_count) + inode->i_ddir_count < 0)
+		printk("VFS: i_count is negative\n");
+	if((atomic_read(&inode->i_count) + inode->i_ddir_count) ||
+	   (inode->i_status & ST_FREEING)) {
 		return;
 	}
-	if(inode->i_pipe)
-		wake_up_interruptible(&PIPE_WAIT(*inode));
-we_slept:
-	if(inode->i_count > 1) {
-		inode->i_count--;
-	} else {
-		wake_up(&inode_wait);
-		if(inode->i_pipe) {
-			free_page((unsigned long)PIPE_BASE(*inode));
-			PIPE_BASE(*inode) = NULL;
-		}
-		if(inode->i_sb		&&
-		   inode->i_sb->s_op	&&
-		   inode->i_sb->s_op->put_inode) {
-			inode->i_sb->s_op->put_inode(inode);
-			if(!inode->i_nlink)
-				return;
-		}
-		if(inode->i_dirt) {
-			write_inode(inode);
-			wait_on_inode(inode);
-			goto we_slept;
-		}
-		if(IS_WRITABLE(inode)		&&
-		   inode->i_sb			&&
-		   inode->i_sb->dq_op) {
-			inode->i_lock = 1;
-			inode->i_sb->dq_op->drop(inode);
-			unlock_inode(inode);
-			goto we_slept;
-		}
-		/* There is a serious race leading to here, watch out. */
-		if(--inode->i_count == 0) {
-			remove_inuse(inode);
-			put_inode_last(inode);	/* Place at end of LRU free queue */
+	inode->i_status |= ST_FREEING;
+#ifdef CONFIG_OMIRR
+	if(inode->i_status & ST_MODIFIED) {
+		inode->i_status &= ~ST_MODIFIED;
+		omirr_printall(inode, " W %ld ", CURRENT_TIME);
+	}
+#endif
+	if(inode->i_pipe) {
+		free_page((unsigned long)PIPE_BASE(*inode));
+		PIPE_BASE(*inode)= NULL;
+	}
+	if((sb = inode->i_sb)) {
+		if(sb->s_type && (sb->s_type->fs_flags & FS_NO_DCACHE)) {
+			while(inode->i_dentry)
+				d_del(inode->i_dentry, D_NO_CLEAR_INODE);
+			if(atomic_read(&inode->i_count) + inode->i_ddir_count)
+				goto done;
+		}
+		if(sb->s_op) {
+			if(inode->i_nlink <= 0 && inode->i_dent_count &&
+			   !(inode->i_status & (ST_EMPTY|ST_IBASKET)) &&
+			   (sb->s_type->fs_flags & FS_IBASKET)) {
+				_put_ibasket(inode);
+				goto done;
+			}
+			if(!inode->i_dent_count ||
+			   (sb->s_type->fs_flags & FS_NO_DCACHE)) {
+				_io(sb->s_op->put_inode, inode, 
+				    ST_TO_PUT|ST_TO_WRITE, ST_TO_PUT);
+				if(atomic_read(&inode->i_count) + inode->i_ddir_count)
+					goto done;
+				if(inode->i_nlink <= 0) {
+					if(!(inode->i_status & ST_EMPTY)) {
+						_clear_inode(inode, 0, 1);
+					}
+					goto done;
+				}
+			}
+			if(inode->i_dirt) {
+				inode->i_dirt = 0;
+				_io(sb->s_op->write_inode, inode,
+				    ST_TO_PUT|ST_TO_WRITE, ST_TO_WRITE);
+				if(atomic_read(&inode->i_count) + inode->i_ddir_count)
+					goto done;
+			}
 		}
-	}
-}
-
-static kmem_cache_t *inode_cachep;
-
-static void grow_inodes(void)
-{
-	int i = 16;
-
-	while(i--) {
-		struct inode *inode;
-		
-		inode = kmem_cache_alloc(inode_cachep, SLAB_KERNEL);
-		if(!inode)
-			return;
-		memset(inode, 0, sizeof(*inode));
-		put_inode_head(inode);
-		nr_inodes++;
-	}
-}
-
-/* We have to be really careful, it's really easy to run yourself into
- * inefficient sequences of events.  The first problem is that when you
- * steal a non-referenced inode you run the risk of zaping a considerable
- * number of page cache entries, which might get refernced once again.
- * But if you are growing the inode set to quickly, you suck up ram
- * and cause other problems.
- *
- * We approach the problem in the following way, we take two things into
- * consideration.  Firstly we take a look at how much we have "committed"
- * to this inode already (i_nrpages), this accounts for the cost of getting
- * those pages back if someone should reference that inode soon.  We also
- * attempt to factor in i_blocks, which says "how much of a problem could
- * this potentially be".  It still needs some tuning though.  -DaveM
- */
-#define BLOCK_FACTOR_SHIFT	5	/* It is not factored in as much. */
-static struct inode *find_best_candidate_weighted(struct inode *inode)
-{
-	struct inode *best = NULL;
+		if(IS_WRITABLE(inode) && sb->dq_op) {
+			/* can operate in parallel to other ops ? */
+			_io(sb->dq_op->drop, inode, 0, ST_TO_DROP);
+			if(atomic_read(&inode->i_count) + inode->i_ddir_count)
+				goto done;
+		}
+	}
+	if(inode->i_mmap)
+		printk("VFS: inode has mappings\n");
+	if(inode->i_status & ST_AGED) {
+		printk("VFS: reaging inode\n");
+#if defined(DEBUG)
+printpath(inode->i_dentry);
+printk("\n");
+#endif
+		goto done;
+	}
+	if(!(inode->i_status & (ST_HASHED|ST_EMPTY))) {
+		_clear_inode(inode, 0, 1);
+		goto done;
+	}
+	if(inode->i_status & ST_EMPTY) {
+		printk("VFS: aging an empty inode\n");
+		goto done;
+	}
+	insert_lru(&aged_i[inode->i_level], inode);
+	inodes_stat.aged_count[inode->i_level]++;
+	inode->i_status |= ST_AGED;
+done:
+	inode->i_status &= ~ST_FREEING;
+}
+
+blocking void _iput(struct inode * inode)
+{
+	vfs_lock();
+	__iput(inode);
+	vfs_unlock();
+}
+
+blocking void sync_inodes(kdev_t dev)
+{
+	struct inode * inode;
+	vfs_lock();
+	inode = all_i;
+	if(inode) do {
+xcheck("sync_inodes",inode);
+		if(inode->i_dirt && (inode->i_dev == dev || !dev)) {
+			if(inode->i_sb && inode->i_sb->s_op &&
+			   !(inode->i_status & ST_FREEING)) {
+				inode->i_dirt = 0; 
+				_io(inode->i_sb->s_op->write_inode, inode,
+				    ST_IO, ST_TO_WRITE);
+			}
+		}
+		inode = inode->i_next;
+	} while(inode != all_i);
+	vfs_unlock();
+}
+
+blocking int _check_inodes(kdev_t dev, int complain)
+{
+	struct inode * inode;
+	int bad = 0;
+
+	vfs_lock();
+startover:
+	inode = all_i;
+	if(inode) do {
+		struct inode * next;
+xcheck("_check_inodes",inode);
+		next = inode->i_next;
+		if(inode->i_dev == dev) {
+			if(inode->i_dirt || atomic_read(&inode->i_count)) {
+				bad++;
+			} else {
+				_clear_inode(inode, 0, 0);
 
-	if(inode) {
-		unsigned long bestscore = 1000;
-		int limit = nr_free_inodes >> 2;
-		do {
-			if(!(inode->i_lock | inode->i_dirt)) {
-				int myscore = inode->i_nrpages;
-
-				myscore += (inode->i_blocks >> BLOCK_FACTOR_SHIFT);
-				if(myscore < bestscore) {
-					bestscore = myscore;
-					best = inode;
-				}
+				/* _clear_inode() may recursively clear other
+				 * inodes, probably also the next one.
+				 */
+				if(next->i_status & ST_EMPTY)
+					goto startover;
 			}
-			inode = inode->i_next;
-		} while(inode && --limit);
-	}
-	return best;
+		}
+		inode = next;
+	} while(inode != all_i);
+	vfs_unlock();
+	if(complain && bad)
+		printk("VFS: %d inode(s) busy on removed device `%s'\n",
+		       bad, kdevname(dev));
+	return (bad == 0);
 }
 
-static inline struct inode *find_best_free(struct inode *inode)
+/*inline*/ void invalidate_inodes(kdev_t dev)
 {
-	if(inode) {
-		int limit = nr_free_inodes >> 5;
-		do {
-			if(!inode->i_nrpages)
-				return inode;
-			inode = inode->i_next;
-		} while(inode && --limit);
-	}
-	return NULL;
+	/* Requires two passes, because of the new dcache holding
+	 * directories with i_count > 1.
+	 */
+	(void)_check_inodes(dev, 0);
+	(void)_check_inodes(dev, 1);
 }
 
-struct inode *get_empty_inode(void)
+/*inline*/ int fs_may_mount(kdev_t dev)
 {
-	static int ino = 0;
-	struct inode *inode;
-
-repeat:
-	inode = find_best_free(free_inodes.head);
-	if(!inode)
-		goto pressure;
-got_it:
-	inode->i_count++;
-	truncate_inode_pages(inode, 0);
-	wait_on_inode(inode);
-	if(IS_WRITABLE(inode) && inode->i_sb && inode->i_sb->dq_op)
-		inode->i_sb->dq_op->drop(inode);
-	unhash_inode(inode);
-	remove_free_inode(inode);
-
-	memset(inode, 0, sizeof(*inode));
-	inode->i_count = 1;
-	inode->i_nlink = 1;
-	inode->i_version = ++event;
-	sema_init(&inode->i_sem, 1);
-	inode->i_ino = ++ino;
-	inode->i_dev = 0;
-	put_inuse(inode);
-	return inode;
-pressure:
-	if(nr_inodes < max_inodes) {
-		grow_inodes();
-		goto repeat;
-	}
-	inode = find_best_candidate_weighted(free_inodes.head);
-	if(!inode) {
-		printk("VFS: No free inodes, contact DaveM\n");
-		sleep_on(&inode_wait);
-		goto repeat;
-	}
-	if(inode->i_lock) {
-		wait_on_inode(inode);
-		goto repeat;
-	} else if(inode->i_dirt) {
-		write_inode(inode);
-		goto repeat;
-	}
-	goto got_it;
+	return _check_inodes(dev, 0);
 }
 
-struct inode *get_pipe_inode(void)
+int fs_may_remount_ro(kdev_t dev)
 {
-	extern struct inode_operations pipe_inode_operations;
-	struct inode *inode = get_empty_inode();
-
-	if(inode) {
-		unsigned long page = __get_free_page(GFP_USER);
-		if(!page) {
-			iput(inode);
-			inode = NULL;
-		} else {
-			PIPE_BASE(*inode) = (char *) page;
-			inode->i_op = &pipe_inode_operations;
-			inode->i_count = 2;
-			PIPE_WAIT(*inode) = NULL;
-			PIPE_START(*inode) = PIPE_LEN(*inode) = 0;
-			PIPE_RD_OPENERS(*inode) = PIPE_WR_OPENERS(*inode) = 0;
-			PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 1;
-			PIPE_LOCK(*inode) = 0;
-			inode->i_pipe = 1;
-			inode->i_mode |= S_IFIFO | S_IRUSR | S_IWUSR;
-			inode->i_uid = current->fsuid;
-			inode->i_gid = current->fsgid;
-			inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-			inode->i_blksize = PAGE_SIZE;
-		}
-	}
-	return inode;
+	(void)dev;
+	return 1; /* not checked any more */
 }
 
-static int inode_updating[INODE_HASHSZ];
-
-struct inode *__iget(struct super_block *sb, int nr, int crossmntp)
+int fs_may_umount(kdev_t dev, struct inode * mount_root)
 {
-	unsigned int hashent = hashfn(sb->s_dev, nr);
-	struct inode *inode, *empty = NULL;
-
-we_slept:
-	if((inode = find_inode(hashent, sb->s_dev, nr)) == NULL) {
-		if(empty == NULL) {
-			inode_updating[hashent]++;
-			empty = get_empty_inode();
-			if(!--inode_updating[hashent])
-				wake_up(&update_wait);
-			goto we_slept;
-		}
-		inode = empty;
-		inode->i_sb = sb;
-		inode->i_dev = sb->s_dev;
-		inode->i_ino = nr;
-		inode->i_flags = sb->s_flags;
-		hash_inode(inode);
-		read_inode(inode);
-	} else {
-		if(!inode->i_count++) {
-			remove_free_inode(inode);
-			put_inuse(inode);
-		}
-		wait_on_inode(inode);
-		if(crossmntp && inode->i_mount) {
-			struct inode *mp = inode->i_mount;
-			mp->i_count++;
-			iput(inode);
-			wait_on_inode(inode = mp);
-		}
-		if(empty)
-			iput(empty);
+	struct inode * inode;
+	vfs_lock();
+	inode = all_i;
+	if(inode) do {
+xcheck("fs_may_umount",inode);
+		if(inode->i_dev == dev && atomic_read(&inode->i_count))
+			if(inode != mount_root || atomic_read(&inode->i_count) > 
+			   (inode->i_mount == inode ? 2 : 1)) {
+				vfs_unlock();
+				return 0;
+			}
+		inode = inode->i_next;
+	} while(inode != all_i);
+	vfs_unlock();
+	return 1;
+}
+
+extern struct inode_operations pipe_inode_operations;
+
+blocking struct inode * get_pipe_inode(void)
+{
+	struct inode * inode = get_empty_inode();
+
+	PIPE_BASE(*inode) = (char*)__get_free_page(GFP_USER);
+	if(!(PIPE_BASE(*inode))) {
+		iput(inode);
+		return NULL;
+	}
+	inode->i_blksize = PAGE_SIZE;
+	inode->i_pipe = 1;
+	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
+	atomic_inc(&inode->i_count);
+	inode->i_uid = current->fsuid;
+	inode->i_gid = current->fsgid;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	inode->i_op = &pipe_inode_operations;
+	PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 1;
+
+	/* I hope this does not introduce security problems.
+	 * Please check and give me response.
+	 */
+	{
+		char dummyname[32];
+		struct qstr dummy = { dummyname, 0 };
+		struct dentry * new;
+		sprintf(dummyname, ".anonymous-pipe-%06lud", inode->i_ino);
+		dummy.len = strlen(dummyname);
+		vfs_lock();
+		new = d_alloc(the_root, dummy.len, 0);
+		if(new)
+			d_add(new, inode, &dummy, D_BASKET);
+		vfs_unlock();
 	}
-	while(inode_updating[hashent])
-		sleep_on(&update_wait);
 	return inode;
 }
 
-void inode_init(void)
+int bmap(struct inode * inode, int block)
 {
-	int i;
-
-	inode_cachep = kmem_cache_create("inode", sizeof(struct inode),
-					 0,
-					 SLAB_HWCACHE_ALIGN, NULL, NULL);
-	if(!inode_cachep)
-		panic("Cannot create inode SLAB cache\n");
-
-	for(i = 0; i < INODE_HASHSZ; i++)
-		inode_hash[i] = NULL;
+	if (inode->i_op && inode->i_op->bmap)
+		return inode->i_op->bmap(inode, block);
+	return 0;
 }

FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen, slshen@lbl.gov