From: Hugh Dickins <hugh@veritas.com>

Kevin P.  Fleming pointed out that the 2.6 tmpfs does not allow writing huge
sparse files.  This is an unintended side-effect of the strict memory commit
changes: which should make no

The solution is to treat the tmpfs files (of variable size) and the shmem
objects (of fixed size) differently: sounds nasty but works out well.  The
shmem objects follow the VM preallocation convention as before, but the tmpfs
files revert to allocation on demand as a filesystem would.  If there's not
enough memory to write to a tmpfs hole, it is reported as -ENOSPC rather than
-ENOMEM, so the mmap writer gets SIGBUS rather than everyone else getting
OOM-killed.


---

 25-akpm/mm/shmem.c |   95 +++++++++++++++++++++++++++--------------------------
 1 files changed, 50 insertions(+), 45 deletions(-)

diff -puN mm/shmem.c~huge-sparse-tmpfs-files mm/shmem.c
--- 25/mm/shmem.c~huge-sparse-tmpfs-files	2004-04-03 03:00:05.811348096 -0800
+++ 25-akpm/mm/shmem.c	2004-04-03 03:00:05.817347184 -0800
@@ -123,6 +123,42 @@ static inline struct shmem_sb_info *SHME
 	return sb->s_fs_info;
 }
 
+/*
+ * shmem_file_setup pre-accounts the whole fixed size of a VM object,
+ * for shared memory and for shared anonymous (/dev/zero) mappings
+ * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
+ * consistent with the pre-accounting of private mappings ...
+ */
+static inline int shmem_acct_size(unsigned long flags, loff_t size)
+{
+	return (flags & VM_ACCOUNT)?
+		security_vm_enough_memory(VM_ACCT(size)): 0;
+}
+
+static inline void shmem_unacct_size(unsigned long flags, loff_t size)
+{
+	if (flags & VM_ACCOUNT)
+		vm_unacct_memory(VM_ACCT(size));
+}
+
+/*
+ * ... whereas tmpfs objects are accounted incrementally as
+ * pages are allocated, in order to allow huge sparse files.
+ * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
+ * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
+ */
+static inline int shmem_acct_block(unsigned long flags)
+{
+	return (flags & VM_ACCOUNT)?
+		0: security_vm_enough_memory(VM_ACCT(PAGE_CACHE_SIZE));
+}
+
+static inline void shmem_unacct_blocks(unsigned long flags, long pages)
+{
+	if (!(flags & VM_ACCOUNT))
+		vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
+}
+
 static struct super_operations shmem_ops;
 static struct address_space_operations shmem_aops;
 static struct file_operations shmem_file_operations;
@@ -173,6 +209,7 @@ static void shmem_recalc_inode(struct in
 		sbinfo->free_blocks += freed;
 		inode->i_blocks -= freed*BLOCKS_PER_PAGE;
 		spin_unlock(&sbinfo->stat_lock);
+		shmem_unacct_blocks(info->flags, freed);
 	}
 }
 
@@ -456,7 +493,7 @@ static void shmem_truncate(struct inode 
 			shmem_dir_unmap(dir);
 			if (empty) {
 				shmem_dir_free(empty);
-				info->alloced++;
+				shmem_free_block(inode);
 			}
 			empty = subdir;
 			cond_resched_lock(&info->lock);
@@ -479,19 +516,19 @@ static void shmem_truncate(struct inode 
 		else if (subdir) {
 			*dir = NULL;
 			shmem_dir_free(subdir);
-			info->alloced++;
+			shmem_free_block(inode);
 		}
 	}
 done1:
 	shmem_dir_unmap(dir-1);
 	if (empty) {
 		shmem_dir_free(empty);
-		info->alloced++;
+		shmem_free_block(inode);
 	}
 	if (info->next_index <= SHMEM_NR_DIRECT) {
 		shmem_dir_free(info->i_indirect);
 		info->i_indirect = NULL;
-		info->alloced++;
+		shmem_free_block(inode);
 	}
 done2:
 	BUG_ON(info->swapped > info->next_index);
@@ -516,20 +553,10 @@ static int shmem_notify_change(struct de
 {
 	struct inode *inode = dentry->d_inode;
 	struct page *page = NULL;
-	long change = 0;
 	int error;
 
-	if ((attr->ia_valid & ATTR_SIZE) && (attr->ia_size <= SHMEM_MAX_BYTES)) {
-		/*
-	 	 * Account swap file usage based on new file size,
-		 * but just let vmtruncate fail on out-of-range sizes.
-	 	 */
-		change = VM_ACCT(attr->ia_size) - VM_ACCT(inode->i_size);
-		if (change > 0) {
-			if (security_vm_enough_memory(change))
-				return -ENOMEM;
-		} else if (attr->ia_size < inode->i_size) {
-			vm_unacct_memory(-change);
+	if (attr->ia_valid & ATTR_SIZE) {
+		if (attr->ia_size < inode->i_size) {
 			/*
 			 * If truncating down to a partial page, then
 			 * if that page is already allocated, hold it
@@ -563,8 +590,6 @@ static int shmem_notify_change(struct de
 		error = inode_setattr(inode, attr);
 	if (page)
 		page_cache_release(page);
-	if (error)
-		vm_unacct_memory(change);
 	return error;
 }
 
@@ -577,8 +602,7 @@ static void shmem_delete_inode(struct in
 		spin_lock(&shmem_ilock);
 		list_del(&info->list);
 		spin_unlock(&shmem_ilock);
-		if (info->flags & VM_ACCOUNT)
-			vm_unacct_memory(VM_ACCT(inode->i_size));
+		shmem_unacct_size(info->flags, inode->i_size);
 		inode->i_size = 0;
 		shmem_truncate(inode);
 	}
@@ -909,7 +933,7 @@ repeat:
 		shmem_swp_unmap(entry);
 		sbinfo = SHMEM_SB(inode->i_sb);
 		spin_lock(&sbinfo->stat_lock);
-		if (sbinfo->free_blocks == 0) {
+		if (sbinfo->free_blocks == 0 || shmem_acct_block(info->flags)) {
 			spin_unlock(&sbinfo->stat_lock);
 			spin_unlock(&info->lock);
 			error = -ENOSPC;
@@ -923,6 +947,7 @@ repeat:
 			spin_unlock(&info->lock);
 			filepage = page_cache_alloc(mapping);
 			if (!filepage) {
+				shmem_unacct_blocks(info->flags, 1);
 				shmem_free_block(inode);
 				error = -ENOMEM;
 				goto failed;
@@ -940,6 +965,7 @@ repeat:
 					filepage, mapping, idx, GFP_ATOMIC)) {
 				spin_unlock(&info->lock);
 				page_cache_release(filepage);
+				shmem_unacct_blocks(info->flags, 1);
 				shmem_free_block(inode);
 				filepage = NULL;
 				if (error)
@@ -1094,7 +1120,6 @@ shmem_get_inode(struct super_block *sb, 
 		info = SHMEM_I(inode);
 		memset(info, 0, (char *)inode - (char *)info);
 		spin_lock_init(&info->lock);
-		info->flags = VM_ACCOUNT;
 		switch (mode & S_IFMT) {
 		default:
 			init_special_inode(inode, mode, dev);
@@ -1167,7 +1192,6 @@ shmem_file_write(struct file *file, cons
 	loff_t		pos;
 	unsigned long	written;
 	int		err;
-	loff_t		maxpos;
 
 	if ((ssize_t) count < 0)
 		return -EINVAL;
@@ -1184,15 +1208,6 @@ shmem_file_write(struct file *file, cons
 	if (err || !count)
 		goto out;
 
-	maxpos = inode->i_size;
-	if (maxpos < pos + count) {
-		maxpos = pos + count;
-		if (security_vm_enough_memory(VM_ACCT(maxpos) - VM_ACCT(inode->i_size))) {
-			err = -ENOMEM;
-			goto out;
-		}
-	}
-
 	err = remove_suid(file->f_dentry);
 	if (err)
 		goto out;
@@ -1267,10 +1282,6 @@ shmem_file_write(struct file *file, cons
 	*ppos = pos;
 	if (written)
 		err = written;
-
-	/* Short writes give back address space */
-	if (inode->i_size != maxpos)
-		vm_unacct_memory(VM_ACCT(maxpos) - VM_ACCT(inode->i_size));
 out:
 	up(&inode->i_sem);
 	return err;
@@ -1551,13 +1562,8 @@ static int shmem_symlink(struct inode *d
 		memcpy(info, symname, len);
 		inode->i_op = &shmem_symlink_inline_operations;
 	} else {
-		if (security_vm_enough_memory(VM_ACCT(1))) {
-			iput(inode);
-			return -ENOMEM;
-		}
 		error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
 		if (error) {
-			vm_unacct_memory(VM_ACCT(1));
 			iput(inode);
 			return error;
 		}
@@ -1947,7 +1953,7 @@ struct file *shmem_file_setup(char *name
 	if (size > SHMEM_MAX_BYTES)
 		return ERR_PTR(-EINVAL);
 
-	if ((flags & VM_ACCOUNT) && security_vm_enough_memory(VM_ACCT(size)))
+	if (shmem_acct_size(flags, size))
 		return ERR_PTR(-ENOMEM);
 
 	error = -ENOMEM;
@@ -1969,7 +1975,7 @@ struct file *shmem_file_setup(char *name
 	if (!inode)
 		goto close_file;
 
-	SHMEM_I(inode)->flags &= flags;
+	SHMEM_I(inode)->flags = flags & VM_ACCOUNT;
 	d_instantiate(dentry, inode);
 	inode->i_size = size;
 	inode->i_nlink = 0;	/* It is unlinked */
@@ -1985,8 +1991,7 @@ close_file:
 put_dentry:
 	dput(dentry);
 put_memory:
-	if (flags & VM_ACCOUNT)
-		vm_unacct_memory(VM_ACCT(size));
+	shmem_unacct_size(flags, size);
 	return ERR_PTR(error);
 }
 

_