From: Steve Longerbeam <stevel@mvista.com>

The patch allows NUMA policies to be applied to file mappings.  Page cache
pages are allocated using a policy looked up in a shared_policy red-black
tree attached to the mapping object (address_space).  This involves:

1. Add a shared_policy tree to the address_space object in fs.h.

2. Modify page_cache_alloc() in pagemap.h to take a page index in
   addition to the mapping object, and use the two to locate the correct
   policy in the mapping->policy tree when allocating the page (see the
   sketch after this list).

3. Modify filemap.c to pass the additional page index to page_cache_alloc().

4. Also in filemap.c, implement generic file {set|get}_policy() methods and
   add those to generic_file_vm_ops.

5. Initialize the file's shared policy in alloc_inode(), and free it in
   destroy_inode().
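
For reference, the allocation path after these changes looks roughly like
this in the CONFIG_NUMA case (a condensed sketch of the pagemap.h and
mempolicy.c hunks below, not literal patch code):

	/* page_cache_alloc() now takes the page index and consults the
	 * file's shared_policy tree instead of calling alloc_pages()
	 * directly. */
	static inline struct page *page_cache_alloc(struct address_space *x,
						    unsigned long idx)
	{
		/* looks up the policy covering idx in x->policy and
		 * allocates via alloc_page_vma() through a pseudo vma
		 * carrying only that policy */
		return alloc_page_shared_policy(mapping_gfp_mask(x),
						&x->policy, idx);
	}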

In addition, the patch adds a new flag to the mbind() syscall,
MPOL_MF_MOVE.  If the flag is set, any existing anonymous or page cache
pages that are (or can be) mapped into the given virtual memory region but
do not satisfy the NUMA policy are moved to newly allocated pages that do
satisfy it.  Here's how the new flag interacts with the existing
MPOL_MF_STRICT flag (in the following, an "invalid page" is a page that
does not satisfy the NUMA policy; a userspace usage sketch follows the
list):

MOVE and STRICT both set:
attempt to move invalid pages; if any move fails, the mbind() syscall
returns failure.

MOVE set:
attempt to move invalid pages, but do not return an error if a move fails.

STRICT set:
do not attempt to move invalid pages; if any are found, return mbind()
failure (same behavior as before).

neither MOVE nor STRICT set:
ignore invalid pages.
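
For illustration, a caller could request the move behavior from userspace
like this (a sketch only, not part of the patch; it assumes the mbind()
wrapper from libnuma's <numaif.h> and picks a single-node nodemask purely
as an example):

	#include <numaif.h>		/* mbind(), MPOL_BIND, MPOL_MF_STRICT */

	#ifndef MPOL_MF_MOVE
	#define MPOL_MF_MOVE	(1<<1)	/* new flag added by this patch */
	#endif

	/* Bind an already-mapped file region to node 1 and move any pages
	 * that are currently resident on other nodes.  OR-ing in
	 * MPOL_MF_STRICT would make the call fail if any such page could
	 * not be moved. */
	static long bind_and_move(void *addr, unsigned long len)
	{
		unsigned long nodemask = 1UL << 1;	/* node 1 only */

		return mbind(addr, len, MPOL_BIND, &nodemask,
			     sizeof(nodemask) * 8, MPOL_MF_MOVE);
	}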

In the default !NUMA case there are essentially no additional CPU cycles
involved.  The only change is the extra page index passed to
page_cache_alloc() and friends, and the cost of that is negligible.

In the NUMA case there is of course extra processing when MPOL_MF_MOVE is
passed to the mbind() syscall.  That is, the kernel loops through every page
index of the new region, looking for existing pte-mapped or pagecache pages
that are invalid, and replaces them.  But this is done at syscall time, so
it is not time critical.  It could take a while for huge mappings, but only
mm->mmap_sem is held for the whole search; mm->page_table_lock is held only
long enough to replace a single invalid pte-mapped page when one is found.

If CONFIG_NUMA is set, the patch adds a few words to struct address_space
(and hence to struct inode).

Acked-by: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 /dev/null                          |    0 
 25-akpm/fs/inode.c                 |    5 
 25-akpm/include/linux/fs.h         |    2 
 25-akpm/include/linux/mempolicy.h  |   12 +
 25-akpm/include/linux/page-flags.h |    6 
 25-akpm/include/linux/pagemap.h    |   18 +
 25-akpm/mm/filemap.c               |   39 +++-
 25-akpm/mm/mempolicy.c             |  354 +++++++++++++++++++++++++++++--------
 25-akpm/mm/readahead.c             |    2 
 25-akpm/mm/shmem.c                 |   11 -
 10 files changed, 357 insertions(+), 92 deletions(-)

diff -L fs/cachefs/block.c -puN /dev/null /dev/null
diff -puN fs/inode.c~numa-policies-for-file-mappings-mpol_mf_move fs/inode.c
--- 25/fs/inode.c~numa-policies-for-file-mappings-mpol_mf_move	Wed Nov 17 14:12:37 2004
+++ 25-akpm/fs/inode.c	Wed Nov 17 14:12:37 2004
@@ -150,6 +150,7 @@ static struct inode *alloc_inode(struct 
 		mapping_set_gfp_mask(mapping, GFP_HIGHUSER);
 		mapping->assoc_mapping = NULL;
 		mapping->backing_dev_info = &default_backing_dev_info;
+ 		mpol_shared_policy_init(&mapping->policy);
 
 		/*
 		 * If the block_device provides a backing_dev_info for client
@@ -177,8 +178,10 @@ void destroy_inode(struct inode *inode) 
 	security_inode_free(inode);
 	if (inode->i_sb->s_op->destroy_inode)
 		inode->i_sb->s_op->destroy_inode(inode);
-	else
+	else {
+		mpol_free_shared_policy(&inode->i_mapping->policy);
 		kmem_cache_free(inode_cachep, (inode));
+	}
 }
 
 
diff -puN include/linux/fs.h~numa-policies-for-file-mappings-mpol_mf_move include/linux/fs.h
--- 25/include/linux/fs.h~numa-policies-for-file-mappings-mpol_mf_move	Wed Nov 17 14:12:37 2004
+++ 25-akpm/include/linux/fs.h	Wed Nov 17 14:12:37 2004
@@ -18,6 +18,7 @@
 #include <linux/cache.h>
 #include <linux/prio_tree.h>
 #include <linux/kobject.h>
+#include <linux/mempolicy.h>
 #include <asm/atomic.h>
 
 struct iovec;
@@ -349,6 +350,7 @@ struct address_space {
 	struct address_space_operations *a_ops;	/* methods */
 	unsigned long		flags;		/* error bits/gfp mask */
 	struct backing_dev_info *backing_dev_info; /* device readahead, etc */
+	struct shared_policy    policy;         /* page alloc policy */
 	spinlock_t		private_lock;	/* for use by the address_space */
 	struct list_head	private_list;	/* ditto */
 	struct address_space	*assoc_mapping;	/* ditto */
diff -puN include/linux/mempolicy.h~numa-policies-for-file-mappings-mpol_mf_move include/linux/mempolicy.h
--- 25/include/linux/mempolicy.h~numa-policies-for-file-mappings-mpol_mf_move	Wed Nov 17 14:12:37 2004
+++ 25-akpm/include/linux/mempolicy.h	Wed Nov 17 14:12:37 2004
@@ -22,6 +22,8 @@
 
 /* Flags for mbind */
 #define MPOL_MF_STRICT	(1<<0)	/* Verify existing pages in the mapping */
+#define MPOL_MF_MOVE	(1<<1)	/* Attempt to move pages in mapping that do
+				   not satisfy policy */
 
 #ifdef __KERNEL__
 
@@ -149,7 +151,8 @@ int mpol_set_shared_policy(struct shared
 void mpol_free_shared_policy(struct shared_policy *p);
 struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
 					    unsigned long idx);
-
+struct page *alloc_page_shared_policy(unsigned gfp, struct shared_policy *sp,
+				      unsigned long idx);
 extern void numa_default_policy(void);
 extern void numa_policy_init(void);
 
@@ -215,6 +218,13 @@ mpol_shared_policy_lookup(struct shared_
 #define vma_policy(vma) NULL
 #define vma_set_policy(vma, pol) do {} while(0)
 
+static inline struct page *
+alloc_page_shared_policy(unsigned gfp, struct shared_policy *sp,
+			 unsigned long idx)
+{
+	return alloc_pages(gfp, 0);
+}
+
 static inline void numa_policy_init(void)
 {
 }
diff -puN include/linux/page-flags.h~numa-policies-for-file-mappings-mpol_mf_move include/linux/page-flags.h
--- 25/include/linux/page-flags.h~numa-policies-for-file-mappings-mpol_mf_move	Wed Nov 17 14:12:37 2004
+++ 25-akpm/include/linux/page-flags.h	Wed Nov 17 14:12:37 2004
@@ -74,6 +74,8 @@
 #define PG_swapcache		16	/* Swap page: swp_entry_t in private */
 #define PG_mappedtodisk		17	/* Has blocks allocated on-disk */
 #define PG_reclaim		18	/* To be reclaimed asap */
+#define PG_sharedpolicy         19      /* Page was allocated for a file
+					   mapping using a shared_policy */
 
 
 /*
@@ -290,6 +292,10 @@ extern unsigned long __read_page_state(u
 #define SetPageCompound(page)	set_bit(PG_compound, &(page)->flags)
 #define ClearPageCompound(page)	clear_bit(PG_compound, &(page)->flags)
 
+#define PageSharedPolicy(page)      test_bit(PG_sharedpolicy, &(page)->flags)
+#define SetPageSharedPolicy(page)   set_bit(PG_sharedpolicy, &(page)->flags)
+#define ClearPageSharedPolicy(page) clear_bit(PG_sharedpolicy, &(page)->flags)
+
 #ifdef CONFIG_SWAP
 #define PageSwapCache(page)	test_bit(PG_swapcache, &(page)->flags)
 #define SetPageSwapCache(page)	set_bit(PG_swapcache, &(page)->flags)
diff -puN include/linux/pagemap.h~numa-policies-for-file-mappings-mpol_mf_move include/linux/pagemap.h
--- 25/include/linux/pagemap.h~numa-policies-for-file-mappings-mpol_mf_move	Wed Nov 17 14:12:37 2004
+++ 25-akpm/include/linux/pagemap.h	Wed Nov 17 14:12:37 2004
@@ -50,14 +50,24 @@ static inline void mapping_set_gfp_mask(
 #define page_cache_release(page)	put_page(page)
 void release_pages(struct page **pages, int nr, int cold);
 
-static inline struct page *page_cache_alloc(struct address_space *x)
+
+static inline struct page *__page_cache_alloc(struct address_space *x,
+					      unsigned long idx,
+					      unsigned int gfp_mask)
+{
+	return alloc_page_shared_policy(gfp_mask, &x->policy, idx);
+}
+
+static inline struct page *page_cache_alloc(struct address_space *x,
+					    unsigned long idx)
 {
-	return alloc_pages(mapping_gfp_mask(x), 0);
+	return __page_cache_alloc(x, idx, mapping_gfp_mask(x));
 }
 
-static inline struct page *page_cache_alloc_cold(struct address_space *x)
+static inline struct page *page_cache_alloc_cold(struct address_space *x,
+						 unsigned long idx)
 {
-	return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0);
+	return __page_cache_alloc(x, idx, mapping_gfp_mask(x)|__GFP_COLD);
 }
 
 typedef int filler_t(void *, struct page *);
diff -puN mm/filemap.c~numa-policies-for-file-mappings-mpol_mf_move mm/filemap.c
--- 25/mm/filemap.c~numa-policies-for-file-mappings-mpol_mf_move	Wed Nov 17 14:12:37 2004
+++ 25-akpm/mm/filemap.c	Wed Nov 17 14:12:37 2004
@@ -566,7 +566,8 @@ repeat:
 	page = find_lock_page(mapping, index);
 	if (!page) {
 		if (!cached_page) {
-			cached_page = alloc_page(gfp_mask);
+			cached_page = __page_cache_alloc(mapping, index,
+							 gfp_mask);
 			if (!cached_page)
 				return NULL;
 		}
@@ -659,7 +660,7 @@ grab_cache_page_nowait(struct address_sp
 		return NULL;
 	}
 	gfp_mask = mapping_gfp_mask(mapping) & ~__GFP_FS;
-	page = alloc_pages(gfp_mask, 0);
+	page = __page_cache_alloc(mapping, index, gfp_mask);
 	if (page && add_to_page_cache_lru(page, mapping, index, gfp_mask)) {
 		page_cache_release(page);
 		page = NULL;
@@ -836,7 +837,7 @@ no_cached_page:
 		 * page..
 		 */
 		if (!cached_page) {
-			cached_page = page_cache_alloc_cold(mapping);
+			cached_page = page_cache_alloc_cold(mapping, index);
 			if (!cached_page) {
 				desc->error = -ENOMEM;
 				goto out;
@@ -1099,7 +1100,7 @@ static int fastcall page_cache_read(stru
 	struct page *page; 
 	int error;
 
-	page = page_cache_alloc_cold(mapping);
+	page = page_cache_alloc_cold(mapping, offset);
 	if (!page)
 		return -ENOMEM;
 
@@ -1481,9 +1482,35 @@ repeat:
 	return 0;
 }
 
+
+#ifdef CONFIG_NUMA
+int generic_file_set_policy(struct vm_area_struct *vma,
+			    struct mempolicy *new)
+{
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	return mpol_set_shared_policy(&mapping->policy, vma, new);
+}
+
+struct mempolicy *
+generic_file_get_policy(struct vm_area_struct *vma,
+			unsigned long addr)
+{
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	unsigned long idx;
+
+	idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+	return mpol_shared_policy_lookup(&mapping->policy, idx);
+}
+#endif
+
+
 struct vm_operations_struct generic_file_vm_ops = {
 	.nopage		= filemap_nopage,
 	.populate	= filemap_populate,
+#ifdef CONFIG_NUMA
+	.set_policy     = generic_file_set_policy,
+	.get_policy     = generic_file_get_policy,
+#endif
 };
 
 /* This is used for a general mmap of a disk file */
@@ -1533,7 +1560,7 @@ repeat:
 	page = find_get_page(mapping, index);
 	if (!page) {
 		if (!cached_page) {
-			cached_page = page_cache_alloc_cold(mapping);
+			cached_page = page_cache_alloc_cold(mapping, index);
 			if (!cached_page)
 				return ERR_PTR(-ENOMEM);
 		}
@@ -1615,7 +1642,7 @@ repeat:
 	page = find_lock_page(mapping, index);
 	if (!page) {
 		if (!*cached_page) {
-			*cached_page = page_cache_alloc(mapping);
+			*cached_page = page_cache_alloc(mapping, index);
 			if (!*cached_page)
 				return NULL;
 		}
diff -puN mm/mempolicy.c~numa-policies-for-file-mappings-mpol_mf_move mm/mempolicy.c
--- 25/mm/mempolicy.c~numa-policies-for-file-mappings-mpol_mf_move	Wed Nov 17 14:12:37 2004
+++ 25-akpm/mm/mempolicy.c	Wed Nov 17 14:12:54 2004
@@ -2,6 +2,7 @@
  * Simple NUMA memory policy for the Linux kernel.
  *
  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
+ * Copyright 2004 Steve Longerbeam, MontaVista Software.
  * Subject to the GNU Public License, version 2.
  *
  * NUMA policy allows the user to give hints in which node(s) memory should
@@ -47,15 +48,28 @@
  */
 
 /* Notebook:
-   fix mmap readahead to honour policy and enable policy for any page cache
-   object
-   statistics for bigpages
-   global policy for page cache? currently it uses process policy. Requires
-   first item above.
+   Page cache pages can now be policied, by adding a shared_policy tree to
+   inodes (actually located in address_space). One entry in the tree for
+   each mapped region of a file. Generic files now have set_policy and
+   get_policy methods in generic_file_vm_ops [stevel].
+
+   Added a page-move feature, whereby existing pte-mapped or filemap
+   pagecache pages that are/can be mapped to the given virtual memory
+   region, that do not satisfy the NUMA policy, are moved to a new
+   page that satisfies the policy. Enabled by the new mbind flag
+   MPOL_MF_MOVE [stevel].
+
+   statistics for bigpages.
+
+   global policy for page cache? currently it uses per-file policies in
+   address_space (see first item above).
+
    handle mremap for shared memory (currently ignored for the policy)
    grows down?
+
    make bind policy root only? It can trigger oom much faster and the
    kernel is not always grateful with that.
+
    could replace all the switch()es with a mempolicy_ops structure.
 */
 
@@ -66,6 +80,7 @@
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
+#include <linux/pagemap.h>
 #include <linux/nodemask.h>
 #include <linux/gfp.h>
 #include <linux/slab.h>
@@ -75,6 +90,9 @@
 #include <linux/init.h>
 #include <linux/compat.h>
 #include <linux/mempolicy.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
 
@@ -232,33 +250,225 @@ static struct mempolicy *mpol_new(int mo
 	return policy;
 }
 
-/* Ensure all existing pages follow the policy. */
+
+/* Return effective policy for a VMA */
+static struct mempolicy *
+get_vma_policy(struct vm_area_struct *vma, unsigned long addr)
+{
+	struct mempolicy *pol = current->mempolicy;
+
+	if (vma) {
+		if (vma->vm_ops && vma->vm_ops->get_policy)
+		        pol = vma->vm_ops->get_policy(vma, addr);
+		else if (vma->vm_policy &&
+				vma->vm_policy->policy != MPOL_DEFAULT)
+			pol = vma->vm_policy;
+	}
+	if (!pol)
+		pol = &default_policy;
+	return pol;
+}
+
+
+/* Find secondary valid nodes for an allocation */
+static int __mpol_node_valid(int nid, struct mempolicy *pol)
+{
+	switch (pol->policy) {
+	case MPOL_PREFERRED:
+	case MPOL_DEFAULT:
+	case MPOL_INTERLEAVE:
+		return 1;
+	case MPOL_BIND: {
+		struct zone **z;
+		for (z = pol->v.zonelist->zones; *z; z++)
+			if ((*z)->zone_pgdat->node_id == nid)
+				return 1;
+		return 0;
+	}
+	default:
+		BUG();
+		return 0;
+	}
+}
+
+int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
+{
+	return __mpol_node_valid(nid, get_vma_policy(vma, addr));
+}
+
+/*
+ * The given page doesn't match a file mapped VMA's policy. If the
+ * page is unused, remove it from the page cache, so that a new page
+ * can be later reallocated to the cache using the correct policy.
+ * Returns 0 if the page was removed from the cache, < 0 if failed.
+ *
+ * We use invalidate_mapping_pages(), which doesn't try very hard.
+ * It won't remove pages which are locked (won't wait for a lock),
+ * dirty, under writeback, or mapped by pte's. All the latter are
+ * valid checks for us, but we might be able to improve our success
+ * by waiting for a lock.
+ */
+static int
+remove_invalid_filemap_page(struct page * page,
+			    struct vm_area_struct *vma,
+			    pgoff_t pgoff)
+{
+	/*
+	 * the page in the cache is not in any of the nodes this
+	 * VMA's policy wants it to be in. Can we remove it?
+	 */
+	if (!PageSharedPolicy(page) &&
+	    invalidate_mapping_pages(vma->vm_file->f_mapping,
+				     pgoff, pgoff) > 0) {
+		PDprintk("removed cache page in node %ld, "
+			 "pgoff=%lu, for %s\n",
+			 page_to_nid(page), pgoff,
+			 vma->vm_file->f_dentry->d_name.name);
+		return 0;
+	}
+
+	/*
+	 * the page is being used by other pagetable mappings,
+	 * or is currently locked, dirty, or under writeback.
+	 */
+	PDprintk("could not remove cache page in node %ld, "
+		 "pgoff=%lu, for %s\n",
+		 page_to_nid(page), pgoff,
+		 vma->vm_file->f_dentry->d_name.name);
+	return -EIO;
+}
+
+/*
+ * The given page doesn't match a VMA's policy. Allocate a new
+ * page using the policy, copy contents from old to new, free
+ * the old page, map in the new page. This looks a lot like a COW.
+ */
+static int
+move_invalid_page(struct page * page, struct mempolicy *pol,
+		  struct vm_area_struct *vma, unsigned long addr,
+		  pmd_t *pmd)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct page * new_page;
+	struct vm_area_struct pvma;
+	pte_t *page_table;
+	pte_t entry;
+
+	PDprintk("moving anon page in node %ld, address=%08lx\n",
+		 page_to_nid(page), addr);
+
+	if (!PageReserved(page))
+		page_cache_get(page);
+	spin_unlock(&mm->page_table_lock);
+	if (unlikely(anon_vma_prepare(vma)))
+		goto err_no_mem;
+
+	/* Create a pseudo vma that just contains the policy */
+	memset(&pvma, 0, sizeof(struct vm_area_struct));
+	pvma.vm_end = PAGE_SIZE;
+	pvma.vm_pgoff = vma->vm_pgoff;
+	pvma.vm_policy = pol;
+	new_page = alloc_page_vma(GFP_HIGHUSER, &pvma, addr);
+	if (!new_page)
+		goto err_no_mem;
+
+	copy_user_highpage(new_page, page, addr);
+
+	spin_lock(&mm->page_table_lock);
+	page_table = pte_offset_map(pmd, addr);
+	if (!PageReserved(page))
+		page_remove_rmap(page);
+
+	flush_cache_page(vma, addr);
+	entry = pte_mkdirty(mk_pte(new_page, vma->vm_page_prot));
+	if (likely(vma->vm_flags & VM_WRITE))
+		entry = pte_mkwrite(entry);
+	ptep_establish(vma, addr, page_table, entry);
+	update_mmu_cache(vma, addr, entry);
+	lru_cache_add_active(new_page);
+	page_add_anon_rmap(new_page, vma, addr);
+
+	pte_unmap(page_table);
+	page_cache_release(page); /* release our ref on the old page */
+	page_cache_release(page); /* release our pte ref on the old page */
+	return 0;
+
+ err_no_mem:
+	spin_lock(&mm->page_table_lock);
+	return -ENOMEM;
+}
+
+/* Ensure all existing pages in a VMA follow the policy. */
 static int
-verify_pages(struct mm_struct *mm,
-	     unsigned long addr, unsigned long end, unsigned long *nodes)
+move_verify_pages(struct vm_area_struct *vma, struct mempolicy *pol,
+		  unsigned long flags)
 {
-	while (addr < end) {
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long addr;
+	unsigned long start = vma->vm_start;
+	unsigned long end = vma->vm_end;
+
+	if (!(flags & (MPOL_MF_MOVE | MPOL_MF_STRICT)))
+		return 0;
+
+	for (addr = start; addr < end; addr += PAGE_SIZE) {
 		struct page *p;
 		pte_t *pte;
 		pmd_t *pmd;
 		pgd_t *pgd;
 		pml4_t *pml4;
+		int err;
+
+		/*
+		 * first, if this is a file mapping and we are moving pages,
+		 * check for invalid page cache pages, and if they are unused,
+		 * remove.
+		 */
+		if (vma->vm_ops && vma->vm_ops->nopage) {
+			struct address_space *mapping =
+				vma->vm_file->f_mapping;
+			unsigned long pgoff =
+				((addr - vma->vm_start) >> PAGE_CACHE_SHIFT) +
+				vma->vm_pgoff;
+
+			p = find_get_page(mapping, pgoff);
+			if (p) {
+				err = 0;
+				if (!__mpol_node_valid(page_to_nid(p), pol)) {
+					if (!(flags & MPOL_MF_MOVE))
+						err = -EIO;
+					else
+						err = remove_invalid_filemap_page(
+							p,vma,pgoff);
+				}
+				page_cache_release(p);  /* find_get_page */
+				if (err && (flags & MPOL_MF_STRICT))
+					return err;
+			}
+		}
+
+		/*
+		 * Now let's see if there is a pte-mapped page that doesn't
+		 * satisfy the policy. Because of the above, we can be sure
+		 * from here that, if there is a VMA page that's pte-mapped
+		 * and it belongs to the page cache, it either satisfies the
+		 * policy, or we don't mind if it doesn't (MF_STRICT not set).
+		 */
+		spin_lock(&mm->page_table_lock);
 		pml4 = pml4_offset(mm, addr);
 		if (pml4_none(*pml4)) {
-			unsigned long next = (addr + PML4_SIZE) & PML4_MASK;
-			if (next > addr)
-				break;
-			addr = next;
+			spin_unlock(&mm->page_table_lock);
 			continue;
 		}
 		pgd = pml4_pgd_offset(pml4, addr);
+
 		if (pgd_none(*pgd)) {
-			addr = (addr + PGDIR_SIZE) & PGDIR_MASK;
+			spin_unlock(&mm->page_table_lock);
 			continue;
 		}
 		pmd = pmd_offset(pgd, addr);
 		if (pmd_none(*pmd)) {
-			addr = (addr + PMD_SIZE) & PMD_MASK;
+			spin_unlock(&mm->page_table_lock);
 			continue;
 		}
 		p = NULL;
@@ -267,19 +477,29 @@ verify_pages(struct mm_struct *mm,
 			p = pte_page(*pte);
 		pte_unmap(pte);
 		if (p) {
-			unsigned nid = page_to_nid(p);
-			if (!test_bit(nid, nodes))
-				return -EIO;
+			err = 0;
+			if (!__mpol_node_valid(page_to_nid(p), pol)) {
+				if (!(flags & MPOL_MF_MOVE))
+					err = -EIO;
+				else
+					err = move_invalid_page(p, pol, vma,
+								addr, pmd);
+			}
+			if (err && (flags & MPOL_MF_STRICT)) {
+				spin_unlock(&mm->page_table_lock);
+				return err;
+			}
 		}
-		addr += PAGE_SIZE;
+		spin_unlock(&mm->page_table_lock);
 	}
+
 	return 0;
 }
 
 /* Step 1: check the range */
 static struct vm_area_struct *
 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
-	    unsigned long *nodes, unsigned long flags)
+	    struct mempolicy *policy, unsigned long flags)
 {
 	int err;
 	struct vm_area_struct *first, *vma, *prev;
@@ -293,9 +513,8 @@ check_range(struct mm_struct *mm, unsign
 			return ERR_PTR(-EFAULT);
 		if (prev && prev->vm_end < vma->vm_start)
 			return ERR_PTR(-EFAULT);
-		if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
-			err = verify_pages(vma->vm_mm,
-					   vma->vm_start, vma->vm_end, nodes);
+		if (flags & (MPOL_MF_MOVE | MPOL_MF_STRICT)) {
+			err = move_verify_pages(vma, policy, flags);
 			if (err) {
 				first = ERR_PTR(err);
 				break;
@@ -362,12 +581,13 @@ asmlinkage long sys_mbind(unsigned long 
 	DECLARE_BITMAP(nodes, MAX_NUMNODES);
 	int err;
 
-	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
+	if ((flags & ~(unsigned long)(MPOL_MF_STRICT | MPOL_MF_MOVE)) ||
+	    mode > MPOL_MAX)
 		return -EINVAL;
 	if (start & ~PAGE_MASK)
 		return -EINVAL;
 	if (mode == MPOL_DEFAULT)
-		flags &= ~MPOL_MF_STRICT;
+		flags &= ~(MPOL_MF_STRICT | MPOL_MF_MOVE);
 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 	end = start + len;
 	if (end < start)
@@ -387,7 +607,7 @@ asmlinkage long sys_mbind(unsigned long 
 			mode,nodes[0]);
 
 	down_write(&mm->mmap_sem);
-	vma = check_range(mm, start, end, nodes, flags);
+	vma = check_range(mm, start, end, new, flags);
 	err = PTR_ERR(vma);
 	if (!IS_ERR(vma))
 		err = mbind_range(vma, start, end, new);
@@ -620,24 +840,6 @@ asmlinkage long compat_sys_mbind(compat_
 
 #endif
 
-/* Return effective policy for a VMA */
-static struct mempolicy *
-get_vma_policy(struct vm_area_struct *vma, unsigned long addr)
-{
-	struct mempolicy *pol = current->mempolicy;
-
-	if (vma) {
-		if (vma->vm_ops && vma->vm_ops->get_policy)
-		        pol = vma->vm_ops->get_policy(vma, addr);
-		else if (vma->vm_policy &&
-				vma->vm_policy->policy != MPOL_DEFAULT)
-			pol = vma->vm_policy;
-	}
-	if (!pol)
-		pol = &default_policy;
-	return pol;
-}
-
 /* Return a zonelist representing a mempolicy */
 static struct zonelist *zonelist_policy(unsigned gfp, struct mempolicy *policy)
 {
@@ -872,28 +1074,6 @@ int mpol_first_node(struct vm_area_struc
 	return 0;
 }
 
-/* Find secondary valid nodes for an allocation */
-int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
-{
-	struct mempolicy *pol = get_vma_policy(vma, addr);
-
-	switch (pol->policy) {
-	case MPOL_PREFERRED:
-	case MPOL_DEFAULT:
-	case MPOL_INTERLEAVE:
-		return 1;
-	case MPOL_BIND: {
-		struct zone **z;
-		for (z = pol->v.zonelist->zones; *z; z++)
-			if ((*z)->zone_pgdat->node_id == nid)
-				return 1;
-		return 0;
-	}
-	default:
-		BUG();
-		return 0;
-	}
-}
 
 /*
  * Shared memory backing store policy support.
@@ -1013,10 +1193,14 @@ restart:
 	/* Take care of old policies in the same range. */
 	while (n && n->start < end) {
 		struct rb_node *next = rb_next(&n->nd);
-		if (n->start >= start) {
-			if (n->end <= end)
+		if (n->start == start && n->end == end &&
+		    mpol_equal(n->policy, new->policy)) {
+			/* the same shared policy already exists, just exit */
+			goto out;
+		} else if (n->start >= start) {
+			if (n->end <= end) {
 				sp_delete(sp, n);
-			else
+			} else
 				n->start = end;
 		} else {
 			/* Old policy spanning whole new range. */
@@ -1042,6 +1226,7 @@ restart:
 	}
 	if (new)
 		sp_insert(sp, new);
+ out:
 	spin_unlock(&sp->lock);
 	if (new2) {
 		mpol_free(new2->policy);
@@ -1093,6 +1278,37 @@ void mpol_free_shared_policy(struct shar
 	spin_unlock(&p->lock);
 }
 
+struct page *
+alloc_page_shared_policy(unsigned gfp, struct shared_policy *sp,
+			 unsigned long idx)
+{
+	struct page *page;
+	struct mempolicy * shared_pol = NULL;
+
+	if (sp->root.rb_node) {
+		struct vm_area_struct pvma;
+		/* Create a pseudo vma that just contains the policy */
+		memset(&pvma, 0, sizeof(struct vm_area_struct));
+		pvma.vm_end = PAGE_SIZE;
+		pvma.vm_pgoff = idx;
+		shared_pol = mpol_shared_policy_lookup(sp, idx);
+		pvma.vm_policy = shared_pol;
+		page = alloc_page_vma(gfp, &pvma, 0);
+		mpol_free(pvma.vm_policy);
+	} else {
+		page = alloc_pages(gfp, 0);
+	}
+
+	if (page) {
+		if (shared_pol)
+			SetPageSharedPolicy(page);
+		else
+			ClearPageSharedPolicy(page);
+	}
+
+	return page;
+}
+
 /* assumes fs == KERNEL_DS */
 void __init numa_policy_init(void)
 {
diff -puN mm/readahead.c~numa-policies-for-file-mappings-mpol_mf_move mm/readahead.c
--- 25/mm/readahead.c~numa-policies-for-file-mappings-mpol_mf_move	Wed Nov 17 14:12:37 2004
+++ 25-akpm/mm/readahead.c	Wed Nov 17 14:12:37 2004
@@ -245,7 +245,7 @@ __do_page_cache_readahead(struct address
 			continue;
 
 		spin_unlock_irq(&mapping->tree_lock);
-		page = page_cache_alloc_cold(mapping);
+		page = page_cache_alloc_cold(mapping, page_offset);
 		spin_lock_irq(&mapping->tree_lock);
 		if (!page)
 			break;
diff -puN mm/shmem.c~numa-policies-for-file-mappings-mpol_mf_move mm/shmem.c
--- 25/mm/shmem.c~numa-policies-for-file-mappings-mpol_mf_move	Wed Nov 17 14:12:37 2004
+++ 25-akpm/mm/shmem.c	Wed Nov 17 14:12:37 2004
@@ -903,16 +903,7 @@ static struct page *
 shmem_alloc_page(unsigned long gfp, struct shmem_inode_info *info,
 		 unsigned long idx)
 {
-	struct vm_area_struct pvma;
-	struct page *page;
-
-	memset(&pvma, 0, sizeof(struct vm_area_struct));
-	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
-	pvma.vm_pgoff = idx;
-	pvma.vm_end = PAGE_SIZE;
-	page = alloc_page_vma(gfp, &pvma, 0);
-	mpol_free(pvma.vm_policy);
-	return page;
+	return alloc_page_shared_policy(gfp, &info->policy, idx);
 }
 #else
 static inline struct page *
_