patch-2.2.0-pre5 linux/mm/vmscan.c

diff -u --recursive --new-file v2.2.0-pre4/linux/mm/vmscan.c linux/mm/vmscan.c
@@ -20,13 +20,6 @@
 
 #include <asm/pgtable.h>
 
-/* 
- * The wait queue for waking up the pageout daemon:
- */
-static struct task_struct * kswapd_task = NULL;
-
-static void init_swap_timer(void);
-
 /*
  * The swap-out functions return 1 if they successfully
  * threw something out, and we got a free page. It returns
@@ -38,7 +31,7 @@
  * using a process that no longer actually exists (it might
  * have died while we slept).
  */
-static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma,
+static int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma,
 	unsigned long address, pte_t * page_table, int gfp_mask)
 {
 	pte_t pte;
@@ -59,50 +52,6 @@
 	    || ((gfp_mask & __GFP_DMA) && !PageDMA(page_map)))
 		return 0;
 
-	/* 
-	 * Deal with page aging.  There are several special cases to
-	 * consider:
-	 * 
-	 * Page has been accessed, but is swap cached.  If the page is
-	 * getting sufficiently "interesting" --- its age is getting
-	 * high --- then if we are sufficiently short of free swap
-	 * pages, then delete the swap cache.  We can only do this if
-	 * the swap page's reference count is one: ie. there are no
-	 * other references to it beyond the swap cache (as there must
-	 * still be PTEs pointing to it if count > 1).
-	 * 
-	 * If the page has NOT been touched, and its age reaches zero,
-	 * then we are swapping it out:
-	 *
-	 *   If there is already a swap cache page for this page, then
-	 *   another process has already allocated swap space, so just
-	 *   dereference the physical page and copy in the swap entry
-	 *   from the swap cache.  
-	 * 
-	 * Note, we rely on all pages read in from swap either having
-	 * the swap cache flag set, OR being marked writable in the pte,
-	 * but NEVER BOTH.  (It IS legal to be neither cached nor dirty,
-	 * however.)
-	 *
-	 * -- Stephen Tweedie 1998 */
-
-	if (PageSwapCache(page_map)) {
-		if (pte_write(pte)) {
-			struct page *found;
-			printk ("VM: Found a writable swap-cached page!\n");
-			/* Try to diagnose the problem ... */
-			found = find_page(&swapper_inode, page_map->offset);
-			if (found) {
-				printk("page=%p@%08lx, found=%p, count=%d\n",
-					page_map, page_map->offset,
-					found, atomic_read(&found->count));
-				__free_page(found);
-			} else 
-				printk ("Spurious, page not in cache\n");
-			return 0;
-		}
-	}
-	
 	if (pte_young(pte)) {
 		/*
 		 * Transfer the "accessed" bit from the page
@@ -110,109 +59,101 @@
 		 */
 		set_pte(page_table, pte_mkold(pte));
 		set_bit(PG_referenced, &page_map->flags);
-
-		/* 
-		 * We should test here to see if we want to recover any
-	 * swap cache page here.  We do this if the page is seeing
-		 * enough activity, AND we are sufficiently low on swap
-		 *
-		 * We need to track both the number of available swap
-		 * pages and the total number present before we can do
-		 * this...  
-		 */
 		return 0;
 	}
 
-	if (pte_dirty(pte)) {
-		if (vma->vm_ops && vma->vm_ops->swapout) {
-			pid_t pid = tsk->pid;
-			vma->vm_mm->rss--;
-			if (vma->vm_ops->swapout(vma, address - vma->vm_start + vma->vm_offset, page_table))
-				kill_proc(pid, SIGBUS, 1);
-		} else {
-			/*
-			 * This is a dirty, swappable page.  First of all,
-			 * get a suitable swap entry for it, and make sure
-			 * we have the swap cache set up to associate the
-			 * page with that swap entry.
-			 */
-        		entry = in_swap_cache(page_map);
-			if (!entry) {
-				entry = get_swap_page();
-				if (!entry)
-					return 0; /* No swap space left */
-			}
-			
-			vma->vm_mm->rss--;
-			tsk->nswap++;
-			flush_cache_page(vma, address);
-			set_pte(page_table, __pte(entry));
-			flush_tlb_page(vma, address);
-			swap_duplicate(entry);
-
-			/* Now to write back the page.  We have two
-			 * cases: if the page is already part of the
-			 * swap cache, then it is already on disk.  Just
-			 * free the page and return (we release the swap
-			 * cache on the last accessor too).
-			 *
-			 * If we have made a new swap entry, then we
-			 * start the write out to disk.  If the page is
-			 * shared, however, we still need to keep the
-			 * copy in memory, so we add it to the swap
-			 * cache. */
-			if (PageSwapCache(page_map)) {
-				__free_page(page_map);
-				return (atomic_read(&page_map->count) == 0);
-			}
-			add_to_swap_cache(page_map, entry);
-			/* We checked we were unlocked way up above, and we
-			   have been careful not to stall until here */
-			set_bit(PG_locked, &page_map->flags);
-			/* OK, do a physical write to swap.  */
-			rw_swap_page(WRITE, entry, (char *) page, (gfp_mask & __GFP_WAIT));
-		}
-		/* Now we can free the current physical page.  We also
-		 * free up the swap cache if this is the last use of the
-		 * page.  Note that there is a race here: the page may
-		 * still be shared COW by another process, but that
-		 * process may exit while we are writing out the page
-		 * asynchronously.  That's no problem, shrink_mmap() can
-	 * correctly clean up the occasional unshared page
-		 * which gets left behind in the swap cache. */
+	/*
+	 * Is the page already in the swap cache? If so, then
+	 * we can just drop our reference to it without doing
+	 * any IO - it's already up-to-date on disk.
+	 *
+	 * Return 0, as we didn't actually free any real
+	 * memory, and we should just continue our scan.
+	 */
+	if (PageSwapCache(page_map)) {
+		entry = page_map->offset;
+		swap_duplicate(entry);
+		set_pte(page_table, __pte(entry));
+drop_pte:
+		vma->vm_mm->rss--;
+		tsk->nswap++;
+		flush_tlb_page(vma, address);
 		__free_page(page_map);
-		return 1;	/* we slept: the process may not exist any more */
+		return 0;
 	}
 
-	/* The page was _not_ dirty, but still has a zero age.  It must
-	 * already be uptodate on disk.  If it is in the swap cache,
-	 * then we can just unlink the page now.  Remove the swap cache
-	 * too if this is the last user.  */
-        if ((entry = in_swap_cache(page_map)))  {
+	/*
+	 * Is it a clean page? Then it must be recoverable
+	 * by just paging it in again, and we can just drop
+	 * it..
+	 *
+	 * However, this won't actually free any real
+	 * memory, as the page will just be in the page cache
+	 * somewhere, and as such we should just continue
+	 * our scan.
+	 *
+	 * Basically, this just makes it possible for us to do
+	 * some real work in the future in "shrink_mmap()".
+	 */
+	if (!pte_dirty(pte)) {
+		pte_clear(page_table);
+		goto drop_pte;
+	}
+
+	/*
+	 * Ok, it's really dirty. That means that
+	 * we should either create a new swap cache
+	 * entry for it, or we should write it back
+	 * to its own backing store.
+	 *
+	 * Note that in neither case do we actually
+	 * know that we make a page available, but
+	 * as we potentially sleep we can no longer
+	 * continue scanning, so we might as well
+	 * assume we free'd something.
+	 *
+	 * NOTE NOTE NOTE! This should just set a
+	 * dirty bit in page_map, and just drop the
+	 * pte. All the hard work would be done by
+	 * shrink_mmap().
+	 *
+	 * That would get rid of a lot of problems.
+	 */
+	if (vma->vm_ops && vma->vm_ops->swapout) {
+		pid_t pid = tsk->pid;
 		vma->vm_mm->rss--;
-		flush_cache_page(vma, address);
-		set_pte(page_table, __pte(entry));
-		flush_tlb_page(vma, address);
-		swap_duplicate(entry);
+		if (vma->vm_ops->swapout(vma, address - vma->vm_start + vma->vm_offset, page_table))
+			kill_proc(pid, SIGBUS, 1);
 		__free_page(page_map);
-		return (atomic_read(&page_map->count) == 0);
-	} 
-	/* 
-	 * A clean page to be discarded?  Must be mmap()ed from
-	 * somewhere.  Unlink the pte, and tell the filemap code to
-	 * discard any cached backing page if this is the last user.
-	 */
-	if (PageSwapCache(page_map)) {
-		printk ("VM: How can this page _still_ be cached?");
-		return 0;
+		return 1;
 	}
+
+	/*
+	 * This is a dirty, swappable page.  First of all,
+	 * get a suitable swap entry for it, and make sure
+	 * we have the swap cache set up to associate the
+	 * page with that swap entry.
+	 */
+	entry = get_swap_page();
+	if (!entry)
+		return 0; /* No swap space left */
+		
 	vma->vm_mm->rss--;
+	tsk->nswap++;
 	flush_cache_page(vma, address);
-	pte_clear(page_table);
+	set_pte(page_table, __pte(entry));
 	flush_tlb_page(vma, address);
-	entry = (atomic_read(&page_map->count) == 1);
+	swap_duplicate(entry);	/* One for the process, one for the swap cache */
+	add_to_swap_cache(page_map, entry);
+	/* We checked we were unlocked way up above, and we
+	   have been careful not to stall until here */
+	set_bit(PG_locked, &page_map->flags);
+
+	/* OK, do a physical asynchronous write to swap.  */
+	rw_swap_page(WRITE, entry, (char *) page, 0);
+
 	__free_page(page_map);
-	return entry;
+	return 1;
 }
 
 /*
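The rewritten try_to_swap_out() above reduces to a fixed decision order: recently used pages are aged and skipped, swap-cached pages are dropped with no I/O, clean pages are simply unlinked, and only genuinely dirty pages cost a write. The following is a rough, self-contained C model of that branch order; the struct and its fields are invented stand-ins for illustration, not the kernel's real types.

/*
 * Rough, self-contained model of the decision order in the new
 * try_to_swap_out().  struct page_model is an invented stand-in,
 * not a kernel type; only the branch order mirrors the patch.
 */
#include <stdio.h>

struct page_model {
	int young;        /* accessed since the last scan (pte_young)  */
	int swap_cached;  /* an up-to-date copy already sits on swap   */
	int dirty;        /* modified since it was last written        */
	int has_swapout;  /* the vma supplies its own swapout() method */
};

/* Returns 1 when the caller may stop scanning (we may have slept). */
static int swap_out_decision(const struct page_model *p)
{
	if (p->young)
		return 0;	/* age the page, keep scanning          */
	if (p->swap_cached)
		return 0;	/* drop the pte, no I/O needed          */
	if (!p->dirty)
		return 0;	/* clean: recoverable by paging back in */
	if (p->has_swapout)
		return 1;	/* write to the vma's backing store     */
	return 1;		/* allocate swap, start an async write  */
}

int main(void)
{
	struct page_model p = { 0, 0, 1, 0 };	/* dirty anonymous page */
	printf("stop scanning: %d\n", swap_out_decision(&p));
	return 0;
}
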
@@ -409,11 +350,7 @@
 			goto out;
 		}
 
-		/*
-		 * Nonzero means we cleared out something, but only "1" means
-		 * that we actually free'd up a page as a result.
-		 */
-		if (swap_out_process(pbest, gfp_mask) == 1)
+		if (swap_out_process(pbest, gfp_mask))
 			return 1;
 	}
 out:
@@ -441,71 +378,36 @@
        printk ("Starting kswapd v%.*s\n", i, s);
 }
 
-#define free_memory(fn) \
-	count++; do { if (!--count) goto done; } while (fn)
-
-static int kswapd_free_pages(int kswapd_state)
-{
-	unsigned long end_time;
-
-	/* Always trim SLAB caches when memory gets low. */
-	kmem_cache_reap(0);
-
-	/* max one hundreth of a second */
-	end_time = jiffies + (HZ-1)/100;
-	do {
-		int priority = 8;
-		int count = pager_daemon.swap_cluster;
-
-		switch (kswapd_state) {
-			do {
-			default:
-				free_memory(shrink_mmap(priority, 0));
-				free_memory(swap_out(priority, 0));
-				kswapd_state++;
-			case 1:
-				free_memory(shm_swap(priority, 0));
-				shrink_dcache_memory(priority, 0);
-				kswapd_state = 0;
-			} while (--priority >= 0);
-			return kswapd_state;
-		}
-done:
-		if (nr_free_pages > freepages.high + pager_daemon.swap_cluster)
-			break;
-	} while (time_before_eq(jiffies,end_time));
-	return kswapd_state;
-}
-
 /*
- * The background pageout daemon.
- * Started as a kernel thread from the init process.
+ * The background pageout daemon, started as a kernel thread
+ * from the init process. 
+ *
+ * This basically executes once a second, trickling out pages
+ * so that we have _some_ free memory available even if there
+ * is no other activity that frees anything up. This is needed
+ * for things like routing etc, where we otherwise might have
+ * all activity going on in asynchronous contexts that cannot
+ * page things out.
+ *
+ * If there are applications that are active memory-allocators
+ * (most normal use), this basically shouldn't matter.
  */
 int kswapd(void *unused)
 {
 	current->session = 1;
 	current->pgrp = 1;
 	strcpy(current->comm, "kswapd");
-	sigfillset(&current->blocked);
-	
-	/*
-	 *	As a kernel thread we want to tamper with system buffers
-	 *	and other internals and thus be subject to the SMP locking
-	 *	rules. (On a uniprocessor box this does nothing).
-	 */
-	lock_kernel();
 
 	/*
-	 * Set the base priority to something smaller than a
-	 * regular process. We will scale up the priority
-	 * dynamically depending on how much memory we need.
+	 * Hey, if somebody wants to kill us, be our guest. 
+	 * Don't come running to mama if things don't work.
 	 */
-	current->priority = (DEF_PRIORITY * 2) / 3;
-
+	siginitsetinv(&current->blocked, sigmask(SIGKILL));
+	
 	/*
 	 * Tell the memory management that we're a "memory allocator",
 	 * and that if we need more memory we should get access to it
-	 * regardless (see "try_to_free_pages()"). "kswapd" should
+	 * regardless (see "__get_free_pages()"). "kswapd" should
 	 * never get caught in the normal page freeing logic.
 	 *
 	 * (Kswapd normally doesn't need memory anyway, but sometimes
@@ -516,21 +418,23 @@
 	 */
 	current->flags |= PF_MEMALLOC;
 
-	init_swap_timer();
-	kswapd_task = current;
 	while (1) {
-		int state = 0;
-
+		if (signal_pending(current))
+			break;
 		current->state = TASK_INTERRUPTIBLE;
-		flush_signals(current);
 		run_task_queue(&tq_disk);
-		schedule();
-		swapstats.wakeups++;
-		state = kswapd_free_pages(state);
+		schedule_timeout(HZ);
+
+		/*
+		 * kswapd isn't even meant to keep up with anything,
+		 * so just a few pages per second is plenty: the only
+		 * point is to make sure that the system doesn't stay
+		 * forever in a really bad memory squeeze.
+		 */
+		if (nr_free_pages < freepages.high)
+			try_to_free_pages(0, 16);
 	}
-	/* As if we could ever get here - maybe we want to make this killable */
-	kswapd_task = NULL;
-	unlock_kernel();
+
 	return 0;
 }
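
The kswapd above no longer relies on a swap timer and wakeup priorities; it blocks every signal except SIGKILL, sleeps about a second at a time, and reclaims a small burst whenever free memory dips under freepages.high. Below is a user-space model of that loop; sleep(1) stands in for schedule_timeout(HZ), and the counters are invented demo values rather than the kernel's nr_free_pages and freepages.high.

/*
 * User-space model of the new kswapd main loop.  sleep(1) stands in
 * for schedule_timeout(HZ); the counters are invented demo values,
 * not the kernel's nr_free_pages / freepages.high.
 */
#include <stdio.h>
#include <unistd.h>

static int fake_nr_free_pages = 200;
static const int fake_freepages_high = 256;

static void fake_try_to_free_pages(int count)
{
	fake_nr_free_pages += count;	/* pretend the reclaim worked */
}

int main(void)
{
	/* The real thread loops forever; five rounds suffice here. */
	for (int i = 0; i < 5; i++) {
		sleep(1);	/* trickle: wake up about once a second */
		/* Only do work when free memory is below the high
		 * mark; 16 pages per round is plenty, since kswapd
		 * is not meant to keep up with demand, only to
		 * avoid a total memory squeeze. */
		if (fake_nr_free_pages < fake_freepages_high)
			fake_try_to_free_pages(16);
		printf("round %d: %d pages free\n", i, fake_nr_free_pages);
	}
	return 0;
}
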
 
@@ -539,111 +443,42 @@
  * now we need this so that we can do page allocations
  * without holding the kernel lock etc.
  *
- * The "PF_MEMALLOC" flag protects us against recursion:
- * if we need more memory as part of a swap-out effort we
- * will just silently return "success" to tell the page
- * allocator to accept the allocation.
- *
  * We want to try to free "count" pages, and we need to 
  * cluster them so that we get good swap-out behaviour. See
  * the "free_memory()" macro for details.
  */
 int try_to_free_pages(unsigned int gfp_mask, int count)
 {
-	int retval;
+	int priority;
 
 	lock_kernel();
 
 	/* Always trim SLAB caches when memory gets low. */
 	kmem_cache_reap(gfp_mask);
 
-	retval = 1;
-	if (!(current->flags & PF_MEMALLOC)) {
-		int priority;
+	priority = 6;
+	do {
+		while (shrink_mmap(priority, gfp_mask)) {
+			if (!--count)
+				goto done;
+		}
 
-		current->flags |= PF_MEMALLOC;
+		/* Try to get rid of some shared memory pages.. */
+		while (shm_swap(priority, gfp_mask)) {
+			if (!--count)
+				goto done;
+		}
 	
-		priority = 8;
-		do {
-			free_memory(shrink_mmap(priority, gfp_mask));
-			free_memory(shm_swap(priority, gfp_mask));
-			free_memory(swap_out(priority, gfp_mask));
-			shrink_dcache_memory(priority, gfp_mask);
-		} while (--priority >= 0);
-		retval = 0;
+		/* Then, try to page stuff out.. */
+		while (swap_out(priority, gfp_mask)) {
+			if (!--count)
+				goto done;
+		}
+
+		shrink_dcache_memory(priority, gfp_mask);
+	} while (--priority >= 0);
 done:
-		current->flags &= ~PF_MEMALLOC;
-	}
 	unlock_kernel();
 
-	return retval;
-}
-
-/*
- * Wake up kswapd according to the priority
- *	0 - no wakeup
- *	1 - wake up as a low-priority process
- *	2 - wake up as a normal process
- *	3 - wake up as an almost real-time process
- *
- * This plays mind-games with the "goodness()"
- * function in kernel/sched.c.
- */
-static inline void kswapd_wakeup(struct task_struct *p, int priority)
-{
-	if (priority) {
-		p->counter = p->priority << priority;
-		wake_up_process(p);
-	}
-}
-
-/* 
- * The swap_tick function gets called on every clock tick.
- */
-void swap_tick(void)
-{
-	struct task_struct *p = kswapd_task;
-
-	/*
-	 * Only bother to try to wake kswapd up
-	 * if the task exists and can be woken.
-	 */
-	if (p && (p->state & TASK_INTERRUPTIBLE)) {
-		unsigned int pages;
-		int want_wakeup;
-
-		/*
-		 * Schedule for wakeup if there isn't lots
-		 * of free memory or if there is too much
-		 * of it used for buffers or pgcache.
-		 *
-		 * "want_wakeup" is our priority: 0 means
-		 * not to wake anything up, while 3 means
-		 * that we'd better give kswapd a realtime
-		 * priority.
-		 */
-		want_wakeup = 0;
-		pages = nr_free_pages;
-		if (pages < freepages.high)
-			want_wakeup = 1;
-		if (pages < freepages.low)
-			want_wakeup = 2;
-		if (pages < freepages.min)
-			want_wakeup = 3;
-	
-		kswapd_wakeup(p,want_wakeup);
-	}
-
-	timer_active |= (1<<SWAP_TIMER);
-}
-
-/* 
- * Initialise the swap timer
- */
-
-void init_swap_timer(void)
-{
-	timer_table[SWAP_TIMER].expires = jiffies;
-	timer_table[SWAP_TIMER].fn = swap_tick;
-	timer_active |= (1<<SWAP_TIMER);
+	return priority >= 0;
 }
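
The new try_to_free_pages() escalates from a gentle scan at priority 6 down to an aggressive one at priority 0, running the three reclaimers in turn at each level until "count" pages have been freed or every priority is exhausted. The sketch below models only that control flow; fake_reclaim() is an invented stub for shrink_mmap(), shm_swap() and swap_out(), succeeding more often as the priority drops.

/*
 * Self-contained model of the priority loop in the new
 * try_to_free_pages().  fake_reclaim() is an invented stub for
 * shrink_mmap(), shm_swap() and swap_out(); a lower priority means
 * a wider scan, so the stub succeeds more often as priority drops.
 */
#include <stdio.h>
#include <stdlib.h>

static int fake_reclaim(int priority)
{
	return (rand() % 8) > priority;
}

static int model_try_to_free_pages(int count)
{
	int priority = 6;
	do {
		while (fake_reclaim(priority))		/* shrink_mmap() */
			if (!--count)
				goto done;
		while (fake_reclaim(priority))		/* shm_swap()    */
			if (!--count)
				goto done;
		while (fake_reclaim(priority))		/* swap_out()    */
			if (!--count)
				goto done;
		/* shrink_dcache_memory() would run here in the kernel. */
	} while (--priority >= 0);
done:
	return priority >= 0;	/* nonzero: freed "count" pages in time */
}

int main(void)
{
	printf("freed enough: %d\n", model_try_to_free_pages(16));
	return 0;
}
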
