The kernel presently tends to reclaim far more from highmem than from
lowmem.  A 1.5G machine running 2.6.3 will reclaim 700 highmem pagecache
pages for every lowmem page.  With Nick's patches this comes down to 10:1.

This patch fixes that up by introducing a pre-calculated zone->reclaim_batch,
which tells the VM how many pages to reclaim from this zone in each pass.  It
is scaled according to the number of pages in the zone.

With this in place, the reclaim rate for each zone is indeed proportional to
the size of the zone.
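
As an illustration of the arithmetic, here is a throwaway userspace sketch of
the calculation performed in setup_per_zone_pages_min() below.  The zone sizes
are made-up numbers for a ~1.5G i386 box and SWAP_CLUSTER_MAX is assumed to be
32; treat the output as illustrative only.

/*
 * Userspace sketch of the zone->reclaim_batch calculation.  Zone sizes
 * and SWAP_CLUSTER_MAX are assumptions, not measurements.
 */
#include <stdio.h>

#define SWAP_CLUSTER_MAX	32	/* assumed value */

static unsigned long calc_reclaim_batch(unsigned long present_pages,
					unsigned long lowmem_pages)
{
	unsigned long long batch;

	/* Scale the per-pass batch by the zone's share of lowmem */
	batch = (unsigned long long)present_pages * SWAP_CLUSTER_MAX;
	batch /= lowmem_pages;		/* do_div() in the kernel */
	if (batch < 4)			/* same floor as the hunk below */
		batch = 4;
	return (unsigned long)batch;
}

int main(void)
{
	/* hypothetical present_pages (4k pages) for DMA, Normal, HighMem */
	unsigned long dma = 4096, normal = 225280, high = 163840;
	unsigned long lowmem_pages = dma + normal;	/* non-highmem only */

	printf("DMA     batch: %lu\n", calc_reclaim_batch(dma, lowmem_pages));
	printf("Normal  batch: %lu\n", calc_reclaim_batch(normal, lowmem_pages));
	printf("HighMem batch: %lu\n", calc_reclaim_batch(high, lowmem_pages));
	return 0;
}

With these inputs the batches come out at roughly 4 (clamped), 31 and 22
pages: proportional to present_pages, with lowmem_pages as the common divisor.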

Interaction with the `incremental min' logic in the page allocator is
important.  We teach kswapd to perform reclaim against the normal zone even
if the normal zone has free_pages > pages_high.  This way, more pages get
freed from the normal zone, bringing its free_pages up to the level at which
the normal zone is again eligible for __GFP_HIGHMEM allocations.  This keeps
the inter-zone allocation rates balanced.
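
To make the balance_pgdat() change concrete, here is a userspace sketch of how
the per-zone reclaim target is now chosen on the zone-balancing (non-suspend)
path.  The struct, the watermark values and the batch sizes are hypothetical:

/*
 * Sketch of kswapd's per-zone reclaim target, before and after this
 * patch.  Field names mirror struct zone; the values are made up.
 */
#include <stdio.h>

struct zone_sample {
	const char *name;
	unsigned long free_pages;
	unsigned long pages_high;
	unsigned long reclaim_batch;
};

/* Old behaviour: reclaim only the deficit, skip zones above pages_high */
static long old_to_reclaim(const struct zone_sample *z)
{
	long deficit = (long)z->pages_high - (long)z->free_pages;
	return deficit > 0 ? deficit : 0;	/* 0: zone was skipped */
}

/* New behaviour: always reclaim a size-proportional batch */
static long new_to_reclaim(const struct zone_sample *z)
{
	return (long)z->reclaim_batch;
}

int main(void)
{
	struct zone_sample zones[] = {
		{ "Normal",  900, 768, 31 },	/* already above pages_high */
		{ "HighMem", 200, 512, 22 },	/* below pages_high */
	};
	int i;

	for (i = 0; i < 2; i++)
		printf("%-8s old to_reclaim=%ld  new to_reclaim=%ld\n",
		       zones[i].name, old_to_reclaim(&zones[i]),
		       new_to_reclaim(&zones[i]));
	return 0;
}

The first line of output is the point: a normal zone that is already above
pages_high used to be skipped (to_reclaim <= 0), whereas it now contributes
reclaim_batch pages per pass, which is what lets its free_pages climb high
enough to satisfy the incremental min for __GFP_HIGHMEM allocations.
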
DESC
fix all_zones_ok logic
EDESC
From: Nick Piggin <piggin@cyberone.com.au>




---

 include/linux/mmzone.h |    5 ++++-
 mm/page_alloc.c        |   11 +++++++++++
 mm/vmscan.c            |   24 +++++++++++-------------
 3 files changed, 26 insertions(+), 14 deletions(-)

diff -puN mm/page_alloc.c~zone-balancing-batching mm/page_alloc.c
--- 25/mm/page_alloc.c~zone-balancing-batching	2004-02-25 03:37:18.000000000 -0800
+++ 25-akpm/mm/page_alloc.c	2004-02-25 03:37:18.000000000 -0800
@@ -1019,6 +1019,7 @@ void show_free_areas(void)
 			" min:%lukB"
 			" low:%lukB"
 			" high:%lukB"
+			" batch:%lukB"
 			" active:%lukB"
 			" inactive:%lukB"
 			" present:%lukB"
@@ -1028,6 +1029,7 @@ void show_free_areas(void)
 			K(zone->pages_min),
 			K(zone->pages_low),
 			K(zone->pages_high),
+			K(zone->reclaim_batch),
 			K(zone->nr_active),
 			K(zone->nr_inactive),
 			K(zone->present_pages)
@@ -1622,6 +1624,8 @@ static void setup_per_zone_pages_min(voi
 			lowmem_pages += zone->present_pages;
 
 	for_each_zone(zone) {
+		unsigned long long reclaim_batch;
+
 		spin_lock_irqsave(&zone->lru_lock, flags);
 		if (is_highmem(zone)) {
 			/*
@@ -1648,6 +1652,13 @@ static void setup_per_zone_pages_min(voi
 
 		zone->pages_low = zone->pages_min * 2;
 		zone->pages_high = zone->pages_min * 3;
+
+		reclaim_batch = zone->present_pages * SWAP_CLUSTER_MAX;
+		do_div(reclaim_batch, lowmem_pages);
+		zone->reclaim_batch = reclaim_batch;
+		if (zone->reclaim_batch < 4)
+			zone->reclaim_batch = 4;
+
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
 	}
 }
diff -puN mm/vmscan.c~zone-balancing-batching mm/vmscan.c
--- 25/mm/vmscan.c~zone-balancing-batching	2004-02-25 03:37:18.000000000 -0800
+++ 25-akpm/mm/vmscan.c	2004-02-25 03:37:18.000000000 -0800
@@ -861,13 +861,12 @@ shrink_zone(struct zone *zone, unsigned 
  */
 static int
 shrink_caches(struct zone **zones, int priority, int *total_scanned,
-		int gfp_mask, int nr_pages, struct page_state *ps)
+		int gfp_mask, struct page_state *ps)
 {
 	int ret = 0;
 	int i;
 
 	for (i = 0; zones[i] != NULL; i++) {
-		int to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX);
 		struct zone *zone = zones[i];
 		int nr_scanned;
 
@@ -877,8 +876,8 @@ shrink_caches(struct zone **zones, int p
 		if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 			continue;	/* Let kswapd poll it */
 
-		ret += shrink_zone(zone, gfp_mask,
-				to_reclaim, &nr_scanned, ps, priority);
+		ret += shrink_zone(zone, gfp_mask, zone->reclaim_batch,
+				&nr_scanned, ps, priority);
 		*total_scanned += nr_scanned;
 	}
 	return ret;
@@ -906,7 +905,6 @@ int try_to_free_pages(struct zone **zone
 {
 	int priority;
 	int ret = 0;
-	const int nr_pages = SWAP_CLUSTER_MAX;
 	int nr_reclaimed = 0;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	int i;
@@ -922,7 +920,7 @@ int try_to_free_pages(struct zone **zone
 
 		get_page_state(&ps);
 		nr_reclaimed += shrink_caches(zones, priority, &total_scanned,
-						gfp_mask, nr_pages, &ps);
+						gfp_mask, &ps);
 
 		shrink_slab(total_scanned, gfp_mask);
 		if (reclaim_state) {
@@ -930,7 +928,7 @@ int try_to_free_pages(struct zone **zone
 			reclaim_state->reclaimed_slab = 0;
 		}
 
-		if (nr_reclaimed >= nr_pages) {
+		if (nr_reclaimed >= SWAP_CLUSTER_MAX) {
 			ret = 1;
 			if (gfp_mask & __GFP_FS)
 				wakeup_bdflush(total_scanned);
@@ -1010,14 +1008,14 @@ static int balance_pgdat(pg_data_t *pgda
 			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 				continue;
 
-			if (nr_pages) {		/* Software suspend */
+			if (nr_pages)		/* Software suspend */
 				to_reclaim = min(to_free, SWAP_CLUSTER_MAX*8);
-			} else {		/* Zone balancing */
-				to_reclaim = zone->pages_high-zone->free_pages;
-				if (to_reclaim <= 0)
-					continue;
+			else {			/* Zone balancing */
+				to_reclaim = zone->reclaim_batch;
+				if (zone->free_pages < zone->pages_high)
+					all_zones_ok = 0;
 			}
-			all_zones_ok = 0;
+
 			zone->temp_priority = priority;
 			reclaimed = shrink_zone(zone, GFP_KERNEL,
 					to_reclaim, &nr_scanned, ps, priority);
diff -puN include/linux/mmzone.h~zone-balancing-batching include/linux/mmzone.h
--- 25/include/linux/mmzone.h~zone-balancing-batching	2004-02-25 03:37:18.000000000 -0800
+++ 25-akpm/include/linux/mmzone.h	2004-02-25 03:37:18.000000000 -0800
@@ -69,7 +69,10 @@ struct zone {
 	 */
 	spinlock_t		lock;
 	unsigned long		free_pages;
-	unsigned long		pages_min, pages_low, pages_high;
+	unsigned long		pages_min;
+	unsigned long		pages_low;
+	unsigned long		pages_high;
+	unsigned long		reclaim_batch;
 
 	ZONE_PADDING(_pad1_)
 

_