The logic in balance_pgdat() is muddled in two ways:

- the incoming arg `nr_pages', not the local variable `to_free', should be
  used to determine whether we're being asked to free a specific number of
  pages.

- local variable `to_free' is not appropriate for determining whether we
  failed to bring all zones up to their appropriate free-page levels.

  Fix this by correctly calculating `all_zones_ok' and then using
  all_zones_ok to determine whether we need to throttle kswapd.

So the logic now is:


	for (increasing urgency, i.e. decreasing `priority') {

		all_zones_ok = 1;

		for (all zones) {
			to_reclaim = number of pages to try to reclaim
				     from this zone;
			max_scan = number of pages to scan in this pass
				   (gets larger as `priority' decreases)
			/*
			 * set `reclaimed' to the number of pages which were
			 * actually freed up
			 */
			reclaimed = scan(max_scan pages);
			reclaimed += shrink_slab();

			to_free -= reclaimed;	/* for the `nr_pages>0' case */

			/*
			 * If this scan failed to reclaim `to_reclaim' or more
			 * pages, we're getting into trouble.  Need to scan
			 * some more, and throttle kswapd.   Note that this
			 * zone may now have sufficient free pages due to
			 * freeing activity by some other process.   That's
			 * OK - we'll pick that info up on the next pass
			 * through the loop.
			 */
			if (reclaimed < to_reclaim)
				all_zones_ok = 0;
		}
		if (nr_pages && to_free > 0)
			continue;	/* swsusp: need to do more work */
		if (all_zones_ok)
			break;		/* kswapd is done */
		/*
		 * OK, kswapd is getting into trouble.  Take a nap, then take
		 * another pass across the zones.
		 */
		blk_congestion_wait();
	}


---

 mm/vmscan.c |   32 ++++++++++++++++++++++++--------
 1 files changed, 24 insertions(+), 8 deletions(-)

diff -puN mm/vmscan.c~kswapd-throttling-fixes mm/vmscan.c
--- 25/mm/vmscan.c~kswapd-throttling-fixes	2004-02-28 23:37:55.000000000 -0800
+++ 25-akpm/mm/vmscan.c	2004-02-28 23:37:55.000000000 -0800
@@ -949,40 +949,56 @@ static int balance_pgdat(pg_data_t *pgda
 			int nr_mapped = 0;
 			int max_scan;
 			int to_reclaim;
+			int reclaimed;
 
 			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 				continue;
 
-			if (nr_pages && to_free > 0) {	/* Software suspend */
+			if (nr_pages) {		/* Software suspend */
 				to_reclaim = min(to_free, SWAP_CLUSTER_MAX*8);
-			} else {			/* Zone balancing */
+			} else {		/* Zone balancing */
 				to_reclaim = zone->pages_high-zone->free_pages;
 				if (to_reclaim <= 0)
 					continue;
 			}
 			zone->temp_priority = priority;
-			all_zones_ok = 0;
 			max_scan = zone->nr_inactive >> priority;
 			if (max_scan < to_reclaim * 2)
 				max_scan = to_reclaim * 2;
 			if (max_scan < SWAP_CLUSTER_MAX)
 				max_scan = SWAP_CLUSTER_MAX;
-			to_free -= shrink_zone(zone, max_scan, GFP_KERNEL,
+			reclaimed = shrink_zone(zone, max_scan, GFP_KERNEL,
 					to_reclaim, &nr_mapped, ps);
 			if (i < ZONE_HIGHMEM) {
 				reclaim_state->reclaimed_slab = 0;
 				shrink_slab(max_scan + nr_mapped, GFP_KERNEL);
-				to_free -= reclaim_state->reclaimed_slab;
+				reclaimed += reclaim_state->reclaimed_slab;
 			}
+			to_free -= reclaimed;
 			if (zone->all_unreclaimable)
 				continue;
 			if (zone->pages_scanned > zone->present_pages * 2)
 				zone->all_unreclaimable = 1;
+			/*
+			 * If this scan failed to reclaim `to_reclaim' or more
+			 * pages, we're getting into trouble.  Need to scan
+			 * some more, and throttle kswapd.   Note that this zone
+			 * may now have sufficient free pages due to freeing
+			 * activity by some other process.   That's OK - we'll
+			 * pick that info up on the next pass through the loop.
+			 */
+			if (reclaimed < to_reclaim)
+				all_zones_ok = 0;
 		}
+		if (nr_pages && to_free > 0)
+			continue;	/* swsusp: need to do more work */
 		if (all_zones_ok)
-			break;
-		if (to_free > 0)
-			blk_congestion_wait(WRITE, HZ/10);
+			break;		/* kswapd: all done */
+		/*
+		 * OK, kswapd is getting into trouble.  Take a nap, then take
+		 * another pass across the zones.
+		 */
+		blk_congestion_wait(WRITE, HZ/10);
 	}
 
 	for (i = 0; i < pgdat->nr_zones; i++) {

_