From: "Chen, Kenneth W" <kenneth.w.chen@intel.com>

The issue of exceedingly large hash tables was discussed on the
mailing list a while back, but seems to have slipped through the cracks.

What we found is that it's not a problem for x86 (and most other
architectures) because __get_free_pages won't be able to get anything
beyond order MAX_ORDER-1 (10), which means those hash tables are at most
4MB each (assuming 4K page size).  However, on ia64, in order to support
larger hugeTLB page sizes, MAX_ORDER is bumped up to 18, which now
means a 2GB upper limit enforced by the page allocator (assuming 16K page
size).  PPC64 is another example that bumps up MAX_ORDER.

Last time I checked, the tcp ehash table was taking a whopping (insane!)
2GB on one of our large machines.  The dentry and inode hash tables also
take a considerable amount of memory.

We enforce the maximum size based on the number of entries instead of the
page order.  For the dentry and inode hash tables the upper bound is capped
at 2M entries.  All numbers on x86 remain the same as we don't want to
disturb already established and working numbers.

A left shift of (mempages << PAGE_SHIFT) would overflow on x86 with 4GB or
more physical memory.  The code which handles different shift directions is
there to accommodate different page sizes from 4K to 64K.


---

 fs/dcache.c      |    9 +++++----
 fs/inode.c       |    7 +++++--
 net/ipv4/route.c |    2 +-
 net/ipv4/tcp.c   |    2 +-
 4 files changed, 12 insertions(+), 8 deletions(-)

diff -puN fs/dcache.c~limit-hash-table-sizes fs/dcache.c
--- 25/fs/dcache.c~limit-hash-table-sizes	2004-01-21 00:08:11.000000000 -0800
+++ 25-akpm/fs/dcache.c	2004-01-21 00:08:11.000000000 -0800
@@ -49,6 +49,7 @@ static kmem_cache_t *dentry_cache; 
  */
 #define D_HASHBITS     d_hash_shift
 #define D_HASHMASK     d_hash_mask
+#define D_HASHMAX	(2*1024*1024UL)	/* max number of entries */
 
 static unsigned int d_hash_mask;
 static unsigned int d_hash_shift;
@@ -1552,10 +1553,10 @@ static void __init dcache_init(unsigned 
 	
 	set_shrinker(DEFAULT_SEEKS, shrink_dcache_memory);
 
-#if PAGE_SHIFT < 13
-	mempages >>= (13 - PAGE_SHIFT);
-#endif
-	mempages *= sizeof(struct hlist_head);
+	mempages = PAGE_SHIFT < 13 ?
+		   mempages >> (13 - PAGE_SHIFT) :
+		   mempages << (PAGE_SHIFT - 13);
+	mempages = min(D_HASHMAX, mempages) * sizeof(struct hlist_head);
 	for (order = 0; ((1UL << order) << PAGE_SHIFT) < mempages; order++)
 		;
 
diff -puN fs/inode.c~limit-hash-table-sizes fs/inode.c
--- 25/fs/inode.c~limit-hash-table-sizes	2004-01-21 00:08:11.000000000 -0800
+++ 25-akpm/fs/inode.c	2004-01-21 00:08:11.000000000 -0800
@@ -53,6 +53,7 @@
  */
 #define I_HASHBITS	i_hash_shift
 #define I_HASHMASK	i_hash_mask
+#define I_HASHMAX	(2*1024*1024UL)	/* max number of entries */
 
 static unsigned int i_hash_mask;
 static unsigned int i_hash_shift;
@@ -1325,8 +1326,10 @@ void __init inode_init(unsigned long mem
 	for (i = 0; i < ARRAY_SIZE(i_wait_queue_heads); i++)
 		init_waitqueue_head(&i_wait_queue_heads[i].wqh);
 
-	mempages >>= (14 - PAGE_SHIFT);
-	mempages *= sizeof(struct hlist_head);
+	mempages = PAGE_SHIFT < 14 ?
+		   mempages >> (14 - PAGE_SHIFT) :
+		   mempages << (PAGE_SHIFT - 14);
+	mempages = min(I_HASHMAX, mempages) * sizeof(struct hlist_head);
 	for (order = 0; ((1UL << order) << PAGE_SHIFT) < mempages; order++)
 		;
 
diff -puN net/ipv4/route.c~limit-hash-table-sizes net/ipv4/route.c
--- 25/net/ipv4/route.c~limit-hash-table-sizes	2004-01-21 00:08:11.000000000 -0800
+++ 25-akpm/net/ipv4/route.c	2004-01-21 00:08:23.000000000 -0800
@@ -2744,7 +2744,7 @@ int __init ip_rt_init(void)
 
 	goal = num_physpages >> (26 - PAGE_SHIFT);
 
-	for (order = 0; (1UL << order) < goal; order++)
+	for (order = 0; (order < 10) && ((1UL << order) < goal); order++)
 		/* NOTHING */;
 
 	do {
diff -puN net/ipv4/tcp.c~limit-hash-table-sizes net/ipv4/tcp.c
--- 25/net/ipv4/tcp.c~limit-hash-table-sizes	2004-01-21 00:08:11.000000000 -0800
+++ 25-akpm/net/ipv4/tcp.c	2004-01-21 00:08:23.000000000 -0800
@@ -2610,7 +2610,7 @@ void __init tcp_init(void)
 	else
 		goal = num_physpages >> (23 - PAGE_SHIFT);
 
-	for (order = 0; (1UL << order) < goal; order++)
+	for (order = 0; (order < 10) && ((1UL << order) < goal); order++)
 		;
 	do {
 		tcp_ehash_size = (1UL << order) * PAGE_SIZE /

_