From: Alexander Nyberg <alexn@dsv.su.se>

Introduces CONFIG_PAGE_OWNER that keeps track of the call chain under which a
page was allocated.  Includes a user-space helper in
Documentation/page_owner.c to sort the enormous amount of output that this may
give (thanks tridge).

Information available through /proc/page_owner

x86_64 introduces some stack noise in certain call chains so for exact
output use of x86 && CONFIG_FRAME_POINTER is suggested.  Tested on x86, x86
&& CONFIG_FRAME_POINTER, x86_64

Output looks like:

4819 times:
Page allocated via order 0, mask 0x50
[0xc012b7b9] find_lock_page+25
[0xc012b8c8] find_or_create_page+152
[0xc0147d74] grow_dev_page+36
[0xc0148164] __find_get_block+84
[0xc0147ebc] __getblk_slow+124
[0xc0148164] __find_get_block+84
[0xc01481e7] __getblk+55
[0xc0185d14] do_readahead+100

We use a custom stack unwinder because using __builtin_return_address([0-7])
causes gcc to generate code that might try to unwind the stack looking for
function return addresses and "fall off", causing early panics if the call
chain is not deep enough.  With the builtin we could therefore only have
recorded a depth of around 3 functions in all traces (I experimented a bit
with this).


From: Dave Hansen <haveblue@us.ibm.com>

	make page_owner handle non-contiguous page ranges

From: Alexander Nyberg <alexn@telia.com>

I've cleaned up the __alloc_pages() part to a simple set_page_owner() call.

Signed-off-by: Alexander Nyberg <alexn@dsv.su.se>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 Documentation/page_owner.c |  140 +++++++++++++++++++++++++++++++++++++++++++++
 fs/proc/proc_misc.c        |   67 +++++++++++++++++++++
 include/linux/mm.h         |    5 +
 lib/Kconfig.debug          |   10 +++
 mm/page_alloc.c            |   62 +++++++++++++++++++
 5 files changed, 284 insertions(+)

diff -puN /dev/null Documentation/page_owner.c
--- /dev/null	2003-09-15 06:40:47.000000000 -0700
+++ devel-akpm/Documentation/page_owner.c	2005-09-07 20:10:59.000000000 -0700
@@ -0,0 +1,140 @@
+/*
+ * User-space helper to sort the output of /proc/page_owner
+ *
+ * Example use:
+ * cat /proc/page_owner > page_owner.txt
+ * ./sort page_owner.txt sorted_page_owner.txt
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <string.h>
+
+struct block_list {
+	char *txt;	/* NUL-terminated copy of the block text, malloc'ed by add_list() */
+	int len;	/* length of txt in bytes, not counting the terminator */
+	int num;	/* how many times this block has been seen */
+};
+
+
+static struct block_list *list;	/* block table, allocated in main() */
+static int list_size;	/* number of entries of list[] in use */
+static int max_size;	/* capacity of list[], set in main() */
+
+struct block_list *block_head;	/* NOTE(review): not referenced anywhere in this file */
+
+/* Read one block (text up to a blank line) into buf; return its length, or -1 at EOF. */
+int read_block(char *buf, FILE *fin)
+{
+	int ret = 0, hit = 0;	/* hit: the previous character was '\n' */
+	int c;	/* int, not char: otherwise EOF cannot be told apart from a data byte */
+
+	for (;;) {
+		c = getc(fin);
+		if (c == EOF) return -1;	/* a partial block before EOF is dropped */
+
+		buf[ret] = c;	/* NOTE: no bounds check; buf must hold the longest block */
+		ret++;
+		if (c == '\n' && hit == 1)
+			return ret - 1;	/* do not count the blank line's '\n' */
+		else if (c == '\n')
+			hit = 1;
+		else
+			hit = 0;
+	}
+}
+
+/* qsort() comparators (text ascending, count descending); qsort() passes const void *. */
+static int compare_txt(const void *l1, const void *l2)
+{
+	return strcmp(((const struct block_list *)l1)->txt, ((const struct block_list *)l2)->txt);
+}
+static int compare_num(const void *l1, const void *l2)
+{
+	return ((const struct block_list *)l2)->num - ((const struct block_list *)l1)->num;
+}
+
+static void add_list(char *buf, int len)
+{	/* Record one block: bump the count of the last entry if identical, else append a copy. */
+	if (list_size != 0 &&
+	    len == list[list_size-1].len &&
+	    memcmp(buf, list[list_size-1].txt, len) == 0) {
+		list[list_size-1].num++;	/* only consecutive duplicates are merged here */
+		return;
+	}
+	if (list_size == max_size) {	/* table full */
+		printf("max_size too small??\n");
+		exit(1);
+	}
+	list[list_size].txt = malloc(len+1);	/* NOTE(review): result is not checked for NULL */
+	list[list_size].len = len;
+	list[list_size].num = 1;
+	memcpy(list[list_size].txt, buf, len);
+	list[list_size].txt[len] = 0;
+	list_size++;
+	if (list_size % 1000 == 0) {	/* progress report every 1000 stored blocks */
+		printf("loaded %d\r", list_size);
+		fflush(stdout);
+	}
+}
+
+/* Read blocks from argv[1], merge identical ones and write them to argv[2], most frequent first. */
+int main(int argc, char **argv)
+{
+	FILE *fin, *fout;
+	char buf[1024];
+	int ret, i, count;
+	struct block_list *list2;
+	struct stat st;
+
+	fin = argc > 2 ? fopen(argv[1], "r") : NULL;	/* argv[1..2] exist only if argc > 2 */
+	fout = fin ? fopen(argv[2], "w") : NULL;	/* don't truncate the output if the input failed */
+	if (!fin || !fout) {
+		printf("Usage: ./program <input> <output>\n");
+		if (argc > 2)
+			perror("open: ");
+		exit(2);
+	}
+
+	if (fstat(fileno(fin), &st) < 0) {
+		perror("fstat");
+		exit(2);
+	}
+	max_size = st.st_size / 100 + 1; /* hack ... (+1 so a tiny file still gets a slot) */
+	list = malloc(max_size * sizeof(*list));
+	if (!list) {
+		perror("malloc");
+		exit(2);
+	}
+
+	for(;;) {
+		ret = read_block(buf, fin);
+		if (ret < 0)
+			break;
+		buf[ret] = '\0';
+		add_list(buf, ret);
+	}
+
+	printf("loaded %d\n", list_size);
+	printf("sorting ....\n");
+	qsort(list, list_size, sizeof(list[0]), compare_txt);
+
+	printf("culling\n");
+	list2 = list;	/* compact in place (count <= i always), so no second allocation */
+	for (i=count=0;i<list_size;i++) {
+		if (count == 0 ||
+		    strcmp(list2[count-1].txt, list[i].txt) != 0) {
+			list2[count++] = list[i];
+		} else {
+			list2[count-1].num += list[i].num;
+		}
+	}
+	qsort(list2, count, sizeof(list[0]), compare_num);
+	for (i=0;i<count;i++)
+		fprintf(fout, "%d times:\n%s\n", list2[i].num, list2[i].txt);
+	return 0;
+}
diff -puN fs/proc/proc_misc.c~page-owner-tracking-leak-detector fs/proc/proc_misc.c
--- devel/fs/proc/proc_misc.c~page-owner-tracking-leak-detector	2005-09-07 20:10:59.000000000 -0700
+++ devel-akpm/fs/proc/proc_misc.c	2005-09-07 20:10:59.000000000 -0700
@@ -703,6 +703,66 @@ static struct file_operations proc_sysrq
 };
 #endif
 
+#ifdef CONFIG_PAGE_OWNER
+#include <linux/bootmem.h>
+#include <linux/kallsyms.h>
+/* /proc/page_owner read: emit the record of the next allocated page at or after *ppos. */
+static ssize_t
+read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+	unsigned long pfn;
+	struct page *page = NULL;
+	char *kbuf, *modname;
+	const char *symname;
+	int ret = 0, i;
+	char namebuf[128];
+	unsigned long offset = 0, symsize;
+
+	/* Find the next valid pfn whose page is currently allocated (order >= 0). */
+	for (pfn = min_low_pfn + *ppos; pfn < max_pfn; pfn++) {
+		if (!pfn_valid(pfn))
+			continue;
+		page = pfn_to_page(pfn);
+		if (page->order >= 0)
+			break;
+	}
+
+	if (pfn >= max_pfn)	/* ran off the end: nothing left to report */
+		return 0;
+
+	/* Resume after this pfn; counting skipped pages loses track across holes. */
+	*ppos = (pfn - min_low_pfn) + 1;
+
+	kbuf = kmalloc(count, GFP_KERNEL);
+	if (!kbuf)
+		return -ENOMEM;
+
+	/* scnprintf() returns what was actually stored, so ret never exceeds count. */
+	ret = scnprintf(kbuf, count, "Page allocated via order %d, mask 0x%x\n",
+			page->order, page->gfp_mask);
+
+	for (i = 0; i < 8; i++) {
+		if (!page->trace[i])
+			break;
+		symname = kallsyms_lookup(page->trace[i], &symsize, &offset, &modname, namebuf);
+		ret += scnprintf(kbuf + ret, count - ret, "[0x%lx] %s+%lu\n",
+			page->trace[i], symname ? symname : "???", offset);
+	}
+
+	ret += scnprintf(kbuf + ret, count - ret, "\n");
+
+	if (copy_to_user(buf, kbuf, ret))
+		ret = -EFAULT;
+
+	kfree(kbuf);
+	return ret;
+}
+
+static struct file_operations proc_page_owner_operations = {
+	.read		= read_page_owner,
+};
+#endif
+
 struct proc_dir_entry *proc_root_kcore;
 
 void create_seq_entry(char *name, mode_t mode, struct file_operations *f)
@@ -787,4 +847,11 @@ void __init proc_misc_init(void)
 			entry->proc_fops = &ppc_htab_operations;
 	}
 #endif
+#ifdef CONFIG_PAGE_OWNER
+	entry = create_proc_entry("page_owner", S_IWUSR | S_IRUGO, NULL);
+	if (entry) {
+		entry->proc_fops = &proc_page_owner_operations;
+		entry->size = 1024;
+	}
+#endif
 }
diff -puN include/linux/mm.h~page-owner-tracking-leak-detector include/linux/mm.h
--- devel/include/linux/mm.h~page-owner-tracking-leak-detector	2005-09-07 20:10:59.000000000 -0700
+++ devel-akpm/include/linux/mm.h	2005-09-07 20:10:59.000000000 -0700
@@ -257,6 +257,11 @@ struct page {
 	void *virtual;			/* Kernel virtual address (NULL if
 					   not kmapped, ie. highmem) */
 #endif /* WANT_PAGE_VIRTUAL */
+#ifdef CONFIG_PAGE_OWNER
+	int order;
+	unsigned int gfp_mask;
+	unsigned long trace[8];
+#endif
 };
 
 /*
diff -puN lib/Kconfig.debug~page-owner-tracking-leak-detector lib/Kconfig.debug
--- devel/lib/Kconfig.debug~page-owner-tracking-leak-detector	2005-09-07 20:10:59.000000000 -0700
+++ devel-akpm/lib/Kconfig.debug	2005-09-07 20:10:59.000000000 -0700
@@ -161,6 +161,16 @@ config DEBUG_IOREMAP
 	  automatically, but we'd like to make it more efficient by not
 	  having to do that.
 
+config PAGE_OWNER
+	bool "Track page owner"
+	depends on DEBUG_KERNEL && X86
+	help
+	  This keeps track of the call chain that allocated each page, which may
+	  help to find bare alloc_page(s) leaks. Eats a fair amount of memory.
+	  See Documentation/page_owner.c for user-space helper.
+
+	  If unsure, say N.
+
 config DEBUG_FS
 	bool "Debug Filesystem"
 	depends on DEBUG_KERNEL && SYSFS
diff -puN mm/page_alloc.c~page-owner-tracking-leak-detector mm/page_alloc.c
--- devel/mm/page_alloc.c~page-owner-tracking-leak-detector	2005-09-07 20:10:59.000000000 -0700
+++ devel-akpm/mm/page_alloc.c	2005-09-07 20:10:59.000000000 -0700
@@ -796,6 +796,58 @@ should_reclaim_zone(struct zone *z, unsi
 	return 1;
 }
 
+#ifdef CONFIG_PAGE_OWNER
+static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
+{	/* True if p is strictly above tinfo and below tinfo + THREAD_SIZE - 3. */
+	return	p > (void *)tinfo &&
+		p < (void *)tinfo + THREAD_SIZE - 3;
+}
+
+static inline void __stack_trace(struct page *page, unsigned long *stack,
+			unsigned long bp)
+{	/* Fill page->trace[] with up to 8 addresses taken from the stack that 'stack' points into. */
+	int i = 0;
+	unsigned long addr;
+	struct thread_info *tinfo = (struct thread_info *)
+		((unsigned long)stack & (~(THREAD_SIZE - 1)));	/* round down to a THREAD_SIZE boundary */
+
+	memset(page->trace, 0, sizeof(long) * 8);	/* unused slots stay 0 */
+
+#ifdef CONFIG_FRAME_POINTER
+	while (valid_stack_ptr(tinfo, (void *)bp)) {
+		addr = *(unsigned long *)(bp + sizeof(long));	/* word above bp; presumably the return address */
+		page->trace[i] = addr;
+		if (++i >= 8)
+			break;
+		bp = *(unsigned long *)bp;	/* presumably the caller's saved frame pointer */
+	}
+#else
+	while (valid_stack_ptr(tinfo, stack)) {	/* no frame pointers: scan every stack word */
+		addr = *stack++;
+		if (__kernel_text_address(addr)) {	/* record only words accepted by __kernel_text_address() */
+			page->trace[i] = addr;
+			if (++i >= 8)
+				break;
+		}
+	}
+#endif
+}
+
+static inline void set_page_owner(struct page *page,
+			unsigned int order, unsigned int gfp_mask)
+{	/* Stamp page with its allocation order, gfp mask and the allocating call chain. */
+	unsigned long address, bp;	/* &address: a stack location in this frame */
+#ifdef CONFIG_X86_64	/* Kconfig symbols carry the CONFIG_ prefix; bare X86_64 is never defined */
+	asm ("movq %%rbp, %0" : "=r" (bp) : );
+#else
+	asm ("movl %%ebp, %0" : "=r" (bp) : );
+#endif
+	page->order = (int) order;
+	page->gfp_mask = gfp_mask;
+	__stack_trace(page, &address, bp);
+}
+#endif /* CONFIG_PAGE_OWNER */
+
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
@@ -994,6 +1046,10 @@ nopage:
 	}
 	return NULL;
 got_pg:
+
+#ifdef CONFIG_PAGE_OWNER
+	set_page_owner(page, order, gfp_mask);
+#endif
 	zone_statistics(zonelist, z);
 	return page;
 }
@@ -1047,6 +1103,9 @@ fastcall void __free_pages(struct page *
 			free_hot_page(page);
 		else
 			__free_pages_ok(page, order);
+#ifdef CONFIG_PAGE_OWNER
+		page->order = -1;
+#endif
 	}
 }
 
@@ -1716,6 +1775,9 @@ void __devinit memmap_init_zone(unsigned
 		if (!is_highmem_idx(zone))
 			set_page_address(page, __va(pfn << PAGE_SHIFT));
 #endif
+#ifdef CONFIG_PAGE_OWNER
+		page->order = -1;
+#endif
 	}
 }
 
_