From: Pavel Machek <pavel@ucw.cz>

The following patch is designed to fix a problem in the current implementation
of swsusp in mainline kernels.  Namely, swsusp uses an array of page backup
entries (aka pagedir) to store pointers to memory pages that must be saved
during suspend and restored during resume.

Unfortunately, the pagedir has to be located in a contiguous chunk of memory
and it sometimes turns out that an 8-order or even 9-order allocation is
needed for this purpose.  It sometimes is impossible to get such an allocation
and swsusp may fail during either suspend or resume due to the lack of memory,
although theoretically there is enough free memory for it to succeed.

Moreover, swsusp is more likely to fail for this reason during resume, which
means that it may fail during resume after a successful suspend (this actually
has happened for some people, including me :-)) and this, potentially, may
lead to the loss of data.

The problem is fixed by replacing the pagedir with a linklist so that
high-order memory allocations are avoided (the patches make swsusp use only
0-order allocations).  Unfortunately this means that it's necessary to change
assembly routines used to restore the image after it's been loaded from swap
so that they walk the list instead of walking the array.

This patch makes swsusp allocate only individual pages during resume.  it
contains the necessary changes to the assembly routines etc.  for i386 and
x86-64.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 25-akpm/arch/i386/kernel/asm-offsets.c   |    6 
 25-akpm/arch/i386/power/swsusp.S         |   23 +-
 25-akpm/arch/x86_64/kernel/asm-offsets.c |    2 
 25-akpm/arch/x86_64/kernel/suspend_asm.S |   17 -
 25-akpm/kernel/power/swsusp.c            |  337 +++++++++++++++++++++----------
 5 files changed, 263 insertions(+), 122 deletions(-)

diff -puN arch/i386/kernel/asm-offsets.c~swsusp-use-non-contiguous-memory-on-resume arch/i386/kernel/asm-offsets.c
--- 25/arch/i386/kernel/asm-offsets.c~swsusp-use-non-contiguous-memory-on-resume	2005-03-09 16:43:10.000000000 -0800
+++ 25-akpm/arch/i386/kernel/asm-offsets.c	2005-03-09 16:43:10.000000000 -0800
@@ -7,6 +7,7 @@
 #include <linux/sched.h>
 #include <linux/signal.h>
 #include <linux/personality.h>
+#include <linux/suspend.h>
 #include <asm/ucontext.h>
 #include "sigframe.h"
 #include <asm/fixmap.h>
@@ -56,6 +57,11 @@ void foo(void)
 
 	OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
 	OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
+	BLANK();
+
+	OFFSET(pbe_address, pbe, address);
+	OFFSET(pbe_orig_address, pbe, orig_address);
+	OFFSET(pbe_next, pbe, next);
 
 	/* Offset from the sysenter stack to tss.esp0 */
 	DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, esp0) -
diff -puN arch/i386/power/swsusp.S~swsusp-use-non-contiguous-memory-on-resume arch/i386/power/swsusp.S
--- 25/arch/i386/power/swsusp.S~swsusp-use-non-contiguous-memory-on-resume	2005-03-09 16:43:10.000000000 -0800
+++ 25-akpm/arch/i386/power/swsusp.S	2005-03-09 16:43:10.000000000 -0800
@@ -12,6 +12,7 @@
 #include <linux/linkage.h>
 #include <asm/segment.h>
 #include <asm/page.h>
+#include <asm/asm_offsets.h>
 
 	.text
 
@@ -28,28 +29,28 @@ ENTRY(swsusp_arch_suspend)
 	ret
 
 ENTRY(swsusp_arch_resume)
-	movl $swsusp_pg_dir-__PAGE_OFFSET,%ecx
-	movl %ecx,%cr3
+	movl	$swsusp_pg_dir-__PAGE_OFFSET, %ecx
+	movl	%ecx, %cr3
 
-	movl	pagedir_nosave, %ebx
-	xorl	%eax, %eax
-	xorl	%edx, %edx
+	movl	pagedir_nosave, %edx
 	.p2align 4,,7
 
 copy_loop:
-	movl	4(%ebx,%edx),%edi
-	movl	(%ebx,%edx),%esi
+	testl	%edx, %edx
+	jz	done
+
+	movl	pbe_address(%edx), %esi
+	movl	pbe_orig_address(%edx), %edi
 
 	movl	$1024, %ecx
 	rep
 	movsl
 
-	incl	%eax
-	addl	$16, %edx
-	cmpl	nr_copy_pages,%eax
-	jb copy_loop
+	movl	pbe_next(%edx), %edx
+	jmp	copy_loop
 	.p2align 4,,7
 
+done:
 	movl saved_context_esp, %esp
 	movl saved_context_ebp, %ebp
 	movl saved_context_ebx, %ebx
diff -puN arch/x86_64/kernel/asm-offsets.c~swsusp-use-non-contiguous-memory-on-resume arch/x86_64/kernel/asm-offsets.c
--- 25/arch/x86_64/kernel/asm-offsets.c~swsusp-use-non-contiguous-memory-on-resume	2005-03-09 16:43:10.000000000 -0800
+++ 25-akpm/arch/x86_64/kernel/asm-offsets.c	2005-03-09 16:43:10.000000000 -0800
@@ -62,8 +62,8 @@ int main(void)
 	       offsetof (struct rt_sigframe32, uc.uc_mcontext));
 	BLANK();
 #endif
-	DEFINE(SIZEOF_PBE, sizeof(struct pbe));
 	DEFINE(pbe_address, offsetof(struct pbe, address));
 	DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address));
+	DEFINE(pbe_next, offsetof(struct pbe, next));
 	return 0;
 }
diff -puN arch/x86_64/kernel/suspend_asm.S~swsusp-use-non-contiguous-memory-on-resume arch/x86_64/kernel/suspend_asm.S
--- 25/arch/x86_64/kernel/suspend_asm.S~swsusp-use-non-contiguous-memory-on-resume	2005-03-09 16:43:10.000000000 -0800
+++ 25-akpm/arch/x86_64/kernel/suspend_asm.S	2005-03-09 16:43:10.000000000 -0800
@@ -54,16 +54,10 @@ ENTRY(swsusp_arch_resume)
 	movq	%rax, %cr4;  # turn PGE back on
 
 	movq	pagedir_nosave(%rip), %rdx
-	/* compute the limit */
-	movl	nr_copy_pages(%rip), %eax
-	testl	%eax, %eax
-	jz	done
-	movq	%rdx,%r8
-	movl	$SIZEOF_PBE,%r9d
-	mul		%r9  # with rax, clobbers rdx
-	movq 	%r8, %rdx
-	addq	%r8, %rax
 loop:
+	testq	%rdx, %rdx
+	jz	done
+
 	/* get addresses from the pbe and copy the page */
 	movq	pbe_address(%rdx), %rsi
 	movq	pbe_orig_address(%rdx), %rdi
@@ -72,9 +66,8 @@ loop:
 	movsq
 
 	/* progress to the next pbe */
-	addq	$SIZEOF_PBE, %rdx
-	cmpq	%rax, %rdx
-	jb	loop
+	movq	pbe_next(%rdx), %rdx
+	jmp	loop
 done:
 	movl	$24, %eax
 	movl	%eax, %ds
diff -puN kernel/power/swsusp.c~swsusp-use-non-contiguous-memory-on-resume kernel/power/swsusp.c
--- 25/kernel/power/swsusp.c~swsusp-use-non-contiguous-memory-on-resume	2005-03-09 16:43:10.000000000 -0800
+++ 25-akpm/kernel/power/swsusp.c	2005-03-09 16:43:10.000000000 -0800
@@ -621,6 +621,46 @@ static inline void free_pagedir(struct p
 }
 
 /**
+ *	fill_pb_page - Create a list of PBEs on a given memory page
+ */
+
+static inline void fill_pb_page(struct pbe *pbpage)
+{
+	struct pbe *p;
+
+	p = pbpage;
+	pbpage += PB_PAGE_SKIP;
+	do
+		p->next = p + 1;
+	while (++p < pbpage);
+}
+
+/**
+ *	create_pbe_list - Create a list of PBEs on top of a given chain
+ *	of memory pages allocated with alloc_pagedir()
+ */
+
+static void create_pbe_list(struct pbe *pblist, unsigned nr_pages)
+{
+	struct pbe *pbpage, *p;
+	unsigned num = PBES_PER_PAGE;
+
+	for_each_pb_page (pbpage, pblist) {
+		if (num >= nr_pages)
+			break;
+
+		fill_pb_page(pbpage);
+		num += PBES_PER_PAGE;
+	}
+	if (pbpage) {
+		for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++)
+			p->next = p + 1;
+		p->next = NULL;
+	}
+	pr_debug("create_pbe_list(): initialized %d PBEs\n", num);
+}
+
+/**
  *	alloc_pagedir - Allocate the page directory.
  *
  *	First, determine exactly how many pages we need and
@@ -636,7 +676,7 @@ static inline void free_pagedir(struct p
 static struct pbe * alloc_pagedir(unsigned nr_pages)
 {
 	unsigned num;
-	struct pbe *pblist, *pbe, *p;
+	struct pbe *pblist, *pbe;
 
 	if (!nr_pages)
 		return NULL;
@@ -645,21 +685,13 @@ static struct pbe * alloc_pagedir(unsign
 	pblist = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
 	for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages;
         		pbe = pbe->next, num += PBES_PER_PAGE) {
-		p = pbe;
 		pbe += PB_PAGE_SKIP;
-		do
-			p->next = p + 1;
-		while (p++ < pbe);
 		pbe->next = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
 	}
-	if (pbe) {
-		for (num -= PBES_PER_PAGE - 1, p = pbe; num < nr_pages; p++, num++)
-			p->next = p + 1;
-	} else { /* get_zeroed_page() failed */
+	if (!pbe) { /* get_zeroed_page() failed */
 		free_pagedir(pblist);
 		pblist = NULL;
         }
-	pr_debug("alloc_pagedir(): allocated %d PBEs\n", num);
 	return pblist;
 }
 
@@ -766,6 +798,7 @@ static int swsusp_alloc(void)
 		printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
 		return -ENOMEM;
 	}
+	create_pbe_list(pagedir_save, nr_copy_pages);
 	pagedir_nosave = pagedir_save;
 	if ((error = alloc_image_pages())) {
 		printk(KERN_ERR "suspend: Allocating image pages failed.\n");
@@ -917,44 +950,104 @@ static int __init does_collide_order(uns
 	return 0;
 }
 
-/*
- * We check here that pagedir & pages it points to won't collide with pages
- * where we're going to restore from the loaded pages later
+/**
+ *	On resume, for storing the PBE list and the image,
+ *	we can only use memory pages that do not conflict with the pages
+ *	which had been used before suspend.
+ *
+ *	We don't know which pages are usable until we allocate them.
+ *
+ *	Allocated but unusable (ie eaten) memory pages are linked together
+ *	to create a list, so that we can free them easily
+ *
+ *	We could have used a type other than (void *)
+ *	for this purpose, but ...
  */
-static int __init check_pagedir(void)
+static void **eaten_memory = NULL;
+
+static inline void eat_page(void *page)
 {
-	int i;
+	void **c;
 
-	for(i=0; i < nr_copy_pages; i++) {
-		unsigned long addr;
+	c = eaten_memory;
+	eaten_memory = page;
+	*eaten_memory = c;
+}
 
-		do {
-			addr = get_zeroed_page(GFP_ATOMIC);
-			if(!addr)
-				return -ENOMEM;
-		} while (does_collide_order(addr, 0));
+static unsigned long __init get_usable_page(unsigned gfp_mask)
+{
+	unsigned long m;
 
-		(pagedir_nosave+i)->address = addr;
+	m = get_zeroed_page(gfp_mask);
+	while (does_collide_order(m, 0)) {
+		eat_page((void *)m);
+		m = get_zeroed_page(gfp_mask);
+		if (!m)
+			break;
 	}
-	return 0;
+	return m;
 }
 
-static int __init swsusp_pagedir_relocate(void)
+static void __init free_eaten_memory(void)
 {
-	/*
-	 * We have to avoid recursion (not to overflow kernel stack),
-	 * and that's why code looks pretty cryptic 
+	unsigned long m;
+	void **c;
+	int i = 0;
+
+	c = eaten_memory;
+	while (c) {
+		m = (unsigned long)c;
+		c = *c;
+		free_page(m);
+		i++;
+	}
+	eaten_memory = NULL;
+	pr_debug("swsusp: %d unused pages freed\n", i);
+}
+
+/**
+ *	check_pagedir - We ensure here that pages that the PBEs point to
+ *	won't collide with pages where we're going to restore from the loaded
+ *	pages later
+ */
+
+static int __init check_pagedir(struct pbe *pblist)
+{
+	struct pbe *p;
+
+	/* This is necessary, so that we can free allocated pages
+	 * in case of failure
 	 */
-	suspend_pagedir_t *old_pagedir = pagedir_nosave;
-	void **eaten_memory = NULL;
-	void **c = eaten_memory, *m, *f;
-	int ret = 0;
+	for_each_pbe (p, pblist)
+		p->address = 0UL;
+
+	for_each_pbe (p, pblist) {
+		p->address = get_usable_page(GFP_ATOMIC);
+		if (!p->address)
+			return -ENOMEM;
+	}
+	return 0;
+}
+
+/**
+ *	swsusp_pagedir_relocate - It is possible, that some memory pages
+ *	occupied by the list of PBEs collide with pages where we're going to
+ *	restore from the loaded pages later.  We relocate them here.
+ */
+
+static struct pbe * __init swsusp_pagedir_relocate(struct pbe *pblist)
+{
 	struct zone *zone;
-	int i;
-	struct pbe *p;
 	unsigned long zone_pfn;
+	struct pbe *pbpage, *tail, *p;
+	void *m;
+	int rel = 0, error = 0;
+
+	if (!pblist) /* a sanity check */
+		return NULL;
 
-	printk("Relocating pagedir ");
+	pr_debug("swsusp: Relocating pagedir (%lu pages to check)\n",
+			swsusp_info.pagedir_pages);
 
 	/* Set page flags */
 
@@ -964,45 +1057,52 @@ static int __init swsusp_pagedir_relocat
 					zone->zone_start_pfn));
 	}
 
-	/* Clear orig address */
+	/* Clear orig addresses */
 
-	for(i = 0, p = pagedir_nosave; i < nr_copy_pages; i++, p++) {
+	for_each_pbe (p, pblist)
 		ClearPageNosaveFree(virt_to_page(p->orig_address));
-	}
 
-	if (!does_collide_order((unsigned long)old_pagedir, pagedir_order)) {
-		printk("not necessary\n");
-		return check_pagedir();
-	}
+	tail = pblist + PB_PAGE_SKIP;
 
-	while ((m = (void *) __get_free_pages(GFP_ATOMIC, pagedir_order)) != NULL) {
-		if (!does_collide_order((unsigned long)m, pagedir_order))
-			break;
-		eaten_memory = m;
-		printk( "." ); 
-		*eaten_memory = c;
-		c = eaten_memory;
-	}
+	/* Relocate colliding pages */
 
-	if (!m) {
-		printk("out of memory\n");
-		ret = -ENOMEM;
-	} else {
-		pagedir_nosave =
-			memcpy(m, old_pagedir, PAGE_SIZE << pagedir_order);
+	for_each_pb_page (pbpage, pblist) {
+		if (does_collide_order((unsigned long)pbpage, 0)) {
+			m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD);
+			if (!m) {
+				error = -ENOMEM;
+				break;
+			}
+			memcpy(m, (void *)pbpage, PAGE_SIZE);
+			if (pbpage == pblist)
+				pblist = (struct pbe *)m;
+			else
+				tail->next = (struct pbe *)m;
+
+			eat_page((void *)pbpage);
+			pbpage = (struct pbe *)m;
+
+			/* We have to link the PBEs again */
+
+			for (p = pbpage; p < pbpage + PB_PAGE_SKIP; p++)
+				if (p->next) /* needed to save the end */
+					p->next = p + 1;
+
+			rel++;
+		}
+		tail = pbpage + PB_PAGE_SKIP;
 	}
 
-	c = eaten_memory;
-	while (c) {
-		printk(":");
-		f = c;
-		c = *c;
-		free_pages((unsigned long)f, pagedir_order);
+	if (error) {
+		printk("\nswsusp: Out of memory\n\n");
+		free_pagedir(pblist);
+		free_eaten_memory();
+		pblist = NULL;
 	}
-	if (ret)
-		return ret;
-	printk("|\n");
-	return check_pagedir();
+	else
+		printk("swsusp: Relocated %d pages\n", rel);
+
+	return pblist;
 }
 
 /**
@@ -1147,76 +1247,117 @@ static int __init check_sig(void)
 }
 
 /**
- *	swsusp_read_data - Read image pages from swap.
+ *	data_read - Read image pages from swap.
  *
  *	You do not need to check for overlaps, check_pagedir()
  *	already did that.
  */
 
-static int __init data_read(void)
+static int __init data_read(struct pbe *pblist)
 {
 	struct pbe * p;
-	int error;
-	int i;
-	int mod = nr_copy_pages / 100;
+	int error = 0;
+	int i = 0;
+	int mod = swsusp_info.image_pages / 100;
 
 	if (!mod)
 		mod = 1;
 
-	if ((error = swsusp_pagedir_relocate()))
-		return error;
+	printk("swsusp: Reading image data (%lu pages):     ",
+			swsusp_info.image_pages);
+
+	for_each_pbe (p, pblist) {
+		if (!(i % mod))
+			printk("\b\b\b\b%3d%%", i / mod);
 
-	printk( "Reading image data (%d pages):     ", nr_copy_pages );
-	for(i = 0, p = pagedir_nosave; i < nr_copy_pages && !error; i++, p++) {
-		if (!(i%mod))
-			printk( "\b\b\b\b%3d%%", i / mod );
 		error = bio_read_page(swp_offset(p->swap_address),
 				  (void *)p->address);
+		if (error)
+			return error;
+
+		i++;
 	}
-	printk(" %d done.\n",i);
+	printk("\b\b\b\bdone\n");
 	return error;
-
 }
 
 extern dev_t __init name_to_dev_t(const char *line);
 
-static int __init read_pagedir(void)
+/**
+ *	read_pagedir - Read page backup list pages from swap
+ */
+
+static int __init read_pagedir(struct pbe *pblist)
 {
-	unsigned long addr;
-	int i, n = swsusp_info.pagedir_pages;
-	int error = 0;
+	struct pbe *pbpage, *p;
+	unsigned i = 0;
+	int error;
 
-	addr = __get_free_pages(GFP_ATOMIC, pagedir_order);
-	if (!addr)
-		return -ENOMEM;
-	pagedir_nosave = (struct pbe *)addr;
+	if (!pblist)
+		return -EFAULT;
+
+	printk("swsusp: Reading pagedir (%lu pages)\n",
+			swsusp_info.pagedir_pages);
 
-	pr_debug("swsusp: Reading pagedir (%d Pages)\n",n);
+	for_each_pb_page (pbpage, pblist) {
+		unsigned long offset = swp_offset(swsusp_info.pagedir[i++]);
 
-	for (i = 0; i < n && !error; i++, addr += PAGE_SIZE) {
-		unsigned long offset = swp_offset(swsusp_info.pagedir[i]);
-		if (offset)
-			error = bio_read_page(offset, (void *)addr);
-		else
-			error = -EFAULT;
+		error = -EFAULT;
+		if (offset) {
+			p = (pbpage + PB_PAGE_SKIP)->next;
+			error = bio_read_page(offset, (void *)pbpage);
+			(pbpage + PB_PAGE_SKIP)->next = p;
+		}
+		if (error)
+			break;
 	}
+
 	if (error)
-		free_pages((unsigned long)pagedir_nosave, pagedir_order);
+		free_page((unsigned long)pblist);
+
+	BUG_ON(i != swsusp_info.pagedir_pages);
+
 	return error;
 }
 
+
 static int __init read_suspend_image(void)
 {
 	int error = 0;
+	struct pbe *p;
 
 	if ((error = check_sig()))
 		return error;
+
 	if ((error = check_header()))
 		return error;
-	if ((error = read_pagedir()))
+
+	if (!(p = alloc_pagedir(nr_copy_pages)))
+		return -ENOMEM;
+
+	if ((error = read_pagedir(p)))
 		return error;
-	if ((error = data_read()))
-		free_pages((unsigned long)pagedir_nosave, pagedir_order);
+
+	create_pbe_list(p, nr_copy_pages);
+
+	if (!(pagedir_nosave = swsusp_pagedir_relocate(p)))
+		return -ENOMEM;
+
+	/* Allocate memory for the image and read the data from swap */
+
+	error = check_pagedir(pagedir_nosave);
+	free_eaten_memory();
+	if (!error)
+		error = data_read(pagedir_nosave);
+
+	if (error) { /* We fail cleanly */
+		for_each_pbe (p, pagedir_nosave)
+			if (p->address) {
+				free_page(p->address);
+				p->address = 0UL;
+			}
+		free_pagedir(pagedir_nosave);
+	}
 	return error;
 }
 
_