patch-2.4.21 linux-2.4.21/arch/x86_64/lib/csum-copy.S
- Lines: 401
- Date: 2003-06-13 07:51:32.000000000 -0700
- Orig file: linux-2.4.20/arch/x86_64/lib/csum-copy.S
- Orig date: 2002-11-28 15:53:12.000000000 -0800
diff -urN linux-2.4.20/arch/x86_64/lib/csum-copy.S linux-2.4.21/arch/x86_64/lib/csum-copy.S
@@ -1,5 +1,5 @@
/*
- * Copyright 2002 Andi Kleen
+ * Copyright 2002,2003 Andi Kleen, SuSE Labs.
*
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file COPYING in the main directory of this archive
@@ -8,7 +8,6 @@
#include <linux/linkage.h>
#include <asm/errno.h>
-// #define FIX_ALIGNMENT 1
/*
* Checksum copy with exception handling.
* On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the
@@ -26,17 +25,14 @@
* eax 64bit sum. undefined in case of exception.
*
* Wrappers need to take care of valid exception sum and zeroing.
+ * They also should align source or destination to 8 bytes.
*/
-/* for now - should vary this based on direction */
- #define prefetch prefetcht2
- #define movnti movq
-
.macro source
10:
.section __ex_table,"a"
.align 8
- .quad 10b,bad_source
+ .quad 10b,.Lbad_source
.previous
.endm
@@ -44,57 +40,74 @@
20:
.section __ex_table,"a"
.align 8
- .quad 20b,bad_dest
+ .quad 20b,.Lbad_dest
.previous
.endm
+ .macro ignore L=.Lignore
+30:
+ .section __ex_table,"a"
+ .align 8
+ .quad 30b,\L
+ .previous
+ .endm
+
+
.globl csum_partial_copy_generic
- .p2align
+ .p2align 4
csum_partial_copy_generic:
- prefetchnta (%rdi)
+ cmpl $3*64,%edx
+ jle .Lignore
+
+ ignore
+ prefetch (%rdi)
+ ignore
+ prefetch 1*64(%rdi)
+ ignore
+ prefetch 2*64(%rdi)
+ ignore
+ prefetch 3*64(%rdi)
+ ignore
+ prefetch 4*64(%rdi)
+ ignore
+ prefetchw (%rsi)
+ ignore
+ prefetchw 1*64(%rsi)
+ ignore
+ prefetchw 2*64(%rsi)
+ ignore
+ prefetchw 3*64(%rsi)
+ ignore
+ prefetchw 4*64(%rsi)
+
+.Lignore:
+ subq $7*8,%rsp
+ movq %rbx,2*8(%rsp)
+ movq %r12,3*8(%rsp)
+ movq %r14,4*8(%rsp)
+ movq %r13,5*8(%rsp)
+ movq %rbp,6*8(%rsp)
+
+ movq %r8,(%rsp)
+ movq %r9,1*8(%rsp)
- pushq %rbx
- pushq %r12
- pushq %r14
- pushq %r15
- movq %r8,%r14
- movq %r9,%r15
movl %ecx,%eax
movl %edx,%ecx
-#ifdef FIX_ALIGNMENT
- /* align source to 8 bytes */
- movl %edi,%r8d
- andl $7,%r8d
- jnz bad_alignment
-after_bad_alignment:
-#endif
-
- movl $64,%r10d
xorl %r9d,%r9d
movq %rcx,%r12
shrq $6,%r12
- /* loopcounter is maintained as one less to test efficiently for the
- previous to last iteration. This is needed to stop the prefetching. */
- decq %r12
- js handle_tail /* < 64 */
- jz loop_no_prefetch /* = 64 + X */
+ jz .Lhandle_tail /* < 64 */
+
+ clc
/* main loop. clear in 64 byte blocks */
- /* tries hard not to prefetch over the boundary */
- /* r10: 64, r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
+ /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
/* r11: temp3, rdx: temp4, r12 loopcnt */
- .p2align
-loop:
- /* Could prefetch more than one loop, but then it would be even
- trickier to avoid prefetching over the boundary. The hardware prefetch
- should take care of this anyways. The reason for this prefetch is
- just the non temporal hint to avoid cache pollution. Hopefully this
- will be handled properly by the hardware. */
- prefetchnta 64(%rdi)
-
-loop_no_prefetch:
+ /* r10: temp5, rbp: temp6, r14 temp7, r13 temp8 */
+ .p2align 4
+.Lloop:
source
movq (%rdi),%rbx
source
@@ -104,175 +117,136 @@
source
movq 24(%rdi),%rdx
- dest
- movnti %rbx,(%rsi)
- dest
- movnti %r8,8(%rsi)
- dest
- movnti %r11,16(%rsi)
- dest
- movnti %rdx,24(%rsi)
+ source
+ movq 32(%rdi),%r10
+ source
+ movq 40(%rdi),%rbp
+ source
+ movq 48(%rdi),%r14
+ source
+ movq 56(%rdi),%r13
- addq %rbx,%rax
+ ignore 2f
+ prefetch 5*64(%rdi)
+2:
+ adcq %rbx,%rax
adcq %r8,%rax
adcq %r11,%rax
adcq %rdx,%rax
+ adcq %r10,%rax
+ adcq %rbp,%rax
+ adcq %r14,%rax
+ adcq %r13,%rax
- source
- movq 32(%rdi),%rbx
- source
- movq 40(%rdi),%r8
- source
- movq 48(%rdi),%r11
- source
- movq 56(%rdi),%rdx
+ decl %r12d
dest
- movnti %rbx,32(%rsi)
+ movq %rbx,(%rsi)
dest
- movnti %r8,40(%rsi)
+ movq %r8,8(%rsi)
dest
- movnti %r11,48(%rsi)
+ movq %r11,16(%rsi)
dest
- movnti %rdx,56(%rsi)
+ movq %rdx,24(%rsi)
- adcq %rbx,%rax
- adcq %r8,%rax
- adcq %r11,%rax
- adcq %rdx,%rax
+ dest
+ movq %r10,32(%rsi)
+ dest
+ movq %rbp,40(%rsi)
+ dest
+ movq %r14,48(%rsi)
+ dest
+ movq %r13,56(%rsi)
- adcq %r9,%rax /* add in carry */
+ ignore 3f
+ prefetchw 5*64(%rsi)
+3:
- addq %r10,%rdi
- addq %r10,%rsi
+ leaq 64(%rdi),%rdi
+ leaq 64(%rsi),%rsi
- decq %r12
- jz loop_no_prefetch /* previous to last iteration? */
- jns loop
+ jnz .Lloop
+
+ adcq %r9,%rax
/* do last upto 56 bytes */
-handle_tail:
+.Lhandle_tail:
/* ecx: count */
movl %ecx,%r10d
andl $63,%ecx
shrl $3,%ecx
- jz fold
+ jz .Lfold
clc
- movl $8,%edx
-loop_8:
+ .p2align 4
+.Lloop_8:
source
movq (%rdi),%rbx
adcq %rbx,%rax
- dest
- movnti %rbx,(%rsi)
- leaq (%rsi,%rdx),%rsi /* preserve carry */
- leaq (%rdi,%rdx),%rdi
decl %ecx
- jnz loop_8
+ dest
+ movq %rbx,(%rsi)
+ leaq 8(%rsi),%rsi /* preserve carry */
+ leaq 8(%rdi),%rdi
+ jnz .Lloop_8
adcq %r9,%rax /* add in carry */
-fold:
+.Lfold:
+ /* reduce checksum to 32bits */
movl %eax,%ebx
shrq $32,%rax
- addq %rbx,%rax
+ addl %ebx,%eax
+ adcl %r9d,%eax
/* do last upto 6 bytes */
-handle_7:
+.Lhandle_7:
movl %r10d,%ecx
andl $7,%ecx
shrl $1,%ecx
- jz handle_1
+ jz .Lhandle_1
movl $2,%edx
xorl %ebx,%ebx
clc
-loop_1:
+ .p2align 4
+.Lloop_1:
source
movw (%rdi),%bx
- adcq %rbx,%rax
+ adcl %ebx,%eax
dest
- movw %bx,(%rsi)
- addq %rdx,%rdi
- addq %rdx,%rsi
decl %ecx
- jnz loop_1
- adcw %r9w,%ax /* add in carry */
+ movw %bx,(%rsi)
+ leaq 2(%rdi),%rdi
+ leaq 2(%rsi),%rsi
+ jnz .Lloop_1
+ adcl %r9d,%eax /* add in carry */
/* handle last odd byte */
-handle_1:
+.Lhandle_1:
testl $1,%r10d
- jz ende
+ jz .Lende
xorl %ebx,%ebx
source
movb (%rdi),%bl
dest
movb %bl,(%rsi)
- addw %bx,%ax
- adcw %r9w,%ax /* carry */
+ addl %ebx,%eax
+ adcl %r9d,%eax /* carry */
-ende:
- sfence
- popq %r15
- popq %r14
- popq %r12
- popq %rbx
+.Lende:
+ movq 2*8(%rsp),%rbx
+ movq 3*8(%rsp),%r12
+ movq 4*8(%rsp),%r14
+ movq 5*8(%rsp),%r13
+ movq 6*8(%rsp),%rbp
+ addq $7*8,%rsp
ret
-#ifdef FIX_ALIGNMENT
- /* align source to 8 bytes. */
- /* r8d: unalignedness, ecx len */
-bad_alignment:
- testl $1,%edi
- jnz odd_source
-
- /* compute distance to next aligned position */
- movl $8,%r8d
- xchgl %r8d,%ecx
- subl %r8d,%ecx
-
- /* handle unaligned part */
- shrl $1,%ecx
- xorl %ebx,%ebx
- movl $2,%r10d
-align_loop:
- source
- movw (%rdi),%bx
- addq %rbx,%rax /* carry cannot happen */
- dest
- movw %bx,(%rsi)
- addq %r10,%rdi
- addq %r10,%rsi
- decl %ecx
- jnz align_loop
- jmp after_bad_alignment
-
- /* weird case. need to swap the sum at the end because the spec requires
- 16 bit words of the sum to be always paired.
- handle it recursively because it should be rather rare. */
-odd_source:
- /* copy odd byte */
- xorl %ebx,%ebx
- source
- movb (%rdi),%bl
- addl %ebx,%eax /* add to old checksum */
- adcl $0,%ecx
- dest
- movb %al,(%rsi)
-
- /* fix arguments */
- movl %eax,%ecx
- incq %rsi
- incq %rdi
- decq %rdx
- call csum_partial_copy_generic
- bswap %eax /* this should work, but check */
- jmp ende
-#endif
-
/* Exception handlers. Very simple, zeroing is done in the wrappers */
-bad_source:
- movl $-EFAULT,(%r14)
- jmp ende
-
-bad_dest:
- movl $-EFAULT,(%r15)
- jmp ende
+.Lbad_source:
+ movq (%rsp),%rax
+ movl $-EFAULT,(%rax)
+ jmp .Lende
+
+.Lbad_dest:
+ movq 8(%rsp),%rax
+ movl $-EFAULT,(%rax)
+ jmp .Lende
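A note on the new main loop: the rewritten .Lloop hunk reads eight quadwords, accumulates them into %rax with an adcq chain, stores them with plain movq (the movnti/sfence non-temporal path is gone), and picks up the final carry with "adcq %r9,%rax", where %r9 is kept at zero. A minimal C sketch of that copy-and-accumulate step, folding the carry per word instead of carrying it in the flags register (the helper name is illustrative, not a kernel symbol):

#include <stdint.h>
#include <stddef.h>

/* Copy nwords 64-bit words from src to dst while accumulating a
 * ones'-complement style sum, folding each carry back in as we go. */
static uint64_t csum_copy_words(const uint64_t *src, uint64_t *dst,
                                size_t nwords, uint64_t sum)
{
        for (size_t i = 0; i < nwords; i++) {
                uint64_t v = src[i];
                dst[i] = v;              /* the movq store */
                sum += v;                /* the addition done by the adcq chain */
                sum += (sum < v);        /* end-around carry (the trailing adcq %r9,%rax) */
        }
        return sum;
}

Deferring the carries through one adcq chain, as the assembly does, and folding them per word, as above, give the same result: ones'-complement addition is associative, so the order in which carries are reabsorbed does not matter.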
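The .Lfold hunk then reduces the 64-bit accumulator to 32 bits by adding its high and low halves and folding the carry back in ("addl %ebx,%eax; adcl %r9d,%eax", again with %r9d zero). The same reduction in C (hypothetical helper, not a kernel symbol):

#include <stdint.h>

/* Fold a 64-bit accumulator to 32 bits with end-around carry. */
static uint32_t fold64_to_32(uint64_t sum)
{
        uint64_t t = (sum & 0xffffffffu) + (sum >> 32);  /* addl %ebx,%eax */
        return (uint32_t)(t + (t >> 32));                /* adcl %r9d,%eax: re-add the carry */
}

One fold of the carry is enough here: the two 32-bit halves sum to at most 0x1fffffffe, so re-adding the single carry bit cannot overflow again, which is why the assembly needs only one adcl.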
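Finally, the updated header comment shifts work to the callers: with the FIX_ALIGNMENT path removed, the wrappers should align source or destination to 8 bytes, and on a fault they must still hand back a valid sum and take care of zeroing, while the exception handlers here only store -EFAULT through the pointers saved at (%rsp) and 8(%rsp). A hedged sketch of those wrapper duties, assuming the argument order implied by the register comments (the real wrappers live elsewhere in arch/x86_64/lib; the helper name and exact fault policy below are illustrative only):

#include <errno.h>
#include <string.h>

/* The assembly routine above; prototype assumed from its register comments.
 * On a fault it stores -EFAULT through src_err or dst_err and returns with
 * an undefined sum. */
extern unsigned int csum_partial_copy_generic(const char *src, char *dst,
                                              int len, unsigned int isum,
                                              int *src_err, int *dst_err);

/* Illustrative wrapper: report the fault and zero the destination so the
 * caller still sees a well-defined buffer and error code. */
static unsigned int copy_and_csum_checked(const char *src, char *dst,
                                          int len, unsigned int isum,
                                          int *errp)
{
        int src_err = 0, dst_err = 0;
        unsigned int sum;

        sum = csum_partial_copy_generic(src, dst, len, isum,
                                        &src_err, &dst_err);
        if (src_err || dst_err) {
                memset(dst, 0, len);    /* zeroing on fault, as the comment requires */
                *errp = -EFAULT;
        }
        return sum;
}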