# $NetBSD: bn_asm_vax.S,v 1.1.1.2 2023/04/18 14:19:11 christos Exp $
#
# w.j.m. 15-jan-1999
#
# it's magic ...
#
# ULONG bn_mul_add_words(ULONG r[],ULONG a[],int n,ULONG w) {
#	ULONG c = 0;
#	int i;
#	for(i = 0; i < n; i++) (c,r[i]) := r[i] + c + a[i] * w ;
#	return c;
# }
#
# EMUL multiplies and adds *signed* longwords, so every operand whose top
# bit is set needs a correction of the high result word to obtain the
# unsigned result:
#	r[i] "negative"	-> high word + 1
#	w    "negative"	-> high word + a[i]
#	a[i] "negative"	-> high word + w
#
# Registers: r2 = r, r3 = a, r4 = n (counts down), r5 = w, r6 = carry,
#	      r1:r0 = 64-bit EMUL result.
# Returns 0 and leaves r[] untouched when n <= 0.
#

	.globl	bn_mul_add_words
	.type	bn_mul_add_words@function

bn_mul_add_words:
	.word	0x40			# entry mask: r6 is saved

	movl	4(%ap),%r2		# *r
	movl	8(%ap),%r3		# *a
	movl	12(%ap),%r4		# n
	movl	16(%ap),%r5		# w
	clrl	%r6			# return value ("carry")

	tstl	%r4			# n <= 0: nothing to do.  Without
	bleq	.Lmul_add_done		# this the SOBGTR loop runs once.

.Lmul_add_loop:
	emul	%r5,(%r3),(%r2),%r0	# w * a[0] + r[0] -> r0

	# fixup for "negative" r[]
	tstl	(%r2)
	bgeq	.Lmul_add_r_ok
	incl	%r1			# add 1 to highword
.Lmul_add_r_ok:
	# add saved carry to result
	addl2	%r6,%r0
	adwc	$0,%r1

	# combined fixup for "negative" w, a[]
	tstl	%r5			# if w is negative...
	bgeq	.Lmul_add_w_ok
	addl2	(%r3),%r1		# ...add a[0] again to highword
.Lmul_add_w_ok:
	tstl	(%r3)			# if a[0] is negative...
	bgeq	.Lmul_add_a_ok
	addl2	%r5,%r1			# ...add w again to highword
.Lmul_add_a_ok:
	movl	%r0,(%r2)+		# save low word in dest & advance *r
	addl2	$4,%r3			# advance *a
	movl	%r1,%r6			# high word in r6 for return value

	sobgtr	%r4,.Lmul_add_loop	# loop?

.Lmul_add_done:
	movl	%r6,%r0
	ret
	.size	bn_mul_add_words, .-bn_mul_add_words

#	.title	vax_bn_mul_words	unsigned multiply & add, 32*32+32=>64
#;
#; w.j.m. 15-jan-1999
#;
#; it's magic ...
#;
#; ULONG bn_mul_words(ULONG r[],ULONG a[],int n,ULONG w) {
#;	ULONG c = 0;
#;	int i;
#;	for(i = 0; i < n; i++) (c,r[i]) := a[i] * w + c ;
#;	return(c);
#; }
#;
#; Same signed-EMUL corrections as bn_mul_add_words, except that the
#; addend is the running carry (r6) instead of r[i].
#; Returns 0 and leaves r[] untouched when n <= 0.
#

	.globl	bn_mul_words
	.type	bn_mul_words@function

bn_mul_words:
	.word	0x40			# entry mask: r6 is saved

	movl	4(%ap),%r2		# *r
	movl	8(%ap),%r3		# *a
	movl	12(%ap),%r4		# n
	movl	16(%ap),%r5		# w
	clrl	%r6			# carry

	tstl	%r4			# n <= 0: nothing to do.  Without
	bleq	.Lmul_done		# this the SOBGTR loop runs once.

.Lmul_loop:
	emul	%r5,(%r3),%r6,%r0	# w * a[0] + carry -> r0

	# fixup for "negative" carry
	tstl	%r6
	bgeq	.Lmul_c_ok
	incl	%r1
.Lmul_c_ok:
	# combined fixup for "negative" w, a[]
	tstl	%r5
	bgeq	.Lmul_w_ok
	addl2	(%r3),%r1
.Lmul_w_ok:
	tstl	(%r3)
	bgeq	.Lmul_a_ok
	addl2	%r5,%r1
.Lmul_a_ok:
	movl	%r0,(%r2)+		# store low word & advance *r
	addl2	$4,%r3			# advance *a
	movl	%r1,%r6			# high word is the next carry

	sobgtr	%r4,.Lmul_loop

.Lmul_done:
	movl	%r6,%r0
	ret
	.size	bn_mul_words, .-bn_mul_words

#	.title	vax_bn_sqr_words	unsigned square, 32*32=>64
#;
#; w.j.m. 15-jan-1999
#;
#; it's magic ...
#;
#; void bn_sqr_words(ULONG r[],ULONG a[],int n) {
#;	int i;
#;	for(i = 0; i < n; i++) (r[2*i+1],r[2*i]) := a[i] * a[i] ;
#; }
#;
#; EMUL squares a[i] as a signed value; when a[i] has its top bit set the
#; high result word is corrected by adding a[i] twice.
#; Registers: r2 = r, r3 = a, r4 = n (counts down), r5 = a[i],
#;	      r1:r0 = 64-bit result.
#; Does nothing when n <= 0.
#

	.globl	bn_sqr_words
	.type	bn_sqr_words@function

bn_sqr_words:
	.word	0			# entry mask: no extra registers saved

	movl	4(%ap),%r2		# r
	movl	8(%ap),%r3		# a
	movl	12(%ap),%r4		# n

	tstl	%r4			# n <= 0: nothing to do.  Without
	bleq	.Lsqr_done		# this the SOBGTR loop runs once.

.Lsqr_loop:
	movl	(%r3)+,%r5		# r5 = a[] & advance

	emul	%r5,%r5,$0,%r0		# a[0] * a[0] + 0 -> r0

	# fixup for "negative" a[]
	tstl	%r5
	bgeq	.Lsqr_a_ok
	addl2	%r5,%r1
	addl2	%r5,%r1
.Lsqr_a_ok:
	movq	%r0,(%r2)+		# store 64-bit result

	sobgtr	%r4,.Lsqr_loop		# loop

.Lsqr_done:
	ret
	.size	bn_sqr_words, .-bn_sqr_words

#	.title	vax_bn_div_words	unsigned divide
#;
#; Richard Levitte 20-Nov-2000
#;
#; ULONG bn_div_words(ULONG h, ULONG l, ULONG d)
#; {
#;	return ((ULONG)((((ULLONG)h)<<32)|l) / (ULLONG)d);
#; }
#;
#; Using EDIV would be very easy, if it didn't do signed calculations.
#; Any time any of the input numbers are signed, there are problems,
#; usually with integer overflow, at which point it returns useless
#; data (the quotient gets the value of l, and the remainder becomes 0).
#;
#; If it was just for the dividend, it would be very easy, just divide
#; it by 2 (unsigned), do the division, multiply the resulting quotient
#; and remainder by 2, add the bit that was dropped when dividing by 2
#; to the remainder, and do some adjustment so the remainder doesn't
#; end up larger than the divisor.  For some cases when the divisor is
#; negative (from EDIV's point of view, i.e. when the highest bit is set),
#; dividing the dividend by 2 isn't enough, and since some operations
#; might generate integer overflows even when the dividend is divided by
#; 4 (when the high part of the shifted down dividend ends up being exactly
#; half of the divisor, the result is the quotient 0x80000000, which is
#; negative...) it needs to be divided by 8.  Furthermore, the divisor needs
#; to be divided by 2 (unsigned) as well, to avoid more problems with the sign.
#; In this case, a little extra fiddling with the remainder is required.
#;
#; So, the simplest way to handle this is always to divide the dividend
#; by 8, and to divide the divisor by 2 if its highest bit is set.
#; After EDIV has been used, the quotient gets multiplied by 8 if the
#; original divisor was positive, otherwise 4.  The remainder, oddly
#; enough, is *always* multiplied by 8.
#; NOTE: in the case mentioned above, where the high part of the shifted
#; down dividend ends up being exactly half the shifted down divisor, we
#; end up with a 33 bit quotient.  That's no problem however, it usually
#; means we have ended up with a too large remainder as well, and the
#; problem is fixed by the last part of the algorithm (next paragraph).
#;
#; The routine ends with comparing the resulting remainder with the
#; original divisor and if the remainder is larger, subtract the
#; original divisor from it, and increase the quotient by 1.  This is
#; done until the remainder is smaller than the divisor.
#;
#; The complete algorithm looks like this:
#;
#; d'    = d
#; l'    = l & 7
#; [h,l] = [h,l] >> 3
#; if (d' < 0)
#;	d = d >> 1			(unsigned shift)
#; [q,r] = floor([h,l] / d)	# This is the EDIV operation
#; if (q < 0) q = -q		# I doubt this is necessary any more
#;
#; r' = r >> 29
#; if (d' >= 0)
#;	q' = q >> 29
#;	q  = q << 3
#; else
#;	q' = q >> 30
#;	q  = q << 2
#; r  = (r << 3) + l'
#;
#; if (d' < 0 && (d' & 1))	# the bit dropped from d was set
#;   {
#;	[r',r] = [r',r] - [q',q]
#;	while ([r',r] < 0)
#;	  {
#;	    [r',r] = [r',r] + d'
#;	    [q',q] = [q',q] - 1
#;	  }
#;   }
#;
#; while ([r',r] >= d')
#;   {
#;	[r',r] = [r',r] - d'
#;	[q',q] = [q',q] + 1
#;   }
#;
#; return q
#
#;r2 = l, q
#;r3 = h, r
#;r4 = d
#;r5 = l'
#;r6 = r'
#;r7 = d'
#;r8 = q'
#

	.globl	bn_div_words
	.type	bn_div_words@function

bn_div_words:
	.word	0x1c0			# entry mask: r6, r7, r8 are saved

	movl	12(%ap),%r4		# d
	movl	8(%ap),%r2		# l
	movl	4(%ap),%r3		# h
	movl	%r4,%r7			# d' = d
	clrl	%r8			# q' = 0

	# [h,l] >>= 3: keep the 3 bits shifted out of l in l', move the
	# low 3 bits of h into the cleared low bits of l, then rotate both
	# words right by 3 so those bits land at the top of l.
	bicl3	$-8,%r2,%r5		# l' = l & 7
	bicl3	$7,%r2,%r2
	bicl3	$-8,%r3,%r6		# r6 = h & 7 (scratch)
	bicl3	$7,%r3,%r3
	addl2	%r6,%r2
	rotl	$-3,%r3,%r3		# h = h >> 3
	rotl	$-3,%r2,%r2		# l = l >> 3
	clrl	%r6			# r' = 0

	tstl	%r4
	beql	.Ldiv_by_zero		# Uh-oh, the divisor is 0...
	bgtr	.Ldiv_ediv
	rotl	$-1,%r4,%r4		# If d is negative, shift it right.
	bicl2	$0x80000000,%r4		# The bit dropped here is accounted
					# for below when d' is odd.

.Ldiv_ediv:
	ediv	%r4,%r2,%r2,%r3		# Do the actual division

	tstl	%r2
	bgeq	.Ldiv_q_nonneg
	mnegl	%r2,%r2			# q = -q when EDIV gave q < 0
.Ldiv_q_nonneg:
	tstl	%r7
	blss	.Ldiv_scale_by_4
	rotl	$3,%r2,%r2		# d' >= 0: q = q << 3
	bicl3	$-8,%r2,%r8		#   q' gets the high bits from q
	bicl3	$7,%r2,%r2
	brb	.Ldiv_scale_rem
.Ldiv_scale_by_4:
	rotl	$2,%r2,%r2		# d' < 0: q = q << 2
	bicl3	$-4,%r2,%r8		#   q' gets the high bits from q
	bicl3	$3,%r2,%r2
.Ldiv_scale_rem:
	rotl	$3,%r3,%r3		# r = r << 3
	bicl3	$-8,%r3,%r6		# r' gets the high bits from r
	bicl3	$7,%r3,%r3
	addl2	%r5,%r3			# r = r + l'

	tstl	%r7
	bgeq	.Ldiv_adjust_up
	bitl	$1,%r7
	beql	.Ldiv_adjust_up		# only when d' < 0 && d' & 1:
	subl2	%r2,%r3			# [r',r] = [r',r] - [q',q]
	sbwc	%r8,%r6
.Ldiv_adjust_down:
	bgeq	.Ldiv_adjust_up		# repeat as long as [r',r] < 0
	decl	%r2			# [q',q] = [q',q] - 1
	sbwc	$0,%r8
	addl2	%r7,%r3			# [r',r] = [r',r] + d'
	adwc	$0,%r6
	brb	.Ldiv_adjust_down

# The return points are placed in the middle to keep a short distance from
# all the branch points
.Ldiv_return:
#	movl	%r3,%r1
	movl	%r2,%r0
	ret
.Ldiv_by_zero:
	movl	$-1,%r0
	ret

.Ldiv_adjust_up:
	tstl	%r6
	bneq	.Ldiv_take_one
	cmpl	%r3,%r7
	blssu	.Ldiv_return		# done once [r',r] < d'
.Ldiv_take_one:
	subl2	%r7,%r3			# [r',r] = [r',r] - d'
	sbwc	$0,%r6
	incl	%r2			# [q',q] = [q',q] + 1
	adwc	$0,%r8
	brb	.Ldiv_adjust_up
	.size	bn_div_words, .-bn_div_words

#	.title	vax_bn_add_words	unsigned add of two arrays
#;
#; Richard Levitte 20-Nov-2000
#;
#; ULONG bn_add_words(ULONG r[], ULONG a[], ULONG b[], int n) {
#;	ULONG c = 0;
#;	int i;
#;	for (i = 0; i < n; i++) (c,r[i]) = a[i] + b[i] + c;
#;	return(c);
#; }
#;
#; The carry lives in the C condition code for the whole loop, so only
#; instructions that leave C alone may sit between the ADWCs.
#; Returns 0 when n <= 0.
#

	.globl	bn_add_words
	.type	bn_add_words@function

bn_add_words:
	.word	0			# entry mask: no extra registers saved

	movl	16(%ap),%r5		# n
	movl	12(%ap),%r4		# b
	movl	8(%ap),%r3		# a
	movl	4(%ap),%r2		# r

	clrl	%r0			# return value when n <= 0
	tstl	%r5			# TSTL leaves C clear for the first ADWC
	bleq	.Ladd_done

.Ladd_loop:
	movl	(%r3)+,%r1		# carry untouched
	adwc	(%r4)+,%r1		# carry used and touched
	movl	%r1,(%r2)+		# carry untouched
	sobgtr	%r5,.Ladd_loop		# carry untouched

	adwc	$0,%r0			# r0 = final carry
.Ladd_done:
	ret
	.size	bn_add_words, .-bn_add_words

#;
#; Richard Levitte 20-Nov-2000
#;
#; ULONG bn_sub_words(ULONG r[], ULONG a[], ULONG b[], int n) {
#;	ULONG c = 0;
#;	int i;
#;	for
#;	    (i = 0; i < n; i++) (c,r[i]) = a[i] - b[i] - c;
#;	return(c);
#; }
#;
#; The borrow lives in the C condition code for the whole loop, so only
#; instructions that leave C alone may sit between the SBWCs.
#; Returns 0 when n <= 0.
#

	.globl	bn_sub_words
	.type	bn_sub_words@function

bn_sub_words:
	.word	0x40			# entry mask: r6 is saved

	movl	4(%ap),%r2		# r
	movl	8(%ap),%r3		# a
	movl	12(%ap),%r4		# b
	movl	16(%ap),%r5		# n

	clrl	%r0
	tstl	%r5			# TSTL leaves C clear for the first SBWC
	bleq	.Lsub_done

.Lsub_loop:
	movl	(%r3)+,%r6		# carry untouched
	sbwc	(%r4)+,%r6		# carry used and touched
	movl	%r6,(%r2)+		# carry untouched
	sobgtr	%r5,.Lsub_loop		# carry untouched

.Lsub_done:
	adwc	$0,%r0			# r0 = final borrow (0 when n <= 0)
	ret
	.size	bn_sub_words, .-bn_sub_words

#
# Ragge 20-Sep-2003
#
# Multiply a vector of 4/8 longword by another.
# Uses two loops and 16/64 emuls.
#
# Four entry points share one body:
#	bn_mul_comba4 / bn_mul_comba8	multiply a[] (2nd arg) by b[] (3rd arg)
#	bn_sqr_comba4 / bn_sqr_comba8	multiply a[] (2nd arg) by itself
# The first argument is the destination r[] in all cases.
#
# Registers:
#	r2 = current multiplier word	r3 = walks a[]
#	r4 = carry between EMULs	r5 = walks r[]
#	r6 = inner loop count		r7 = walks the multiplier vector
#	r8 = outer loop count		r9 = vector length (4 or 8)
#	r1:r0 = 64-bit EMUL result
#
# EMUL is signed; the three tests after it add the usual corrections to
# the high word for a "negative" carry, multiplier word and a[] word.
#

	.globl	bn_mul_comba4
	.type	bn_mul_comba4@function

bn_mul_comba4:
	.word	0x3c0			# entry mask: r6-r9 are saved
	movl	$4,%r9			# 4*4
	brb	.Lcomba_mul_args

	.globl	bn_mul_comba8
	.type	bn_mul_comba8@function

bn_mul_comba8:
	.word	0x3c0			# entry mask: r6-r9 are saved
	movl	$8,%r9			# 8*8

.Lcomba_mul_args:
	movl	8(%ap),%r3		# a[]
	movl	12(%ap),%r7		# b[]
	brb	.Lcomba_common

	.globl	bn_sqr_comba4
	.type	bn_sqr_comba4@function

bn_sqr_comba4:
	.word	0x3c0			# entry mask: r6-r9 are saved
	movl	$4,%r9			# 4*4
	brb	.Lcomba_sqr_args

	.globl	bn_sqr_comba8
	.type	bn_sqr_comba8@function

bn_sqr_comba8:
	.word	0x3c0			# entry mask: r6-r9 are saved
	movl	$8,%r9			# 8*8

.Lcomba_sqr_args:
	movl	8(%ap),%r3		# a[]
	movl	%r3,%r7			# a[] is the multiplier vector too

.Lcomba_common:
	movl	4(%ap),%r5		# r[]
	movl	%r9,%r8			# outer loop count

	clrq	(%r5)			# clear destination, for add.
	clrq	8(%r5)
	clrq	16(%r5)			# these only needed for comba8
	clrq	24(%r5)

.Lcomba_outer:
	clrl	%r4			# carry
	movl	%r9,%r6			# inner loop count
	movl	(%r7)+,%r2		# value to multiply with

.Lcomba_inner:
	emul	%r2,(%r3),%r4,%r0
	tstl	%r4			# "negative" carry: high word + 1
	bgeq	.Lcomba_carry_ok
	incl	%r1
.Lcomba_carry_ok:
	tstl	%r2			# "negative" multiplier: + a[] word
	bgeq	.Lcomba_mult_ok
	addl2	(%r3),%r1
.Lcomba_mult_ok:
	tstl	(%r3)			# "negative" a[] word: + multiplier
	bgeq	.Lcomba_elem_ok
	addl2	%r2,%r1
.Lcomba_elem_ok:
	addl2	%r0,(%r5)+		# add to destination
	adwc	$0,%r1			# remember carry
	movl	%r1,%r4			# add carry in next emul
	addl2	$4,%r3
	sobgtr	%r6,.Lcomba_inner

	movl	%r4,(%r5)		# save highest add result

	ashl	$2,%r9,%r4		# r4 = length in bytes
	subl2	%r4,%r3			# rewind a[] to its first word
	subl2	$4,%r4
	subl2	%r4,%r5			# r[] starts one word further on
	sobgtr	%r8,.Lcomba_outer

	ret
	.size	bn_mul_comba4, .-bn_mul_comba4
	.size	bn_mul_comba8, .-bn_mul_comba8
	.size	bn_sqr_comba4, .-bn_sqr_comba4
	.size	bn_sqr_comba8, .-bn_sqr_comba8